{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 11.782889401902718, "learning_rate": 6.25e-08, "logits/chosen": -1.444485068321228, "logits/rejected": -1.4456722736358643, "logps/chosen": -7.9825921058654785, "logps/rejected": -8.156225204467773, "loss": 8.9796, "rewards/accuracies": 0.5, "rewards/chosen": -7.9825921058654785, "rewards/margins": 0.17363198101520538, "rewards/rejected": -8.156225204467773, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 9.749361718413306, "learning_rate": 1.25e-07, "logits/chosen": -1.447454810142517, "logits/rejected": -1.4387584924697876, "logps/chosen": -8.047009468078613, "logps/rejected": -7.960066795349121, "loss": 8.9813, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.047009468078613, "rewards/margins": -0.08694207668304443, "rewards/rejected": -7.960066795349121, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 16.533988717004068, "learning_rate": 1.875e-07, "logits/chosen": -1.4474663734436035, "logits/rejected": -1.4442191123962402, "logps/chosen": -7.851595401763916, "logps/rejected": -7.866987705230713, "loss": 8.8899, "rewards/accuracies": 0.5, "rewards/chosen": -7.851595401763916, "rewards/margins": 0.01539215724915266, "rewards/rejected": -7.866987705230713, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 13.917496227050558, "learning_rate": 2.5e-07, "logits/chosen": -1.440216064453125, "logits/rejected": -1.4452197551727295, "logps/chosen": -8.178640365600586, "logps/rejected": -8.201952934265137, "loss": 9.0475, "rewards/accuracies": 0.5, "rewards/chosen": -8.178640365600586, "rewards/margins": 0.023312047123908997, "rewards/rejected": -8.201952934265137, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 13.358220692601913, "learning_rate": 3.125e-07, "logits/chosen": -1.474110722541809, "logits/rejected": -1.463666558265686, "logps/chosen": -8.079231262207031, "logps/rejected": -7.98193883895874, "loss": 9.124, "rewards/accuracies": 0.4375, "rewards/chosen": -8.079231262207031, "rewards/margins": -0.09729210287332535, "rewards/rejected": -7.98193883895874, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 11.375823739582524, "learning_rate": 3.75e-07, "logits/chosen": -1.4473092555999756, "logits/rejected": -1.4344959259033203, "logps/chosen": -7.780773162841797, "logps/rejected": -7.703455448150635, "loss": 9.0197, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.780773162841797, "rewards/margins": -0.07731723040342331, "rewards/rejected": -7.703455448150635, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 10.213017154182484, "learning_rate": 4.3749999999999994e-07, "logits/chosen": -1.4583995342254639, "logits/rejected": -1.431770920753479, "logps/chosen": -8.027624130249023, "logps/rejected": -7.8937225341796875, "loss": 8.9843, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.027624130249023, "rewards/margins": -0.13390299677848816, "rewards/rejected": -7.8937225341796875, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 10.12652288345569, "learning_rate": 5e-07, "logits/chosen": -1.4447615146636963, "logits/rejected": -1.458698034286499, "logps/chosen": -7.983005523681641, "logps/rejected": -8.174285888671875, "loss": 9.0094, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -7.983005523681641, "rewards/margins": 0.19128072261810303, "rewards/rejected": -8.174285888671875, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 10.9885005835532, "learning_rate": 5.625e-07, "logits/chosen": -1.4630662202835083, "logits/rejected": -1.4628698825836182, "logps/chosen": -8.03730583190918, "logps/rejected": -7.831875801086426, "loss": 8.9878, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -8.03730583190918, "rewards/margins": -0.20542971789836884, "rewards/rejected": -7.831875801086426, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 13.872196323961617, "learning_rate": 5.999678242522831e-07, "logits/chosen": -1.4442825317382812, "logits/rejected": -1.4613512754440308, "logps/chosen": -8.217935562133789, "logps/rejected": -8.252190589904785, "loss": 9.0757, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.217935562133789, "rewards/margins": 0.03425510972738266, "rewards/rejected": -8.252190589904785, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 10.905494395813982, "learning_rate": 5.996059263493219e-07, "logits/chosen": -1.4492484331130981, "logits/rejected": -1.4467532634735107, "logps/chosen": -8.046092987060547, "logps/rejected": -8.062843322753906, "loss": 9.1036, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -8.046092987060547, "rewards/margins": 0.01675090566277504, "rewards/rejected": -8.062843322753906, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 15.995330684554988, "learning_rate": 5.988423976115163e-07, "logits/chosen": -1.443290114402771, "logits/rejected": -1.4562170505523682, "logps/chosen": -8.026491165161133, "logps/rejected": -8.317246437072754, "loss": 8.9008, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.026491165161133, "rewards/margins": 0.29075488448143005, "rewards/rejected": -8.317246437072754, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 24.861886587620123, "learning_rate": 5.976782615723061e-07, "logits/chosen": -1.392534613609314, "logits/rejected": -1.4108682870864868, "logps/chosen": -7.828791618347168, "logps/rejected": -8.337072372436523, "loss": 8.934, "rewards/accuracies": 0.59375, "rewards/chosen": -7.828791618347168, "rewards/margins": 0.5082817673683167, "rewards/rejected": -8.337072372436523, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 35.209412870115344, "learning_rate": 5.961150787913738e-07, "logits/chosen": -1.39071524143219, "logits/rejected": -1.3853540420532227, "logps/chosen": -7.945198059082031, "logps/rejected": -8.038311004638672, "loss": 8.9653, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.945198059082031, "rewards/margins": 0.0931134819984436, "rewards/rejected": -8.038311004638672, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 12.413941901156766, "learning_rate": 5.941549447626671e-07, "logits/chosen": -1.3913167715072632, "logits/rejected": -1.3984179496765137, "logps/chosen": -7.823273658752441, "logps/rejected": -7.864768981933594, "loss": 8.9142, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -7.823273658752441, "rewards/margins": 0.04149458184838295, "rewards/rejected": -7.864768981933594, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 21.221667512587725, "learning_rate": 5.918004871053251e-07, "logits/chosen": -1.3923091888427734, "logits/rejected": -1.4085341691970825, "logps/chosen": -7.852835178375244, "logps/rejected": -7.9230217933654785, "loss": 8.9088, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -7.852835178375244, "rewards/margins": 0.07018764317035675, "rewards/rejected": -7.9230217933654785, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 12.603711372215182, "learning_rate": 5.890548620412763e-07, "logits/chosen": -1.4011937379837036, "logits/rejected": -1.39864182472229, "logps/chosen": -7.970945835113525, "logps/rejected": -8.160429000854492, "loss": 9.0488, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.970945835113525, "rewards/margins": 0.18948234617710114, "rewards/rejected": -8.160429000854492, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 13.164098047063113, "learning_rate": 5.859217501642258e-07, "logits/chosen": -1.375800371170044, "logits/rejected": -1.389070749282837, "logps/chosen": -7.946028232574463, "logps/rejected": -8.130967140197754, "loss": 9.0141, "rewards/accuracies": 0.53125, "rewards/chosen": -7.946028232574463, "rewards/margins": 0.18493881821632385, "rewards/rejected": -8.130967140197754, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 11.129043830781203, "learning_rate": 5.824053515057091e-07, "logits/chosen": -1.384723424911499, "logits/rejected": -1.3767420053482056, "logps/chosen": -8.055198669433594, "logps/rejected": -7.921385288238525, "loss": 9.0835, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -8.055198669433594, "rewards/margins": -0.13381320238113403, "rewards/rejected": -7.921385288238525, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 19.959846628166616, "learning_rate": 5.785103799048218e-07, "logits/chosen": -1.4132357835769653, "logits/rejected": -1.418881893157959, "logps/chosen": -8.033044815063477, "logps/rejected": -8.07997989654541, "loss": 9.0153, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.033044815063477, "rewards/margins": 0.04693456366658211, "rewards/rejected": -8.07997989654541, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 12.843923135972, "learning_rate": 5.742420566891749e-07, "logits/chosen": -1.413010835647583, "logits/rejected": -1.4074172973632812, "logps/chosen": -7.718166351318359, "logps/rejected": -7.9243879318237305, "loss": 8.9445, "rewards/accuracies": 0.5625, "rewards/chosen": -7.718166351318359, "rewards/margins": 0.206221342086792, "rewards/rejected": -7.9243879318237305, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 13.825481188162163, "learning_rate": 5.696061036755478e-07, "logits/chosen": -1.4453760385513306, "logits/rejected": -1.4452683925628662, "logps/chosen": -7.982637882232666, "logps/rejected": -8.220747947692871, "loss": 9.0144, "rewards/accuracies": 0.5625, "rewards/chosen": -7.982637882232666, "rewards/margins": 0.23810970783233643, "rewards/rejected": -8.220747947692871, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 1525.1356967991103, "learning_rate": 5.64608735499618e-07, "logits/chosen": -1.3860673904418945, "logits/rejected": -1.3894257545471191, "logps/chosen": -7.8776044845581055, "logps/rejected": -8.189804077148438, "loss": 8.9598, "rewards/accuracies": 0.59375, "rewards/chosen": -7.8776044845581055, "rewards/margins": 0.31219929456710815, "rewards/rejected": -8.189804077148438, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 13.913132246096096, "learning_rate": 5.592566512850545e-07, "logits/chosen": -1.3590507507324219, "logits/rejected": -1.3622348308563232, "logps/chosen": -8.100934982299805, "logps/rejected": -8.155590057373047, "loss": 8.9501, "rewards/accuracies": 0.5, "rewards/chosen": -8.100934982299805, "rewards/margins": 0.054654598236083984, "rewards/rejected": -8.155590057373047, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 14.57715484351377, "learning_rate": 5.535570256631384e-07, "logits/chosen": -1.4173157215118408, "logits/rejected": -1.411921739578247, "logps/chosen": -8.191034317016602, "logps/rejected": -8.077339172363281, "loss": 9.0651, "rewards/accuracies": 0.46875, "rewards/chosen": -8.191034317016602, "rewards/margins": -0.11369502544403076, "rewards/rejected": -8.077339172363281, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 14.54742842440625, "learning_rate": 5.475174991549528e-07, "logits/chosen": -1.37632417678833, "logits/rejected": -1.3858749866485596, "logps/chosen": -8.046875953674316, "logps/rejected": -8.172870635986328, "loss": 8.9777, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.046875953674316, "rewards/margins": 0.12599456310272217, "rewards/rejected": -8.172870635986328, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 18.256408848890032, "learning_rate": 5.411461679290317e-07, "logits/chosen": -1.3864247798919678, "logits/rejected": -1.4004995822906494, "logps/chosen": -7.979268550872803, "logps/rejected": -8.406595230102539, "loss": 8.9672, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.979268550872803, "rewards/margins": 0.4273262023925781, "rewards/rejected": -8.406595230102539, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 14.27521931097187, "learning_rate": 5.34451572948201e-07, "logits/chosen": -1.4093233346939087, "logits/rejected": -1.4172067642211914, "logps/chosen": -7.903810977935791, "logps/rejected": -7.975949287414551, "loss": 8.9533, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.903810977935791, "rewards/margins": 0.07213909924030304, "rewards/rejected": -7.975949287414551, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 10.523105376926537, "learning_rate": 5.274426885201582e-07, "logits/chosen": -1.4147297143936157, "logits/rejected": -1.4396823644638062, "logps/chosen": -7.8977460861206055, "logps/rejected": -8.05931568145752, "loss": 8.915, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.8977460861206055, "rewards/margins": 0.16156847774982452, "rewards/rejected": -8.05931568145752, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 14.122907500033074, "learning_rate": 5.201289102671411e-07, "logits/chosen": -1.4332246780395508, "logits/rejected": -1.436842679977417, "logps/chosen": -7.895875453948975, "logps/rejected": -8.0299072265625, "loss": 8.9785, "rewards/accuracies": 0.5, "rewards/chosen": -7.895875453948975, "rewards/margins": 0.13403132557868958, "rewards/rejected": -8.0299072265625, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 12.92310774863363, "learning_rate": 5.12520042530811e-07, "logits/chosen": -1.402719259262085, "logits/rejected": -1.3787992000579834, "logps/chosen": -7.979246616363525, "logps/rejected": -7.966032981872559, "loss": 9.0256, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -7.979246616363525, "rewards/margins": -0.013212683610618114, "rewards/rejected": -7.966032981872559, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 15.237628673130487, "learning_rate": 5.046262852292346e-07, "logits/chosen": -1.3872135877609253, "logits/rejected": -1.395935297012329, "logps/chosen": -8.034635543823242, "logps/rejected": -8.069303512573242, "loss": 9.0268, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -8.034635543823242, "rewards/margins": 0.03466759994626045, "rewards/rejected": -8.069303512573242, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 11.298592435998462, "learning_rate": 4.964582201835856e-07, "logits/chosen": -1.396750569343567, "logits/rejected": -1.3891570568084717, "logps/chosen": -7.99398946762085, "logps/rejected": -8.040716171264648, "loss": 9.0073, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.99398946762085, "rewards/margins": 0.04672648385167122, "rewards/rejected": -8.040716171264648, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 12.492415372530475, "learning_rate": 4.880267969328908e-07, "logits/chosen": -1.3683674335479736, "logits/rejected": -1.3726252317428589, "logps/chosen": -8.114925384521484, "logps/rejected": -8.097586631774902, "loss": 9.0856, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -8.114925384521484, "rewards/margins": -0.01733933761715889, "rewards/rejected": -8.097586631774902, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 12.610496367889976, "learning_rate": 4.793433180558423e-07, "logits/chosen": -1.3843915462493896, "logits/rejected": -1.3853034973144531, "logps/chosen": -7.956766605377197, "logps/rejected": -7.944356441497803, "loss": 9.0054, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.956766605377197, "rewards/margins": -0.012410154566168785, "rewards/rejected": -7.944356441497803, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 15.598692092405715, "learning_rate": 4.704194240193467e-07, "logits/chosen": -1.3554438352584839, "logits/rejected": -1.372804880142212, "logps/chosen": -8.031749725341797, "logps/rejected": -8.155205726623535, "loss": 8.9878, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.031749725341797, "rewards/margins": 0.12345610558986664, "rewards/rejected": -8.155205726623535, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 13.474501957199323, "learning_rate": 4.6126707757412686e-07, "logits/chosen": -1.3345744609832764, "logits/rejected": -1.3397581577301025, "logps/chosen": -7.977494716644287, "logps/rejected": -8.02932357788086, "loss": 8.9482, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -7.977494716644287, "rewards/margins": 0.051828037947416306, "rewards/rejected": -8.02932357788086, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 11.929724403265839, "learning_rate": 4.5189854771829086e-07, "logits/chosen": -1.3528499603271484, "logits/rejected": -1.3492704629898071, "logps/chosen": -7.803788661956787, "logps/rejected": -7.93734073638916, "loss": 8.9516, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -7.803788661956787, "rewards/margins": 0.1335521936416626, "rewards/rejected": -7.93734073638916, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 14.327437395286285, "learning_rate": 4.4232639325036807e-07, "logits/chosen": -1.3263393640518188, "logits/rejected": -1.3331449031829834, "logps/chosen": -8.183530807495117, "logps/rejected": -8.074382781982422, "loss": 9.054, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.183530807495117, "rewards/margins": -0.10914800316095352, "rewards/rejected": -8.074382781982422, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 12.623357323327125, "learning_rate": 4.32563445933859e-07, "logits/chosen": -1.3866218328475952, "logits/rejected": -1.376103401184082, "logps/chosen": -7.869284152984619, "logps/rejected": -7.980343818664551, "loss": 9.0216, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.869284152984619, "rewards/margins": 0.11105932295322418, "rewards/rejected": -7.980343818664551, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 15.673764218634288, "learning_rate": 4.226227932958664e-07, "logits/chosen": -1.3467977046966553, "logits/rejected": -1.3465808629989624, "logps/chosen": -7.946604251861572, "logps/rejected": -8.12873363494873, "loss": 8.9418, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.946604251861572, "rewards/margins": 0.18213000893592834, "rewards/rejected": -8.12873363494873, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 20.82547017360473, "learning_rate": 4.1251776108286854e-07, "logits/chosen": -1.3276244401931763, "logits/rejected": -1.3366806507110596, "logps/chosen": -7.942746639251709, "logps/rejected": -8.075704574584961, "loss": 8.992, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.942746639251709, "rewards/margins": 0.13295890390872955, "rewards/rejected": -8.075704574584961, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 11.77567830972404, "learning_rate": 4.022618953971514e-07, "logits/chosen": -1.3542811870574951, "logits/rejected": -1.3621467351913452, "logps/chosen": -7.741019248962402, "logps/rejected": -8.169224739074707, "loss": 8.9028, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -7.741019248962402, "rewards/margins": 0.42820531129837036, "rewards/rejected": -8.169224739074707, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 13.792748846310712, "learning_rate": 3.918689445378477e-07, "logits/chosen": -1.3647044897079468, "logits/rejected": -1.3888493776321411, "logps/chosen": -7.679605960845947, "logps/rejected": -7.820864677429199, "loss": 9.0059, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.679605960845947, "rewards/margins": 0.14125962555408478, "rewards/rejected": -7.820864677429199, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 10.698537268464346, "learning_rate": 3.813528405709251e-07, "logits/chosen": -1.3668994903564453, "logits/rejected": -1.370476484298706, "logps/chosen": -7.723212242126465, "logps/rejected": -7.974145412445068, "loss": 8.9131, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -7.723212242126465, "rewards/margins": 0.25093379616737366, "rewards/rejected": -7.974145412445068, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 12.476277662413903, "learning_rate": 3.707276806528282e-07, "logits/chosen": -1.37067449092865, "logits/rejected": -1.3700437545776367, "logps/chosen": -8.093690872192383, "logps/rejected": -8.251599311828613, "loss": 9.068, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.093690872192383, "rewards/margins": 0.157908633351326, "rewards/rejected": -8.251599311828613, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 12.703214615987921, "learning_rate": 3.6000770813281334e-07, "logits/chosen": -1.3918092250823975, "logits/rejected": -1.3941457271575928, "logps/chosen": -7.891854286193848, "logps/rejected": -8.121790885925293, "loss": 8.9911, "rewards/accuracies": 0.53125, "rewards/chosen": -7.891854286193848, "rewards/margins": 0.22993668913841248, "rewards/rejected": -8.121790885925293, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 18.912862114031174, "learning_rate": 3.4920729345930654e-07, "logits/chosen": -1.3598334789276123, "logits/rejected": -1.3656227588653564, "logps/chosen": -7.972811698913574, "logps/rejected": -8.120051383972168, "loss": 9.0708, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -7.972811698913574, "rewards/margins": 0.14723989367485046, "rewards/rejected": -8.120051383972168, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 15.322600609417346, "learning_rate": 3.383409149158814e-07, "logits/chosen": -1.3441493511199951, "logits/rejected": -1.3492319583892822, "logps/chosen": -8.092975616455078, "logps/rejected": -8.160036087036133, "loss": 8.9194, "rewards/accuracies": 0.53125, "rewards/chosen": -8.092975616455078, "rewards/margins": 0.06706006824970245, "rewards/rejected": -8.160036087036133, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 16.724538535729355, "learning_rate": 3.2742313921268035e-07, "logits/chosen": -1.3152296543121338, "logits/rejected": -1.3239524364471436, "logps/chosen": -7.889418601989746, "logps/rejected": -8.20849323272705, "loss": 8.8184, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.889418601989746, "rewards/margins": 0.31907448172569275, "rewards/rejected": -8.20849323272705, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 12.327867536896116, "learning_rate": 3.1646860195929825e-07, "logits/chosen": -1.3065917491912842, "logits/rejected": -1.3107439279556274, "logps/chosen": -8.116486549377441, "logps/rejected": -8.308655738830566, "loss": 8.9949, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.116486549377441, "rewards/margins": 0.19216908514499664, "rewards/rejected": -8.308655738830566, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 14.17754725379555, "learning_rate": 3.054919880453032e-07, "logits/chosen": -1.246124029159546, "logits/rejected": -1.2508999109268188, "logps/chosen": -7.7648186683654785, "logps/rejected": -8.22431755065918, "loss": 8.941, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.7648186683654785, "rewards/margins": 0.4594977796077728, "rewards/rejected": -8.22431755065918, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 11.969966746660198, "learning_rate": 2.9450801195469686e-07, "logits/chosen": -1.3018732070922852, "logits/rejected": -1.3149497509002686, "logps/chosen": -7.904818058013916, "logps/rejected": -8.152360916137695, "loss": 8.9657, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.904818058013916, "rewards/margins": 0.24754443764686584, "rewards/rejected": -8.152360916137695, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 14.47186665684816, "learning_rate": 2.835313980407017e-07, "logits/chosen": -1.3108150959014893, "logits/rejected": -1.288703441619873, "logps/chosen": -8.249927520751953, "logps/rejected": -8.318041801452637, "loss": 9.0073, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.249927520751953, "rewards/margins": 0.06811434030532837, "rewards/rejected": -8.318041801452637, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 26.602745593974163, "learning_rate": 2.7257686078731973e-07, "logits/chosen": -1.337909460067749, "logits/rejected": -1.348547339439392, "logps/chosen": -7.881032466888428, "logps/rejected": -8.068848609924316, "loss": 8.8981, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -7.881032466888428, "rewards/margins": 0.18781575560569763, "rewards/rejected": -8.068848609924316, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 14.906273538361356, "learning_rate": 2.6165908508411857e-07, "logits/chosen": -1.3503994941711426, "logits/rejected": -1.3676143884658813, "logps/chosen": -7.861943244934082, "logps/rejected": -8.101309776306152, "loss": 8.9213, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.861943244934082, "rewards/margins": 0.23936741054058075, "rewards/rejected": -8.101309776306152, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 14.643252229490672, "learning_rate": 2.5079270654069354e-07, "logits/chosen": -1.3024542331695557, "logits/rejected": -1.3081843852996826, "logps/chosen": -7.836719512939453, "logps/rejected": -8.08849048614502, "loss": 8.8721, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -7.836719512939453, "rewards/margins": 0.251770943403244, "rewards/rejected": -8.08849048614502, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 12.350106404715637, "learning_rate": 2.399922918671867e-07, "logits/chosen": -1.337571620941162, "logits/rejected": -1.3552089929580688, "logps/chosen": -7.821458339691162, "logps/rejected": -8.146204948425293, "loss": 8.9032, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.821458339691162, "rewards/margins": 0.3247470557689667, "rewards/rejected": -8.146204948425293, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 13.544262102627407, "learning_rate": 2.2927231934717176e-07, "logits/chosen": -1.331067442893982, "logits/rejected": -1.3430246114730835, "logps/chosen": -7.9300737380981445, "logps/rejected": -8.060845375061035, "loss": 8.9735, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -7.9300737380981445, "rewards/margins": 0.1307719349861145, "rewards/rejected": -8.060845375061035, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 12.721635836612304, "learning_rate": 2.1864715942907487e-07, "logits/chosen": -1.299328088760376, "logits/rejected": -1.3065472841262817, "logps/chosen": -7.961094856262207, "logps/rejected": -8.206907272338867, "loss": 8.9027, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.961094856262207, "rewards/margins": 0.24581179022789001, "rewards/rejected": -8.206907272338867, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 12.545308362098348, "learning_rate": 2.081310554621522e-07, "logits/chosen": -1.3111393451690674, "logits/rejected": -1.335069179534912, "logps/chosen": -8.182366371154785, "logps/rejected": -8.390935897827148, "loss": 9.0314, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.182366371154785, "rewards/margins": 0.20856896042823792, "rewards/rejected": -8.390935897827148, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 15.575558913925711, "learning_rate": 1.9773810460284862e-07, "logits/chosen": -1.3477294445037842, "logits/rejected": -1.3550546169281006, "logps/chosen": -8.07054328918457, "logps/rejected": -8.061942100524902, "loss": 9.0612, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.07054328918457, "rewards/margins": -0.008599767461419106, "rewards/rejected": -8.061942100524902, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 20.765211421302535, "learning_rate": 1.874822389171314e-07, "logits/chosen": -1.3256926536560059, "logits/rejected": -1.3409112691879272, "logps/chosen": -7.885645389556885, "logps/rejected": -8.124526977539062, "loss": 8.8864, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -7.885645389556885, "rewards/margins": 0.2388812005519867, "rewards/rejected": -8.124526977539062, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 14.208218028523063, "learning_rate": 1.7737720670413356e-07, "logits/chosen": -1.344118595123291, "logits/rejected": -1.336096167564392, "logps/chosen": -8.159255981445312, "logps/rejected": -8.0567045211792, "loss": 8.9837, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.159255981445312, "rewards/margins": -0.10255154222249985, "rewards/rejected": -8.0567045211792, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 14.61227257116642, "learning_rate": 1.6743655406614095e-07, "logits/chosen": -1.340541958808899, "logits/rejected": -1.3474371433258057, "logps/chosen": -8.056330680847168, "logps/rejected": -8.348928451538086, "loss": 8.9222, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.056330680847168, "rewards/margins": 0.29259705543518066, "rewards/rejected": -8.348928451538086, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 13.778075151913542, "learning_rate": 1.5767360674963198e-07, "logits/chosen": -1.3218133449554443, "logits/rejected": -1.3337442874908447, "logps/chosen": -7.961134910583496, "logps/rejected": -7.996614933013916, "loss": 9.0247, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.961134910583496, "rewards/margins": 0.035479746758937836, "rewards/rejected": -7.996614933013916, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 13.684086792814428, "learning_rate": 1.4810145228170922e-07, "logits/chosen": -1.3398381471633911, "logits/rejected": -1.3437585830688477, "logps/chosen": -7.856637001037598, "logps/rejected": -8.111886978149414, "loss": 8.8913, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -7.856637001037598, "rewards/margins": 0.25525030493736267, "rewards/rejected": -8.111886978149414, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 15.39649445200101, "learning_rate": 1.3873292242587306e-07, "logits/chosen": -1.3376450538635254, "logits/rejected": -1.3476964235305786, "logps/chosen": -8.228338241577148, "logps/rejected": -8.340727806091309, "loss": 9.0269, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.228338241577148, "rewards/margins": 0.11239071190357208, "rewards/rejected": -8.340727806091309, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 15.302013253785537, "learning_rate": 1.295805759806533e-07, "logits/chosen": -1.3724461793899536, "logits/rejected": -1.3841075897216797, "logps/chosen": -8.054750442504883, "logps/rejected": -8.403682708740234, "loss": 9.0089, "rewards/accuracies": 0.5, "rewards/chosen": -8.054750442504883, "rewards/margins": 0.3489326238632202, "rewards/rejected": -8.403682708740234, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 18.608453972243662, "learning_rate": 1.2065668194415777e-07, "logits/chosen": -1.3417284488677979, "logits/rejected": -1.3348530530929565, "logps/chosen": -7.915482997894287, "logps/rejected": -8.044729232788086, "loss": 8.9016, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.915482997894287, "rewards/margins": 0.12924641370773315, "rewards/rejected": -8.044729232788086, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 14.900748845819772, "learning_rate": 1.1197320306710923e-07, "logits/chosen": -1.3621351718902588, "logits/rejected": -1.3541442155838013, "logps/chosen": -8.007196426391602, "logps/rejected": -7.965734004974365, "loss": 8.9062, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.007196426391602, "rewards/margins": -0.04146287590265274, "rewards/rejected": -7.965734004974365, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 11.569520650790327, "learning_rate": 1.035417798164145e-07, "logits/chosen": -1.3260619640350342, "logits/rejected": -1.3356263637542725, "logps/chosen": -7.753990173339844, "logps/rejected": -8.039525985717773, "loss": 8.8536, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.753990173339844, "rewards/margins": 0.2855362296104431, "rewards/rejected": -8.039525985717773, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 13.480030507608214, "learning_rate": 9.537371477076535e-08, "logits/chosen": -1.2944828271865845, "logits/rejected": -1.2956254482269287, "logps/chosen": -7.926826477050781, "logps/rejected": -7.9895477294921875, "loss": 8.9487, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.926826477050781, "rewards/margins": 0.06272158026695251, "rewards/rejected": -7.9895477294921875, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 14.634365970472302, "learning_rate": 8.747995746918898e-08, "logits/chosen": -1.3467233180999756, "logits/rejected": -1.3351846933364868, "logps/chosen": -8.043527603149414, "logps/rejected": -8.186015129089355, "loss": 8.9627, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.043527603149414, "rewards/margins": 0.14248715341091156, "rewards/rejected": -8.186015129089355, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 28.38170473677795, "learning_rate": 7.987108973285888e-08, "logits/chosen": -1.3258306980133057, "logits/rejected": -1.3155487775802612, "logps/chosen": -8.005027770996094, "logps/rejected": -8.246636390686035, "loss": 8.9413, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.005027770996094, "rewards/margins": 0.24160809814929962, "rewards/rejected": -8.246636390686035, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 13.711915418794124, "learning_rate": 7.255731147984174e-08, "logits/chosen": -1.3438084125518799, "logits/rejected": -1.297163963317871, "logps/chosen": -8.208559036254883, "logps/rejected": -8.363499641418457, "loss": 8.942, "rewards/accuracies": 0.5625, "rewards/chosen": -8.208559036254883, "rewards/margins": 0.15493938326835632, "rewards/rejected": -8.363499641418457, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 13.642711731891415, "learning_rate": 6.554842705179898e-08, "logits/chosen": -1.3352845907211304, "logits/rejected": -1.3314430713653564, "logps/chosen": -8.112469673156738, "logps/rejected": -8.209820747375488, "loss": 8.9588, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.112469673156738, "rewards/margins": 0.09735036641359329, "rewards/rejected": -8.209820747375488, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 14.269345053816819, "learning_rate": 5.885383207096832e-08, "logits/chosen": -1.3467012643814087, "logits/rejected": -1.3490493297576904, "logps/chosen": -7.833376884460449, "logps/rejected": -8.030352592468262, "loss": 8.8689, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -7.833376884460449, "rewards/margins": 0.1969761848449707, "rewards/rejected": -8.030352592468262, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 16.14618551872646, "learning_rate": 5.2482500845047165e-08, "logits/chosen": -1.3177175521850586, "logits/rejected": -1.3296372890472412, "logps/chosen": -7.635066032409668, "logps/rejected": -7.791895866394043, "loss": 8.9076, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -7.635066032409668, "rewards/margins": 0.15682990849018097, "rewards/rejected": -7.791895866394043, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 12.278193076130206, "learning_rate": 4.644297433686162e-08, "logits/chosen": -1.3246910572052002, "logits/rejected": -1.315019965171814, "logps/chosen": -7.837827205657959, "logps/rejected": -7.908313751220703, "loss": 8.951, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -7.837827205657959, "rewards/margins": 0.0704866498708725, "rewards/rejected": -7.908313751220703, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 12.786235556241849, "learning_rate": 4.074334871494558e-08, "logits/chosen": -1.3545995950698853, "logits/rejected": -1.3624496459960938, "logps/chosen": -8.024687767028809, "logps/rejected": -8.172109603881836, "loss": 8.9198, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.024687767028809, "rewards/margins": 0.1474229097366333, "rewards/rejected": -8.172109603881836, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 30.758577870183032, "learning_rate": 3.5391264500382e-08, "logits/chosen": -1.360478401184082, "logits/rejected": -1.3552910089492798, "logps/chosen": -7.844922065734863, "logps/rejected": -7.852625846862793, "loss": 8.8997, "rewards/accuracies": 0.53125, "rewards/chosen": -7.844922065734863, "rewards/margins": 0.0077047706581652164, "rewards/rejected": -7.852625846862793, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 14.260374307768236, "learning_rate": 3.0393896324452226e-08, "logits/chosen": -1.372036337852478, "logits/rejected": -1.3762390613555908, "logps/chosen": -7.982748508453369, "logps/rejected": -8.225188255310059, "loss": 8.9748, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -7.982748508453369, "rewards/margins": 0.24244041740894318, "rewards/rejected": -8.225188255310059, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 14.09879602927017, "learning_rate": 2.5757943310825026e-08, "logits/chosen": -1.3225996494293213, "logits/rejected": -1.3161330223083496, "logps/chosen": -7.865872859954834, "logps/rejected": -7.931491851806641, "loss": 8.9372, "rewards/accuracies": 0.5, "rewards/chosen": -7.865872859954834, "rewards/margins": 0.06561894714832306, "rewards/rejected": -7.931491851806641, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 13.351401390808332, "learning_rate": 2.148962009517823e-08, "logits/chosen": -1.342071771621704, "logits/rejected": -1.337024450302124, "logps/chosen": -8.03447151184082, "logps/rejected": -8.085325241088867, "loss": 8.9767, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -8.03447151184082, "rewards/margins": 0.050852321088314056, "rewards/rejected": -8.085325241088867, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 16.02345328859732, "learning_rate": 1.759464849429082e-08, "logits/chosen": -1.3405394554138184, "logits/rejected": -1.3419816493988037, "logps/chosen": -7.878898620605469, "logps/rejected": -8.002215385437012, "loss": 8.9292, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.878898620605469, "rewards/margins": 0.123316690325737, "rewards/rejected": -8.002215385437012, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 13.971661978504134, "learning_rate": 1.4078249835774169e-08, "logits/chosen": -1.3646373748779297, "logits/rejected": -1.3699538707733154, "logps/chosen": -7.937603950500488, "logps/rejected": -8.069661140441895, "loss": 8.8372, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -7.937603950500488, "rewards/margins": 0.13205692172050476, "rewards/rejected": -8.069661140441895, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 13.636923891581842, "learning_rate": 1.0945137958723705e-08, "logits/chosen": -1.3303980827331543, "logits/rejected": -1.3274564743041992, "logps/chosen": -8.00455379486084, "logps/rejected": -8.096671104431152, "loss": 8.9997, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -8.00455379486084, "rewards/margins": 0.09211695194244385, "rewards/rejected": -8.096671104431152, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 14.66331138432002, "learning_rate": 8.19951289467482e-09, "logits/chosen": -1.3527616262435913, "logits/rejected": -1.352975606918335, "logps/chosen": -7.898123741149902, "logps/rejected": -8.020647048950195, "loss": 8.9114, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.898123741149902, "rewards/margins": 0.1225227564573288, "rewards/rejected": -8.020647048950195, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 34.64920022108061, "learning_rate": 5.84505523733293e-09, "logits/chosen": -1.3027703762054443, "logits/rejected": -1.2922091484069824, "logps/chosen": -8.017878532409668, "logps/rejected": -8.019991874694824, "loss": 9.0038, "rewards/accuracies": 0.46875, "rewards/chosen": -8.017878532409668, "rewards/margins": 0.0021121830213814974, "rewards/rejected": -8.019991874694824, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 14.096689301269398, "learning_rate": 3.8849212086261466e-09, "logits/chosen": -1.3568954467773438, "logits/rejected": -1.345536231994629, "logps/chosen": -7.817251682281494, "logps/rejected": -8.18480110168457, "loss": 8.9022, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -7.817251682281494, "rewards/margins": 0.3675496578216553, "rewards/rejected": -8.18480110168457, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 21.948748802651522, "learning_rate": 2.3217384276938756e-09, "logits/chosen": -1.3387937545776367, "logits/rejected": -1.349258542060852, "logps/chosen": -7.9868292808532715, "logps/rejected": -8.197335243225098, "loss": 8.8854, "rewards/accuracies": 0.53125, "rewards/chosen": -7.9868292808532715, "rewards/margins": 0.21050508320331573, "rewards/rejected": -8.197335243225098, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 12.715751305789052, "learning_rate": 1.1576023884836472e-09, "logits/chosen": -1.3674533367156982, "logits/rejected": -1.3665874004364014, "logps/chosen": -8.10934066772461, "logps/rejected": -8.27099323272705, "loss": 8.9853, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.10934066772461, "rewards/margins": 0.16165266931056976, "rewards/rejected": -8.27099323272705, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 13.20358280327505, "learning_rate": 3.940736506780395e-10, "logits/chosen": -1.348550796508789, "logits/rejected": -1.3657060861587524, "logps/chosen": -7.707891941070557, "logps/rejected": -7.990015983581543, "loss": 8.9804, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -7.707891941070557, "rewards/margins": 0.2821243703365326, "rewards/rejected": -7.990015983581543, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 17.668181816444864, "learning_rate": 3.2175747716822744e-11, "logits/chosen": -1.3433798551559448, "logits/rejected": -1.3304665088653564, "logps/chosen": -8.101046562194824, "logps/rejected": -8.15410041809082, "loss": 8.9813, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.101046562194824, "rewards/margins": 0.05305204540491104, "rewards/rejected": -8.15410041809082, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 8.967987340451286, "train_runtime": 8184.2286, "train_samples_per_second": 7.47, "train_steps_per_second": 0.058 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }