{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9986987638256344, "eval_steps": 100, "global_step": 768, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002602472348731295, "grad_norm": 9.0, "learning_rate": 6.493506493506494e-09, "logits/chosen": 411.1029357910156, "logits/rejected": 362.02178955078125, "logps/chosen": -352.47296142578125, "logps/rejected": -387.4255065917969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.026024723487312947, "grad_norm": 8.5625, "learning_rate": 6.493506493506492e-08, "logits/chosen": 378.90606689453125, "logits/rejected": 347.66900634765625, "logps/chosen": -288.4970397949219, "logps/rejected": -333.1199645996094, "loss": 0.7101, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.0017441289965063334, "rewards/margins": 0.01993246003985405, "rewards/rejected": -0.01818833500146866, "step": 10 }, { "epoch": 0.05204944697462589, "grad_norm": 8.75, "learning_rate": 1.2987012987012984e-07, "logits/chosen": 381.0403137207031, "logits/rejected": 360.0636291503906, "logps/chosen": -274.6163024902344, "logps/rejected": -299.0308532714844, "loss": 0.7185, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.016971342265605927, "rewards/margins": -0.047712840139865875, "rewards/rejected": 0.030741501599550247, "step": 20 }, { "epoch": 0.07807417046193885, "grad_norm": 9.0625, "learning_rate": 1.948051948051948e-07, "logits/chosen": 382.80517578125, "logits/rejected": 375.8597412109375, "logps/chosen": -305.6173095703125, "logps/rejected": -332.44134521484375, "loss": 0.709, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.005826466716825962, "rewards/margins": -0.014216383919119835, "rewards/rejected": 0.00838992465287447, "step": 30 }, { "epoch": 0.10409889394925179, "grad_norm": 8.125, "learning_rate": 2.597402597402597e-07, "logits/chosen": 368.2964172363281, "logits/rejected": 368.75994873046875, "logps/chosen": -303.63433837890625, "logps/rejected": -310.3212890625, "loss": 0.7122, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0020552538335323334, "rewards/margins": 0.017374467104673386, "rewards/rejected": -0.019429724663496017, "step": 40 }, { "epoch": 0.13012361743656473, "grad_norm": 10.0, "learning_rate": 3.2467532467532465e-07, "logits/chosen": 374.7029724121094, "logits/rejected": 367.4124450683594, "logps/chosen": -314.57452392578125, "logps/rejected": -332.7318420410156, "loss": 0.7084, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.022785179316997528, "rewards/margins": 0.039472438395023346, "rewards/rejected": -0.01668725535273552, "step": 50 }, { "epoch": 0.1561483409238777, "grad_norm": 9.0, "learning_rate": 3.896103896103896e-07, "logits/chosen": 370.33184814453125, "logits/rejected": 370.1338806152344, "logps/chosen": -295.30523681640625, "logps/rejected": -298.3065185546875, "loss": 0.6999, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.04296935349702835, "rewards/margins": 0.012664027512073517, "rewards/rejected": 0.03030533157289028, "step": 60 }, { "epoch": 0.18217306441119063, "grad_norm": 9.3125, "learning_rate": 4.545454545454545e-07, "logits/chosen": 411.93780517578125, "logits/rejected": 376.9971923828125, "logps/chosen": -285.2339172363281, "logps/rejected": -330.7354431152344, "loss": 0.7152, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.061416469514369965, "rewards/margins": 0.01710355281829834, "rewards/rejected": 0.044312912970781326, "step": 70 }, { "epoch": 0.20819778789850357, "grad_norm": 9.6875, "learning_rate": 4.999767464405451e-07, "logits/chosen": 388.18426513671875, "logits/rejected": 372.40509033203125, "logps/chosen": -304.78607177734375, "logps/rejected": -350.2904357910156, "loss": 0.7012, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0943768322467804, "rewards/margins": 0.008914275094866753, "rewards/rejected": 0.0854625552892685, "step": 80 }, { "epoch": 0.2342225113858165, "grad_norm": 9.875, "learning_rate": 4.995634701567891e-07, "logits/chosen": 383.4454040527344, "logits/rejected": 348.91796875, "logps/chosen": -281.39263916015625, "logps/rejected": -323.1919860839844, "loss": 0.704, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.15481603145599365, "rewards/margins": 0.03042496182024479, "rewards/rejected": 0.12439107894897461, "step": 90 }, { "epoch": 0.26024723487312945, "grad_norm": 10.0625, "learning_rate": 4.986344312601082e-07, "logits/chosen": 387.20343017578125, "logits/rejected": 395.9942932128906, "logps/chosen": -293.97088623046875, "logps/rejected": -312.58624267578125, "loss": 0.7052, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.19738170504570007, "rewards/margins": 0.052183426916599274, "rewards/rejected": 0.14519831538200378, "step": 100 }, { "epoch": 0.26024723487312945, "eval_logits/chosen": 454.44757080078125, "eval_logits/rejected": 437.89306640625, "eval_logps/chosen": -285.4035339355469, "eval_logps/rejected": -320.2303161621094, "eval_loss": 0.7031933069229126, "eval_rewards/accuracies": 0.5147929191589355, "eval_rewards/chosen": 0.18281590938568115, "eval_rewards/margins": 0.04541104659438133, "eval_rewards/rejected": 0.13740485906600952, "eval_runtime": 253.4391, "eval_samples_per_second": 21.331, "eval_steps_per_second": 0.667, "step": 100 }, { "epoch": 0.28627195836044245, "grad_norm": 9.5625, "learning_rate": 4.971915497571788e-07, "logits/chosen": 383.96282958984375, "logits/rejected": 361.9601135253906, "logps/chosen": -300.56646728515625, "logps/rejected": -338.2516174316406, "loss": 0.6997, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.2231082022190094, "rewards/margins": 0.06056695431470871, "rewards/rejected": 0.1625412404537201, "step": 110 }, { "epoch": 0.3122966818477554, "grad_norm": 9.6875, "learning_rate": 4.952378075921676e-07, "logits/chosen": 384.23895263671875, "logits/rejected": 363.35589599609375, "logps/chosen": -306.78302001953125, "logps/rejected": -331.3634033203125, "loss": 0.6849, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.18709978461265564, "rewards/margins": 0.051194410771131516, "rewards/rejected": 0.13590538501739502, "step": 120 }, { "epoch": 0.3383214053350683, "grad_norm": 8.625, "learning_rate": 4.927772424840701e-07, "logits/chosen": 397.5968017578125, "logits/rejected": 365.3117980957031, "logps/chosen": -282.23553466796875, "logps/rejected": -322.7259216308594, "loss": 0.6841, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.23038265109062195, "rewards/margins": 0.12032196670770645, "rewards/rejected": 0.1100606918334961, "step": 130 }, { "epoch": 0.36434612882238127, "grad_norm": 9.6875, "learning_rate": 4.898149395821217e-07, "logits/chosen": 390.6437072753906, "logits/rejected": 364.450439453125, "logps/chosen": -276.3985900878906, "logps/rejected": -327.2912292480469, "loss": 0.6807, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.2373248040676117, "rewards/margins": 0.10833799839019775, "rewards/rejected": 0.12898679077625275, "step": 140 }, { "epoch": 0.3903708523096942, "grad_norm": 8.25, "learning_rate": 4.863570209565277e-07, "logits/chosen": 382.4102478027344, "logits/rejected": 376.72918701171875, "logps/chosen": -300.83026123046875, "logps/rejected": -310.814697265625, "loss": 0.6788, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.2639835476875305, "rewards/margins": 0.0948825255036354, "rewards/rejected": 0.1691010296344757, "step": 150 }, { "epoch": 0.41639557579700714, "grad_norm": 9.5, "learning_rate": 4.824106329462312e-07, "logits/chosen": 358.6318664550781, "logits/rejected": 373.10601806640625, "logps/chosen": -314.7348327636719, "logps/rejected": -300.9188232421875, "loss": 0.6969, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.17297904193401337, "rewards/margins": -0.0854780301451683, "rewards/rejected": 0.25845709443092346, "step": 160 }, { "epoch": 0.4424202992843201, "grad_norm": 9.0625, "learning_rate": 4.779839313898674e-07, "logits/chosen": 383.4333190917969, "logits/rejected": 393.3439025878906, "logps/chosen": -330.3350524902344, "logps/rejected": -342.2207946777344, "loss": 0.6778, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.19442406296730042, "rewards/margins": 0.1012936383485794, "rewards/rejected": 0.09313042461872101, "step": 170 }, { "epoch": 0.468445022771633, "grad_norm": 9.75, "learning_rate": 4.730860647704252e-07, "logits/chosen": 382.30145263671875, "logits/rejected": 371.52410888671875, "logps/chosen": -282.8478088378906, "logps/rejected": -313.7384948730469, "loss": 0.6791, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.27071475982666016, "rewards/margins": 0.12174586206674576, "rewards/rejected": 0.14896893501281738, "step": 180 }, { "epoch": 0.494469746258946, "grad_norm": 8.25, "learning_rate": 4.677271553084514e-07, "logits/chosen": 369.3918151855469, "logits/rejected": 352.20196533203125, "logps/chosen": -303.9125671386719, "logps/rejected": -319.8887939453125, "loss": 0.6857, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.14780566096305847, "rewards/margins": 0.09987500309944153, "rewards/rejected": 0.04793066531419754, "step": 190 }, { "epoch": 0.5204944697462589, "grad_norm": 9.1875, "learning_rate": 4.619182780428723e-07, "logits/chosen": 383.06085205078125, "logits/rejected": 361.99041748046875, "logps/chosen": -280.6048278808594, "logps/rejected": -300.1059875488281, "loss": 0.6851, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1767120063304901, "rewards/margins": 0.10154225677251816, "rewards/rejected": 0.07516975700855255, "step": 200 }, { "epoch": 0.5204944697462589, "eval_logits/chosen": 453.2177429199219, "eval_logits/rejected": 436.5673522949219, "eval_logps/chosen": -285.99169921875, "eval_logps/rejected": -322.4987487792969, "eval_loss": 0.6793849468231201, "eval_rewards/accuracies": 0.5991124510765076, "eval_rewards/chosen": 0.15340714156627655, "eval_rewards/margins": 0.12942208349704742, "eval_rewards/rejected": 0.02398504503071308, "eval_runtime": 244.8119, "eval_samples_per_second": 22.082, "eval_steps_per_second": 0.69, "step": 200 }, { "epoch": 0.5465191932335719, "grad_norm": 9.375, "learning_rate": 4.5567143794266337e-07, "logits/chosen": 400.5397033691406, "logits/rejected": 364.52850341796875, "logps/chosen": -275.1554870605469, "logps/rejected": -342.13067626953125, "loss": 0.6697, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12358863651752472, "rewards/margins": 0.12654754519462585, "rewards/rejected": -0.002958917524665594, "step": 210 }, { "epoch": 0.5725439167208849, "grad_norm": 8.375, "learning_rate": 4.4899954509667134e-07, "logits/chosen": 376.6017150878906, "logits/rejected": 354.7825622558594, "logps/chosen": -288.1871337890625, "logps/rejected": -336.48321533203125, "loss": 0.6807, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1387133151292801, "rewards/margins": 0.1342148333787918, "rewards/rejected": 0.004498471971601248, "step": 220 }, { "epoch": 0.5985686402081978, "grad_norm": 10.1875, "learning_rate": 4.4191638803286144e-07, "logits/chosen": 372.1169128417969, "logits/rejected": 363.5325012207031, "logps/chosen": -299.6369934082031, "logps/rejected": -315.469970703125, "loss": 0.6743, "rewards/accuracies": 0.59375, "rewards/chosen": 0.11066161096096039, "rewards/margins": 0.11400572210550308, "rewards/rejected": -0.0033441067207604647, "step": 230 }, { "epoch": 0.6245933636955108, "grad_norm": 8.4375, "learning_rate": 4.3443660522213157e-07, "logits/chosen": 385.5279235839844, "logits/rejected": 365.88372802734375, "logps/chosen": -293.209228515625, "logps/rejected": -317.05718994140625, "loss": 0.6522, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16928274929523468, "rewards/margins": 0.16216634213924408, "rewards/rejected": 0.007116401102393866, "step": 240 }, { "epoch": 0.6506180871828237, "grad_norm": 9.375, "learning_rate": 4.265756548255822e-07, "logits/chosen": 362.12689208984375, "logits/rejected": 352.41986083984375, "logps/chosen": -302.4978332519531, "logps/rejected": -311.34228515625, "loss": 0.6726, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0975576788187027, "rewards/margins": 0.1262621432542801, "rewards/rejected": -0.0287044458091259, "step": 250 }, { "epoch": 0.6766428106701367, "grad_norm": 9.125, "learning_rate": 4.1834978274776867e-07, "logits/chosen": 379.18072509765625, "logits/rejected": 365.29168701171875, "logps/chosen": -290.32928466796875, "logps/rejected": -310.4072265625, "loss": 0.6705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08413325250148773, "rewards/margins": 0.1175273060798645, "rewards/rejected": -0.033394038677215576, "step": 260 }, { "epoch": 0.7026675341574495, "grad_norm": 11.3125, "learning_rate": 4.0977598906195386e-07, "logits/chosen": 375.8214416503906, "logits/rejected": 360.27264404296875, "logps/chosen": -273.41180419921875, "logps/rejected": -320.2526550292969, "loss": 0.6468, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.08752383291721344, "rewards/margins": 0.22164182364940643, "rewards/rejected": -0.134117990732193, "step": 270 }, { "epoch": 0.7286922576447625, "grad_norm": 11.1875, "learning_rate": 4.00871992876753e-07, "logits/chosen": 373.965087890625, "logits/rejected": 366.2101135253906, "logps/chosen": -300.20367431640625, "logps/rejected": -311.20953369140625, "loss": 0.6494, "rewards/accuracies": 0.59375, "rewards/chosen": 0.033880796283483505, "rewards/margins": 0.18614129722118378, "rewards/rejected": -0.15226049721240997, "step": 280 }, { "epoch": 0.7547169811320755, "grad_norm": 7.90625, "learning_rate": 3.9165619571677645e-07, "logits/chosen": 363.5823059082031, "logits/rejected": 354.50970458984375, "logps/chosen": -297.46087646484375, "logps/rejected": -325.23760986328125, "loss": 0.6671, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09576047956943512, "rewards/margins": 0.17094558477401733, "rewards/rejected": -0.26670604944229126, "step": 290 }, { "epoch": 0.7807417046193884, "grad_norm": 10.5, "learning_rate": 3.8214764349295194e-07, "logits/chosen": 377.7972412109375, "logits/rejected": 362.15045166015625, "logps/chosen": -326.63104248046875, "logps/rejected": -344.16009521484375, "loss": 0.6545, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.101107656955719, "rewards/margins": 0.14511139690876007, "rewards/rejected": -0.24621903896331787, "step": 300 }, { "epoch": 0.7807417046193884, "eval_logits/chosen": 452.603515625, "eval_logits/rejected": 435.95166015625, "eval_logps/chosen": -289.7298583984375, "eval_logps/rejected": -327.5587463378906, "eval_loss": 0.6631770133972168, "eval_rewards/accuracies": 0.5961538553237915, "eval_rewards/chosen": -0.03350303694605827, "eval_rewards/margins": 0.19551357626914978, "eval_rewards/rejected": -0.22901661694049835, "eval_runtime": 244.6727, "eval_samples_per_second": 22.095, "eval_steps_per_second": 0.691, "step": 300 }, { "epoch": 0.8067664281067014, "grad_norm": 7.53125, "learning_rate": 3.7236598714111955e-07, "logits/chosen": 378.12774658203125, "logits/rejected": 344.31512451171875, "logps/chosen": -293.63519287109375, "logps/rejected": -344.9774169921875, "loss": 0.6417, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05039479583501816, "rewards/margins": 0.2865242063999176, "rewards/rejected": -0.33691897988319397, "step": 310 }, { "epoch": 0.8327911515940143, "grad_norm": 9.5625, "learning_rate": 3.623314420102467e-07, "logits/chosen": 377.9627685546875, "logits/rejected": 364.48223876953125, "logps/chosen": -284.373779296875, "logps/rejected": -311.5481262207031, "loss": 0.6498, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.042569078505039215, "rewards/margins": 0.20671768486499786, "rewards/rejected": -0.2492867410182953, "step": 320 }, { "epoch": 0.8588158750813273, "grad_norm": 8.6875, "learning_rate": 3.520647460841938e-07, "logits/chosen": 371.0107727050781, "logits/rejected": 363.4261169433594, "logps/chosen": -309.06732177734375, "logps/rejected": -317.2771911621094, "loss": 0.6432, "rewards/accuracies": 0.625, "rewards/chosen": -0.14327314496040344, "rewards/margins": 0.23333874344825745, "rewards/rejected": -0.3766118884086609, "step": 330 }, { "epoch": 0.8848405985686402, "grad_norm": 8.75, "learning_rate": 3.415871171233708e-07, "logits/chosen": 382.25439453125, "logits/rejected": 356.82818603515625, "logps/chosen": -283.73760986328125, "logps/rejected": -317.1445617675781, "loss": 0.6552, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.18387103080749512, "rewards/margins": 0.21161150932312012, "rewards/rejected": -0.39548248052597046, "step": 340 }, { "epoch": 0.9108653220559532, "grad_norm": 9.0625, "learning_rate": 3.309202088148608e-07, "logits/chosen": 386.0477600097656, "logits/rejected": 353.7298278808594, "logps/chosen": -316.064453125, "logps/rejected": -360.25201416015625, "loss": 0.6564, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45992183685302734, "rewards/margins": 0.1660740077495575, "rewards/rejected": -0.6259958148002625, "step": 350 }, { "epoch": 0.936890045543266, "grad_norm": 11.625, "learning_rate": 3.200860660216302e-07, "logits/chosen": 394.1874084472656, "logits/rejected": 357.0087585449219, "logps/chosen": -284.3465270996094, "logps/rejected": -341.0668640136719, "loss": 0.6376, "rewards/accuracies": 0.65625, "rewards/chosen": -0.26841384172439575, "rewards/margins": 0.28497201204299927, "rewards/rejected": -0.5533859133720398, "step": 360 }, { "epoch": 0.962914769030579, "grad_norm": 9.1875, "learning_rate": 3.091070792233124e-07, "logits/chosen": 379.9867248535156, "logits/rejected": 363.60565185546875, "logps/chosen": -330.55035400390625, "logps/rejected": -357.5587158203125, "loss": 0.6562, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.36982443928718567, "rewards/margins": 0.1465008407831192, "rewards/rejected": -0.5163252949714661, "step": 370 }, { "epoch": 0.988939492517892, "grad_norm": 112.0, "learning_rate": 2.9800593824272024e-07, "logits/chosen": 377.43341064453125, "logits/rejected": 351.53948974609375, "logps/chosen": -288.1562194824219, "logps/rejected": -326.09930419921875, "loss": 0.6717, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.4318203032016754, "rewards/margins": 0.1724158674478531, "rewards/rejected": -0.6042361259460449, "step": 380 }, { "epoch": 1.014964216005205, "grad_norm": 11.4375, "learning_rate": 2.8680558535371687e-07, "logits/chosen": 367.515380859375, "logits/rejected": 362.1560974121094, "logps/chosen": -295.6492614746094, "logps/rejected": -316.13189697265625, "loss": 0.6406, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.38629141449928284, "rewards/margins": 0.227385476231575, "rewards/rejected": -0.613676905632019, "step": 390 }, { "epoch": 1.0409889394925178, "grad_norm": 8.75, "learning_rate": 2.755291678673574e-07, "logits/chosen": 382.78900146484375, "logits/rejected": 349.6816101074219, "logps/chosen": -312.42962646484375, "logps/rejected": -368.1940002441406, "loss": 0.6428, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.32089877128601074, "rewards/margins": 0.36698684096336365, "rewards/rejected": -0.6878856420516968, "step": 400 }, { "epoch": 1.0409889394925178, "eval_logits/chosen": 452.67681884765625, "eval_logits/rejected": 436.037109375, "eval_logps/chosen": -296.22650146484375, "eval_logps/rejected": -334.6671142578125, "eval_loss": 0.6531640291213989, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.3583340048789978, "eval_rewards/margins": 0.2260989397764206, "eval_rewards/rejected": -0.5844328999519348, "eval_runtime": 244.5749, "eval_samples_per_second": 22.104, "eval_steps_per_second": 0.691, "step": 400 }, { "epoch": 1.0670136629798308, "grad_norm": 9.8125, "learning_rate": 2.6419999029428816e-07, "logits/chosen": 393.82427978515625, "logits/rejected": 353.9914855957031, "logps/chosen": -282.713623046875, "logps/rejected": -346.3770751953125, "loss": 0.6338, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3410654664039612, "rewards/margins": 0.20948953926563263, "rewards/rejected": -0.5505550503730774, "step": 410 }, { "epoch": 1.0930383864671438, "grad_norm": 8.9375, "learning_rate": 2.5284146618226805e-07, "logits/chosen": 399.9950256347656, "logits/rejected": 378.6581115722656, "logps/chosen": -306.4803466796875, "logps/rejected": -339.96636962890625, "loss": 0.6206, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2285558432340622, "rewards/margins": 0.35167884826660156, "rewards/rejected": -0.580234706401825, "step": 420 }, { "epoch": 1.1190631099544568, "grad_norm": 8.375, "learning_rate": 2.414770697283471e-07, "logits/chosen": 388.2931213378906, "logits/rejected": 368.3885498046875, "logps/chosen": -279.95123291015625, "logps/rejected": -318.58660888671875, "loss": 0.6254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.31028804183006287, "rewards/margins": 0.23735752701759338, "rewards/rejected": -0.5476455092430115, "step": 430 }, { "epoch": 1.1450878334417696, "grad_norm": 11.1875, "learning_rate": 2.3013028726570433e-07, "logits/chosen": 384.466552734375, "logits/rejected": 378.22186279296875, "logps/chosen": -324.28790283203125, "logps/rejected": -317.89752197265625, "loss": 0.6483, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.37500661611557007, "rewards/margins": 0.15624721348285675, "rewards/rejected": -0.5312538743019104, "step": 440 }, { "epoch": 1.1711125569290826, "grad_norm": 10.8125, "learning_rate": 2.1882456872540343e-07, "logits/chosen": 389.3873291015625, "logits/rejected": 368.3778381347656, "logps/chosen": -296.072998046875, "logps/rejected": -338.9192810058594, "loss": 0.6395, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3294864594936371, "rewards/margins": 0.21306844055652618, "rewards/rejected": -0.5425547957420349, "step": 450 }, { "epoch": 1.1971372804163956, "grad_norm": 9.875, "learning_rate": 2.075832791733802e-07, "logits/chosen": 373.2823791503906, "logits/rejected": 373.5628967285156, "logps/chosen": -311.630126953125, "logps/rejected": -316.8753356933594, "loss": 0.6239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2192085236310959, "rewards/margins": 0.2490987330675125, "rewards/rejected": -0.4683072566986084, "step": 460 }, { "epoch": 1.2231620039037086, "grad_norm": 8.4375, "learning_rate": 1.9642965052281615e-07, "logits/chosen": 375.5801696777344, "logits/rejected": 357.92327880859375, "logps/chosen": -288.80816650390625, "logps/rejected": -314.3056335449219, "loss": 0.627, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2664826512336731, "rewards/margins": 0.22547881305217743, "rewards/rejected": -0.4919614791870117, "step": 470 }, { "epoch": 1.2491867273910215, "grad_norm": 11.8125, "learning_rate": 1.8538673352169466e-07, "logits/chosen": 392.02789306640625, "logits/rejected": 378.76702880859375, "logps/chosen": -311.27508544921875, "logps/rejected": -346.3468322753906, "loss": 0.6254, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3370845913887024, "rewards/margins": 0.30168086290359497, "rewards/rejected": -0.6387654542922974, "step": 480 }, { "epoch": 1.2752114508783343, "grad_norm": 7.84375, "learning_rate": 1.7447735011476267e-07, "logits/chosen": 372.53924560546875, "logits/rejected": 353.3471374511719, "logps/chosen": -287.72393798828125, "logps/rejected": -329.53619384765625, "loss": 0.6328, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3344346880912781, "rewards/margins": 0.40995296835899353, "rewards/rejected": -0.7443875670433044, "step": 490 }, { "epoch": 1.3012361743656473, "grad_norm": 8.375, "learning_rate": 1.6372404627835178e-07, "logits/chosen": 376.4175109863281, "logits/rejected": 357.60369873046875, "logps/chosen": -305.36932373046875, "logps/rejected": -334.09979248046875, "loss": 0.6366, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2382354736328125, "rewards/margins": 0.29306212067604065, "rewards/rejected": -0.5312975645065308, "step": 500 }, { "epoch": 1.3012361743656473, "eval_logits/chosen": 452.81121826171875, "eval_logits/rejected": 436.1842956542969, "eval_logps/chosen": -295.1855773925781, "eval_logps/rejected": -334.1830749511719, "eval_loss": 0.6521105170249939, "eval_rewards/accuracies": 0.6124260425567627, "eval_rewards/chosen": -0.30628812313079834, "eval_rewards/margins": 0.2539446949958801, "eval_rewards/rejected": -0.5602327585220337, "eval_runtime": 244.6483, "eval_samples_per_second": 22.097, "eval_steps_per_second": 0.691, "step": 500 }, { "epoch": 1.3272608978529603, "grad_norm": 27.375, "learning_rate": 1.5314904542553098e-07, "logits/chosen": 381.5070495605469, "logits/rejected": 362.86529541015625, "logps/chosen": -311.13287353515625, "logps/rejected": -328.11505126953125, "loss": 0.6295, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2357090413570404, "rewards/margins": 0.3728547990322113, "rewards/rejected": -0.6085638999938965, "step": 510 }, { "epoch": 1.3532856213402733, "grad_norm": 10.75, "learning_rate": 1.4277420247788842e-07, "logits/chosen": 373.0950622558594, "logits/rejected": 353.08367919921875, "logps/chosen": -291.31494140625, "logps/rejected": -334.69415283203125, "loss": 0.6207, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.318630576133728, "rewards/margins": 0.3065961003303528, "rewards/rejected": -0.6252266764640808, "step": 520 }, { "epoch": 1.3793103448275863, "grad_norm": 8.8125, "learning_rate": 1.3262095869885905e-07, "logits/chosen": 385.84002685546875, "logits/rejected": 390.05853271484375, "logps/chosen": -330.57440185546875, "logps/rejected": -347.67303466796875, "loss": 0.624, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3374934792518616, "rewards/margins": 0.22613167762756348, "rewards/rejected": -0.563625156879425, "step": 530 }, { "epoch": 1.405335068314899, "grad_norm": 9.75, "learning_rate": 1.2271029738194257e-07, "logits/chosen": 392.96490478515625, "logits/rejected": 369.8482360839844, "logps/chosen": -286.5850524902344, "logps/rejected": -321.9965515136719, "loss": 0.6266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.23913364112377167, "rewards/margins": 0.265684574842453, "rewards/rejected": -0.5048182606697083, "step": 540 }, { "epoch": 1.431359791802212, "grad_norm": 9.5, "learning_rate": 1.1306270048538966e-07, "logits/chosen": 375.2906799316406, "logits/rejected": 369.0203552246094, "logps/chosen": -302.0591735839844, "logps/rejected": -318.1397399902344, "loss": 0.6149, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.330008327960968, "rewards/margins": 0.2058134377002716, "rewards/rejected": -0.535821795463562, "step": 550 }, { "epoch": 1.457384515289525, "grad_norm": 11.25, "learning_rate": 1.0369810630297657e-07, "logits/chosen": 370.2327575683594, "logits/rejected": 373.281005859375, "logps/chosen": -332.6309814453125, "logps/rejected": -339.60247802734375, "loss": 0.6251, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3642995357513428, "rewards/margins": 0.20028725266456604, "rewards/rejected": -0.5645867586135864, "step": 560 }, { "epoch": 1.483409238776838, "grad_norm": 9.75, "learning_rate": 9.463586825834938e-08, "logits/chosen": 371.750244140625, "logits/rejected": 374.09259033203125, "logps/chosen": -320.38555908203125, "logps/rejected": -332.42120361328125, "loss": 0.6486, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2637823820114136, "rewards/margins": 0.28345996141433716, "rewards/rejected": -0.5472423434257507, "step": 570 }, { "epoch": 1.509433962264151, "grad_norm": 9.9375, "learning_rate": 8.589471490809472e-08, "logits/chosen": 388.10174560546875, "logits/rejected": 363.66436767578125, "logps/chosen": -293.32745361328125, "logps/rejected": -341.19866943359375, "loss": 0.6028, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.26182278990745544, "rewards/margins": 0.36554405093193054, "rewards/rejected": -0.6273669004440308, "step": 580 }, { "epoch": 1.5354586857514638, "grad_norm": 12.375, "learning_rate": 7.749271123619888e-08, "logits/chosen": 380.56036376953125, "logits/rejected": 354.895263671875, "logps/chosen": -273.80010986328125, "logps/rejected": -324.5169372558594, "loss": 0.631, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1536741703748703, "rewards/margins": 0.3123035430908203, "rewards/rejected": -0.46597766876220703, "step": 590 }, { "epoch": 1.5614834092387768, "grad_norm": 9.5625, "learning_rate": 6.944722131988392e-08, "logits/chosen": 388.1387023925781, "logits/rejected": 355.89056396484375, "logps/chosen": -300.9095153808594, "logps/rejected": -333.97088623046875, "loss": 0.6058, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23438140749931335, "rewards/margins": 0.3118259310722351, "rewards/rejected": -0.5462073087692261, "step": 600 }, { "epoch": 1.5614834092387768, "eval_logits/chosen": 452.652099609375, "eval_logits/rejected": 436.0275573730469, "eval_logps/chosen": -295.8379821777344, "eval_logps/rejected": -334.4803771972656, "eval_loss": 0.6497182250022888, "eval_rewards/accuracies": 0.6139053106307983, "eval_rewards/chosen": -0.3389085829257965, "eval_rewards/margins": 0.23618672788143158, "eval_rewards/rejected": -0.5750953555107117, "eval_runtime": 244.5623, "eval_samples_per_second": 22.105, "eval_steps_per_second": 0.691, "step": 600 }, { "epoch": 1.5875081327260898, "grad_norm": 9.25, "learning_rate": 6.177487244398008e-08, "logits/chosen": 386.1551513671875, "logits/rejected": 376.6158142089844, "logps/chosen": -306.791259765625, "logps/rejected": -328.063232421875, "loss": 0.6288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24376244843006134, "rewards/margins": 0.39431554079055786, "rewards/rejected": -0.638077974319458, "step": 610 }, { "epoch": 1.6135328562134026, "grad_norm": 8.875, "learning_rate": 5.449152073799615e-08, "logits/chosen": 390.0867614746094, "logits/rejected": 356.75006103515625, "logps/chosen": -306.01861572265625, "logps/rejected": -342.045166015625, "loss": 0.6306, "rewards/accuracies": 0.59375, "rewards/chosen": -0.274763286113739, "rewards/margins": 0.3244563043117523, "rewards/rejected": -0.599219560623169, "step": 620 }, { "epoch": 1.6395575797007158, "grad_norm": 10.3125, "learning_rate": 4.761221840690585e-08, "logits/chosen": 384.375244140625, "logits/rejected": 363.5393981933594, "logps/chosen": -304.85076904296875, "logps/rejected": -343.6796875, "loss": 0.6297, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2900192439556122, "rewards/margins": 0.30680161714553833, "rewards/rejected": -0.5968209505081177, "step": 630 }, { "epoch": 1.6655823031880286, "grad_norm": 9.1875, "learning_rate": 4.115118262337128e-08, "logits/chosen": 381.7267150878906, "logits/rejected": 368.2362060546875, "logps/chosen": -302.25628662109375, "logps/rejected": -344.24676513671875, "loss": 0.6238, "rewards/accuracies": 0.625, "rewards/chosen": -0.2637556195259094, "rewards/margins": 0.33825674653053284, "rewards/rejected": -0.6020123362541199, "step": 640 }, { "epoch": 1.6916070266753416, "grad_norm": 9.4375, "learning_rate": 3.5121766145694173e-08, "logits/chosen": 382.5013732910156, "logits/rejected": 357.2362976074219, "logps/chosen": -290.78717041015625, "logps/rejected": -344.69677734375, "loss": 0.628, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.31657862663269043, "rewards/margins": 0.3126802146434784, "rewards/rejected": -0.6292588710784912, "step": 650 }, { "epoch": 1.7176317501626546, "grad_norm": 9.4375, "learning_rate": 2.9536429722216205e-08, "logits/chosen": 376.56842041015625, "logits/rejected": 344.4474182128906, "logps/chosen": -279.5611267089844, "logps/rejected": -347.4361572265625, "loss": 0.6492, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.393665611743927, "rewards/margins": 0.2809843122959137, "rewards/rejected": -0.6746498942375183, "step": 660 }, { "epoch": 1.7436564736499673, "grad_norm": 9.0625, "learning_rate": 2.4406716339200743e-08, "logits/chosen": 373.3739013671875, "logits/rejected": 362.2355041503906, "logps/chosen": -303.25, "logps/rejected": -325.88995361328125, "loss": 0.6358, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.233059361577034, "rewards/margins": 0.3218488097190857, "rewards/rejected": -0.5549081563949585, "step": 670 }, { "epoch": 1.7696811971372806, "grad_norm": 8.0625, "learning_rate": 1.974322736541509e-08, "logits/chosen": 396.20916748046875, "logits/rejected": 368.4210510253906, "logps/chosen": -294.27386474609375, "logps/rejected": -334.74005126953125, "loss": 0.6414, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.34432071447372437, "rewards/margins": 0.23387746512889862, "rewards/rejected": -0.5781981945037842, "step": 680 }, { "epoch": 1.7957059206245933, "grad_norm": 9.0, "learning_rate": 1.5555600642715437e-08, "logits/chosen": 397.7704162597656, "logits/rejected": 368.53717041015625, "logps/chosen": -307.35321044921875, "logps/rejected": -370.9716796875, "loss": 0.6298, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25926095247268677, "rewards/margins": 0.26611703634262085, "rewards/rejected": -0.5253779888153076, "step": 690 }, { "epoch": 1.8217306441119063, "grad_norm": 9.5625, "learning_rate": 1.1852490567913653e-08, "logits/chosen": 377.498291015625, "logits/rejected": 364.7190856933594, "logps/chosen": -297.59246826171875, "logps/rejected": -319.68499755859375, "loss": 0.6368, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3016144633293152, "rewards/margins": 0.32235556840896606, "rewards/rejected": -0.6239700317382812, "step": 700 }, { "epoch": 1.8217306441119063, "eval_logits/chosen": 452.6395263671875, "eval_logits/rejected": 436.0116882324219, "eval_logps/chosen": -295.8665466308594, "eval_logps/rejected": -334.6864013671875, "eval_loss": 0.6448772549629211, "eval_rewards/accuracies": 0.6065088510513306, "eval_rewards/chosen": -0.3403345048427582, "eval_rewards/margins": 0.24506251513957977, "eval_rewards/rejected": -0.5853970646858215, "eval_runtime": 244.6363, "eval_samples_per_second": 22.098, "eval_steps_per_second": 0.691, "step": 700 }, { "epoch": 1.8477553675992193, "grad_norm": 11.625, "learning_rate": 8.641550207089038e-09, "logits/chosen": 388.95428466796875, "logits/rejected": 358.6634826660156, "logps/chosen": -323.3162841796875, "logps/rejected": -371.1590270996094, "loss": 0.6193, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3497847318649292, "rewards/margins": 0.32699352502822876, "rewards/rejected": -0.676778256893158, "step": 710 }, { "epoch": 1.873780091086532, "grad_norm": 10.4375, "learning_rate": 5.929415479310279e-09, "logits/chosen": 371.3935241699219, "logits/rejected": 376.6529235839844, "logps/chosen": -321.6485290527344, "logps/rejected": -330.58721923828125, "loss": 0.6461, "rewards/accuracies": 0.53125, "rewards/chosen": -0.29033127427101135, "rewards/margins": 0.1948806494474411, "rewards/rejected": -0.48521193861961365, "step": 720 }, { "epoch": 1.8998048145738453, "grad_norm": 9.3125, "learning_rate": 3.721691442452768e-09, "logits/chosen": 370.8646240234375, "logits/rejected": 361.8319396972656, "logps/chosen": -303.0124816894531, "logps/rejected": -321.4366760253906, "loss": 0.6354, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2958618998527527, "rewards/margins": 0.2960815727710724, "rewards/rejected": -0.5919433832168579, "step": 730 }, { "epoch": 1.925829538061158, "grad_norm": 9.9375, "learning_rate": 2.0229407094547735e-09, "logits/chosen": 377.24005126953125, "logits/rejected": 357.33294677734375, "logps/chosen": -277.88214111328125, "logps/rejected": -321.13739013671875, "loss": 0.6277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2274915874004364, "rewards/margins": 0.35707369446754456, "rewards/rejected": -0.584565281867981, "step": 740 }, { "epoch": 1.951854261548471, "grad_norm": 10.0625, "learning_rate": 8.366740189520715e-10, "logits/chosen": 353.9988098144531, "logits/rejected": 346.21380615234375, "logps/chosen": -301.86627197265625, "logps/rejected": -304.49468994140625, "loss": 0.6407, "rewards/accuracies": 0.625, "rewards/chosen": -0.36653071641921997, "rewards/margins": 0.22654423117637634, "rewards/rejected": -0.5930749773979187, "step": 750 }, { "epoch": 1.977878985035784, "grad_norm": 9.375, "learning_rate": 1.6534297977804923e-10, "logits/chosen": 381.37811279296875, "logits/rejected": 356.3533020019531, "logps/chosen": -299.4158935546875, "logps/rejected": -340.39569091796875, "loss": 0.6361, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2855472266674042, "rewards/margins": 0.29285091161727905, "rewards/rejected": -0.5783981084823608, "step": 760 }, { "epoch": 1.9986987638256344, "step": 768, "total_flos": 0.0, "train_loss": 0.6547494133313497, "train_runtime": 8032.2795, "train_samples_per_second": 12.243, "train_steps_per_second": 0.096 } ], "logging_steps": 10, "max_steps": 768, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }