{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 500, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010465724751439037, "grad_norm": 26.42055892944336, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.97265625, "logps/chosen": -284.0, "logps/rejected": -294.0, "loss": 0.6958, "rewards/accuracies": 0.23125000298023224, "rewards/chosen": 0.00531005859375, "rewards/margins": 0.0027618408203125, "rewards/rejected": 0.0025177001953125, "step": 10 }, { "epoch": 0.020931449502878074, "grad_norm": 25.812650680541992, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.65234375, "logps/chosen": -382.0, "logps/rejected": -308.0, "loss": 0.6917, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0172119140625, "rewards/margins": 0.01953125, "rewards/rejected": -0.002410888671875, "step": 20 }, { "epoch": 0.03139717425431711, "grad_norm": 28.49739646911621, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.7421875, "logps/chosen": -304.0, "logps/rejected": -260.0, "loss": 0.6961, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.00148773193359375, "rewards/margins": -0.005279541015625, "rewards/rejected": 0.00677490234375, "step": 30 }, { "epoch": 0.04186289900575615, "grad_norm": 25.176631927490234, "learning_rate": 8.333333333333333e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.98046875, "logps/chosen": -340.0, "logps/rejected": -320.0, "loss": 0.6748, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0247802734375, "rewards/margins": 0.0751953125, "rewards/rejected": -0.05029296875, "step": 40 }, { "epoch": 0.052328623757195186, "grad_norm": 23.213665008544922, "learning_rate": 9.999880027023293e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.87109375, "logps/chosen": -322.0, "logps/rejected": -280.0, "loss": 0.6482, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08544921875, "rewards/margins": 0.1982421875, "rewards/rejected": -0.11328125, "step": 50 }, { "epoch": 0.06279434850863422, "grad_norm": 22.0967960357666, "learning_rate": 9.995681577335256e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.984375, "logps/chosen": -320.0, "logps/rejected": -300.0, "loss": 0.6378, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.029541015625, "rewards/margins": 0.1767578125, "rewards/rejected": -0.1474609375, "step": 60 }, { "epoch": 0.07326007326007326, "grad_norm": 22.011857986450195, "learning_rate": 9.985490234976131e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.671875, "logps/chosen": -342.0, "logps/rejected": -264.0, "loss": 0.6117, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09375, "rewards/margins": 0.443359375, "rewards/rejected": -0.349609375, "step": 70 }, { "epoch": 0.0837257980115123, "grad_norm": 27.431547164916992, "learning_rate": 9.969318225629239e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.6875, "logps/chosen": -388.0, "logps/rejected": -342.0, "loss": 0.6463, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0264892578125, "rewards/margins": 0.451171875, "rewards/rejected": -0.423828125, "step": 80 }, { "epoch": 0.09419152276295134, "grad_norm": 24.862520217895508, "learning_rate": 9.947184949473476e-07, "logits/chosen": 0.796875, "logits/rejected": 0.8359375, "logps/chosen": -356.0, "logps/rejected": -298.0, "loss": 0.6146, "rewards/accuracies": 0.65625, "rewards/chosen": 0.055419921875, "rewards/margins": 0.44921875, "rewards/rejected": -0.392578125, "step": 90 }, { "epoch": 0.10465724751439037, "grad_norm": 23.47401237487793, "learning_rate": 9.919116957910565e-07, "logits/chosen": 0.4921875, "logits/rejected": 0.51953125, "logps/chosen": -358.0, "logps/rejected": -298.0, "loss": 0.6125, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0224609375, "rewards/margins": 0.49609375, "rewards/rejected": -0.474609375, "step": 100 }, { "epoch": 0.1151229722658294, "grad_norm": 24.995296478271484, "learning_rate": 9.88514792171362e-07, "logits/chosen": 0.671875, "logits/rejected": 0.6796875, "logps/chosen": -330.0, "logps/rejected": -308.0, "loss": 0.5918, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06494140625, "rewards/margins": 0.341796875, "rewards/rejected": -0.408203125, "step": 110 }, { "epoch": 0.12558869701726844, "grad_norm": 22.512697219848633, "learning_rate": 9.845318590635185e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.62109375, "logps/chosen": -340.0, "logps/rejected": -270.0, "loss": 0.5845, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0174560546875, "rewards/margins": 0.5390625, "rewards/rejected": -0.55859375, "step": 120 }, { "epoch": 0.1360544217687075, "grad_norm": 26.06734848022461, "learning_rate": 9.799676744523238e-07, "logits/chosen": 0.54296875, "logits/rejected": 0.62890625, "logps/chosen": -348.0, "logps/rejected": -290.0, "loss": 0.5964, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0166015625, "rewards/margins": 0.55078125, "rewards/rejected": -0.56640625, "step": 130 }, { "epoch": 0.14652014652014653, "grad_norm": 21.467668533325195, "learning_rate": 9.748277136003789e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.8828125, "logps/chosen": -324.0, "logps/rejected": -308.0, "loss": 0.6083, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.041015625, "rewards/margins": 0.369140625, "rewards/rejected": -0.41015625, "step": 140 }, { "epoch": 0.15698587127158556, "grad_norm": 24.102495193481445, "learning_rate": 9.691181424798824e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.66796875, "logps/chosen": -298.0, "logps/rejected": -276.0, "loss": 0.6033, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.05615234375, "rewards/margins": 0.423828125, "rewards/rejected": -0.3671875, "step": 150 }, { "epoch": 0.1674515960230246, "grad_norm": 22.33710479736328, "learning_rate": 9.628458103758402e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.5234375, "logps/chosen": -348.0, "logps/rejected": -320.0, "loss": 0.5875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00665283203125, "rewards/margins": 0.44921875, "rewards/rejected": -0.455078125, "step": 160 }, { "epoch": 0.17791732077446362, "grad_norm": 21.965810775756836, "learning_rate": 9.560182416695637e-07, "logits/chosen": 0.490234375, "logits/rejected": 0.5, "logps/chosen": -332.0, "logps/rejected": -306.0, "loss": 0.6099, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0615234375, "rewards/margins": 0.28515625, "rewards/rejected": -0.34765625, "step": 170 }, { "epoch": 0.18838304552590268, "grad_norm": 23.698848724365234, "learning_rate": 9.486436268123111e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.6015625, "logps/chosen": -344.0, "logps/rejected": -314.0, "loss": 0.6223, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0267333984375, "rewards/margins": 0.390625, "rewards/rejected": -0.41796875, "step": 180 }, { "epoch": 0.1988487702773417, "grad_norm": 20.521411895751953, "learning_rate": 9.40730812499903e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.60546875, "logps/chosen": -322.0, "logps/rejected": -282.0, "loss": 0.6127, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.005859375, "rewards/margins": 0.482421875, "rewards/rejected": -0.48828125, "step": 190 }, { "epoch": 0.20931449502878074, "grad_norm": 25.220121383666992, "learning_rate": 9.322892910600958e-07, "logits/chosen": 0.73046875, "logits/rejected": 0.9765625, "logps/chosen": -316.0, "logps/rejected": -292.0, "loss": 0.5889, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0478515625, "rewards/margins": 0.52734375, "rewards/rejected": -0.478515625, "step": 200 }, { "epoch": 0.21978021978021978, "grad_norm": 26.859111785888672, "learning_rate": 9.233291890654476e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.498046875, "logps/chosen": -338.0, "logps/rejected": -296.0, "loss": 0.5686, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.04443359375, "rewards/margins": 0.62890625, "rewards/rejected": -0.671875, "step": 210 }, { "epoch": 0.2302459445316588, "grad_norm": 19.048847198486328, "learning_rate": 9.138612551853332e-07, "logits/chosen": 0.427734375, "logits/rejected": 0.51953125, "logps/chosen": -306.0, "logps/rejected": -314.0, "loss": 0.6049, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1962890625, "rewards/margins": 0.458984375, "rewards/rejected": -0.65625, "step": 220 }, { "epoch": 0.24071166928309787, "grad_norm": 29.931427001953125, "learning_rate": 9.03896847291683e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.490234375, "logps/chosen": -368.0, "logps/rejected": -320.0, "loss": 0.5824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06884765625, "rewards/margins": 0.54296875, "rewards/rejected": -0.61328125, "step": 230 }, { "epoch": 0.25117739403453687, "grad_norm": 24.653114318847656, "learning_rate": 8.934479188339137e-07, "logits/chosen": 0.515625, "logits/rejected": 0.384765625, "logps/chosen": -358.0, "logps/rejected": -300.0, "loss": 0.5824, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11669921875, "rewards/margins": 0.53125, "rewards/rejected": -0.6484375, "step": 240 }, { "epoch": 0.2616431187859759, "grad_norm": 27.315216064453125, "learning_rate": 8.825270044993962e-07, "logits/chosen": 0.447265625, "logits/rejected": 0.71875, "logps/chosen": -394.0, "logps/rejected": -350.0, "loss": 0.5854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.056640625, "rewards/margins": 0.76953125, "rewards/rejected": -0.828125, "step": 250 }, { "epoch": 0.272108843537415, "grad_norm": 26.475204467773438, "learning_rate": 8.711472051766605e-07, "logits/chosen": 0.390625, "logits/rejected": 0.466796875, "logps/chosen": -322.0, "logps/rejected": -324.0, "loss": 0.5773, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.162109375, "rewards/margins": 0.59375, "rewards/rejected": -0.7578125, "step": 260 }, { "epoch": 0.282574568288854, "grad_norm": 24.804954528808594, "learning_rate": 8.593221722393789e-07, "logits/chosen": 0.419921875, "logits/rejected": 0.443359375, "logps/chosen": -324.0, "logps/rejected": -290.0, "loss": 0.5661, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.16796875, "rewards/margins": 0.63671875, "rewards/rejected": -0.8046875, "step": 270 }, { "epoch": 0.29304029304029305, "grad_norm": 26.523075103759766, "learning_rate": 8.470660911699782e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.640625, "logps/chosen": -328.0, "logps/rejected": -286.0, "loss": 0.5697, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34375, "rewards/margins": 0.6171875, "rewards/rejected": -0.96484375, "step": 280 }, { "epoch": 0.3035060177917321, "grad_norm": 18.05160140991211, "learning_rate": 8.343936645425276e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.75390625, "logps/chosen": -344.0, "logps/rejected": -312.0, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": -0.63671875, "rewards/margins": 0.59375, "rewards/rejected": -1.2265625, "step": 290 }, { "epoch": 0.3139717425431711, "grad_norm": 30.55604362487793, "learning_rate": 8.213200943853158e-07, "logits/chosen": 0.275390625, "logits/rejected": 0.6015625, "logps/chosen": -312.0, "logps/rejected": -310.0, "loss": 0.6047, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.453125, "rewards/margins": 0.341796875, "rewards/rejected": -0.796875, "step": 300 }, { "epoch": 0.32443746729461015, "grad_norm": 25.680500030517578, "learning_rate": 8.07861063944276e-07, "logits/chosen": 0.421875, "logits/rejected": 0.423828125, "logps/chosen": -394.0, "logps/rejected": -288.0, "loss": 0.5542, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.013427734375, "rewards/margins": 0.859375, "rewards/rejected": -0.84375, "step": 310 }, { "epoch": 0.3349031920460492, "grad_norm": 27.477874755859375, "learning_rate": 7.940327188691341e-07, "logits/chosen": 0.283203125, "logits/rejected": 0.326171875, "logps/chosen": -332.0, "logps/rejected": -294.0, "loss": 0.5681, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2177734375, "rewards/margins": 0.59765625, "rewards/rejected": -0.81640625, "step": 320 }, { "epoch": 0.3453689167974882, "grad_norm": 23.42730712890625, "learning_rate": 7.798516478448514e-07, "logits/chosen": 0.2490234375, "logits/rejected": 0.25390625, "logps/chosen": -356.0, "logps/rejected": -302.0, "loss": 0.5724, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05810546875, "rewards/margins": 0.44921875, "rewards/rejected": -0.5078125, "step": 330 }, { "epoch": 0.35583464154892724, "grad_norm": 24.55620574951172, "learning_rate": 7.653348626915957e-07, "logits/chosen": 0.302734375, "logits/rejected": 0.3125, "logps/chosen": -320.0, "logps/rejected": -298.0, "loss": 0.5743, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1396484375, "rewards/margins": 0.51953125, "rewards/rejected": -0.66015625, "step": 340 }, { "epoch": 0.3663003663003663, "grad_norm": 23.369258880615234, "learning_rate": 7.504997779571132e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.5546875, "logps/chosen": -350.0, "logps/rejected": -302.0, "loss": 0.5759, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.291015625, "rewards/margins": 0.443359375, "rewards/rejected": -0.734375, "step": 350 }, { "epoch": 0.37676609105180536, "grad_norm": 26.217023849487305, "learning_rate": 7.353641900259823e-07, "logits/chosen": 0.3203125, "logits/rejected": 0.2138671875, "logps/chosen": -368.0, "logps/rejected": -302.0, "loss": 0.5576, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.345703125, "rewards/margins": 0.63671875, "rewards/rejected": -0.984375, "step": 360 }, { "epoch": 0.3872318158032444, "grad_norm": 26.125743865966797, "learning_rate": 7.199462557708097e-07, "logits/chosen": 0.265625, "logits/rejected": 0.455078125, "logps/chosen": -332.0, "logps/rejected": -336.0, "loss": 0.5876, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.23828125, "rewards/margins": 0.6015625, "rewards/rejected": -0.83984375, "step": 370 }, { "epoch": 0.3976975405546834, "grad_norm": 25.499277114868164, "learning_rate": 7.042644707709815e-07, "logits/chosen": 0.287109375, "logits/rejected": 0.423828125, "logps/chosen": -312.0, "logps/rejected": -251.0, "loss": 0.5691, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.275390625, "rewards/margins": 0.56640625, "rewards/rejected": -0.83984375, "step": 380 }, { "epoch": 0.40816326530612246, "grad_norm": 24.560373306274414, "learning_rate": 6.883376471250955e-07, "logits/chosen": 0.388671875, "logits/rejected": 0.263671875, "logps/chosen": -326.0, "logps/rejected": -296.0, "loss": 0.59, "rewards/accuracies": 0.65625, "rewards/chosen": -0.244140625, "rewards/margins": 0.50390625, "rewards/rejected": -0.75, "step": 390 }, { "epoch": 0.4186289900575615, "grad_norm": 27.33049774169922, "learning_rate": 6.72184890883692e-07, "logits/chosen": 0.349609375, "logits/rejected": 0.4375, "logps/chosen": -306.0, "logps/rejected": -262.0, "loss": 0.5551, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0361328125, "rewards/margins": 0.546875, "rewards/rejected": -0.58203125, "step": 400 }, { "epoch": 0.4290947148090005, "grad_norm": 24.623563766479492, "learning_rate": 6.558255791293571e-07, "logits/chosen": 0.453125, "logits/rejected": 0.44921875, "logps/chosen": -320.0, "logps/rejected": -282.0, "loss": 0.5582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.07958984375, "rewards/margins": 0.72265625, "rewards/rejected": -0.8046875, "step": 410 }, { "epoch": 0.43956043956043955, "grad_norm": 31.462345123291016, "learning_rate": 6.392793367316904e-07, "logits/chosen": 0.3984375, "logits/rejected": 0.3203125, "logps/chosen": -360.0, "logps/rejected": -316.0, "loss": 0.549, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0059814453125, "rewards/margins": 0.70703125, "rewards/rejected": -0.7109375, "step": 420 }, { "epoch": 0.4500261643118786, "grad_norm": 29.43328285217285, "learning_rate": 6.225660128050247e-07, "logits/chosen": 0.373046875, "logits/rejected": 0.515625, "logps/chosen": -308.0, "logps/rejected": -282.0, "loss": 0.5875, "rewards/accuracies": 0.625, "rewards/chosen": -0.376953125, "rewards/margins": 0.48828125, "rewards/rejected": -0.8671875, "step": 430 }, { "epoch": 0.4604918890633176, "grad_norm": 25.328622817993164, "learning_rate": 6.057056568971383e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.46484375, "logps/chosen": -408.0, "logps/rejected": -318.0, "loss": 0.5225, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.474609375, "rewards/margins": 0.89453125, "rewards/rejected": -1.3671875, "step": 440 }, { "epoch": 0.47095761381475665, "grad_norm": 20.754179000854492, "learning_rate": 5.887184949375242e-07, "logits/chosen": 0.48828125, "logits/rejected": 0.388671875, "logps/chosen": -350.0, "logps/rejected": -310.0, "loss": 0.5658, "rewards/accuracies": 0.6875, "rewards/chosen": -0.50390625, "rewards/margins": 0.7578125, "rewards/rejected": -1.265625, "step": 450 }, { "epoch": 0.48142333856619574, "grad_norm": 25.111303329467773, "learning_rate": 5.716249049740689e-07, "logits/chosen": 0.41796875, "logits/rejected": 0.578125, "logps/chosen": -368.0, "logps/rejected": -336.0, "loss": 0.5623, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1806640625, "rewards/margins": 0.79296875, "rewards/rejected": -0.97265625, "step": 460 }, { "epoch": 0.49188906331763477, "grad_norm": 20.85157585144043, "learning_rate": 5.544453927272492e-07, "logits/chosen": 0.44140625, "logits/rejected": 0.466796875, "logps/chosen": -348.0, "logps/rejected": -330.0, "loss": 0.545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.103515625, "rewards/margins": 0.7578125, "rewards/rejected": -0.86328125, "step": 470 }, { "epoch": 0.5023547880690737, "grad_norm": 27.79718017578125, "learning_rate": 5.372005669911693e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.5546875, "logps/chosen": -346.0, "logps/rejected": -280.0, "loss": 0.6289, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07666015625, "rewards/margins": 0.61328125, "rewards/rejected": -0.69140625, "step": 480 }, { "epoch": 0.5128205128205128, "grad_norm": 25.19495964050293, "learning_rate": 5.199111149109497e-07, "logits/chosen": 0.302734375, "logits/rejected": 0.49609375, "logps/chosen": -330.0, "logps/rejected": -292.0, "loss": 0.5888, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03759765625, "rewards/margins": 0.89453125, "rewards/rejected": -0.93359375, "step": 490 }, { "epoch": 0.5232862375719518, "grad_norm": 23.607749938964844, "learning_rate": 5.025977771661266e-07, "logits/chosen": 0.494140625, "logits/rejected": 0.50390625, "logps/chosen": -306.0, "logps/rejected": -312.0, "loss": 0.592, "rewards/accuracies": 0.65625, "rewards/chosen": -0.330078125, "rewards/margins": 0.4296875, "rewards/rejected": -0.7578125, "step": 500 }, { "epoch": 0.533751962323391, "grad_norm": 28.496578216552734, "learning_rate": 4.852813230898279e-07, "logits/chosen": 0.462890625, "logits/rejected": 0.498046875, "logps/chosen": -374.0, "logps/rejected": -310.0, "loss": 0.599, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.050048828125, "rewards/margins": 0.6953125, "rewards/rejected": -0.64453125, "step": 510 }, { "epoch": 0.54421768707483, "grad_norm": 23.15301513671875, "learning_rate": 4.679825257535794e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.43359375, "logps/chosen": -366.0, "logps/rejected": -334.0, "loss": 0.5337, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0294189453125, "rewards/margins": 0.71484375, "rewards/rejected": -0.68359375, "step": 520 }, { "epoch": 0.554683411826269, "grad_norm": 22.724777221679688, "learning_rate": 4.507221370476223e-07, "logits/chosen": 0.4609375, "logits/rejected": 0.59765625, "logps/chosen": -336.0, "logps/rejected": -328.0, "loss": 0.5767, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02880859375, "rewards/margins": 0.671875, "rewards/rejected": -0.640625, "step": 530 }, { "epoch": 0.565149136577708, "grad_norm": 20.862764358520508, "learning_rate": 4.3352086278664377e-07, "logits/chosen": 0.4921875, "logits/rejected": 0.515625, "logps/chosen": -318.0, "logps/rejected": -270.0, "loss": 0.5311, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.051025390625, "rewards/margins": 0.7109375, "rewards/rejected": -0.66015625, "step": 540 }, { "epoch": 0.5756148613291471, "grad_norm": 23.62275505065918, "learning_rate": 4.1639933787077854e-07, "logits/chosen": 0.46484375, "logits/rejected": 0.609375, "logps/chosen": -332.0, "logps/rejected": -304.0, "loss": 0.5508, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01165771484375, "rewards/margins": 0.6640625, "rewards/rejected": -0.6796875, "step": 550 }, { "epoch": 0.5860805860805861, "grad_norm": 26.077327728271484, "learning_rate": 3.9937810153168016e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.427734375, "logps/chosen": -374.0, "logps/rejected": -334.0, "loss": 0.5848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2197265625, "rewards/margins": 0.609375, "rewards/rejected": -0.828125, "step": 560 }, { "epoch": 0.5965463108320251, "grad_norm": 25.223384857177734, "learning_rate": 3.8247757269335957e-07, "logits/chosen": 0.37890625, "logits/rejected": 0.474609375, "logps/chosen": -334.0, "logps/rejected": -302.0, "loss": 0.5283, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1943359375, "rewards/margins": 0.72265625, "rewards/rejected": -0.91796875, "step": 570 }, { "epoch": 0.6070120355834642, "grad_norm": 23.150033950805664, "learning_rate": 3.657180254773445e-07, "logits/chosen": 0.421875, "logits/rejected": 0.546875, "logps/chosen": -334.0, "logps/rejected": -274.0, "loss": 0.5584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.359375, "rewards/margins": 0.828125, "rewards/rejected": -1.1875, "step": 580 }, { "epoch": 0.6174777603349032, "grad_norm": 26.779611587524414, "learning_rate": 3.4911956488154694e-07, "logits/chosen": 0.48828125, "logits/rejected": 0.625, "logps/chosen": -360.0, "logps/rejected": -310.0, "loss": 0.5908, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4296875, "rewards/margins": 0.73046875, "rewards/rejected": -1.15625, "step": 590 }, { "epoch": 0.6279434850863422, "grad_norm": 30.289024353027344, "learning_rate": 3.327021026620137e-07, "logits/chosen": 0.53125, "logits/rejected": 0.5234375, "logps/chosen": -356.0, "logps/rejected": -328.0, "loss": 0.5481, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4921875, "rewards/margins": 0.734375, "rewards/rejected": -1.2265625, "step": 600 }, { "epoch": 0.6384092098377813, "grad_norm": 26.568483352661133, "learning_rate": 3.16485333446493e-07, "logits/chosen": 0.396484375, "logits/rejected": 0.81640625, "logps/chosen": -328.0, "logps/rejected": -368.0, "loss": 0.5517, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.58984375, "rewards/margins": 0.76953125, "rewards/rejected": -1.359375, "step": 610 }, { "epoch": 0.6488749345892203, "grad_norm": 23.833829879760742, "learning_rate": 3.004887111084704e-07, "logits/chosen": 0.314453125, "logits/rejected": 0.37890625, "logps/chosen": -358.0, "logps/rejected": -312.0, "loss": 0.5447, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.373046875, "rewards/margins": 0.8125, "rewards/rejected": -1.1875, "step": 620 }, { "epoch": 0.6593406593406593, "grad_norm": 24.47187042236328, "learning_rate": 2.8473142543001816e-07, "logits/chosen": 0.45703125, "logits/rejected": 0.455078125, "logps/chosen": -324.0, "logps/rejected": -310.0, "loss": 0.552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.55859375, "rewards/margins": 0.70703125, "rewards/rejected": -1.265625, "step": 630 }, { "epoch": 0.6698063840920984, "grad_norm": 29.288145065307617, "learning_rate": 2.6923237908145226e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.494140625, "logps/chosen": -344.0, "logps/rejected": -324.0, "loss": 0.5347, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.396484375, "rewards/margins": 0.76171875, "rewards/rejected": -1.15625, "step": 640 }, { "epoch": 0.6802721088435374, "grad_norm": 25.19725799560547, "learning_rate": 2.540101649454119e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.5859375, "logps/chosen": -346.0, "logps/rejected": -316.0, "loss": 0.5428, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.302734375, "rewards/margins": 0.8984375, "rewards/rejected": -1.1953125, "step": 650 }, { "epoch": 0.6907378335949764, "grad_norm": 23.4490909576416, "learning_rate": 2.3908304381256603e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.69921875, "logps/chosen": -326.0, "logps/rejected": -298.0, "loss": 0.5694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.515625, "rewards/margins": 0.625, "rewards/rejected": -1.140625, "step": 660 }, { "epoch": 0.7012035583464155, "grad_norm": 25.44082260131836, "learning_rate": 2.2446892247570255e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.55859375, "logps/chosen": -334.0, "logps/rejected": -314.0, "loss": 0.5794, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.216796875, "rewards/margins": 0.859375, "rewards/rejected": -1.078125, "step": 670 }, { "epoch": 0.7116692830978545, "grad_norm": 27.55499839782715, "learning_rate": 2.1018533224847633e-07, "logits/chosen": 0.51171875, "logits/rejected": 0.5, "logps/chosen": -328.0, "logps/rejected": -286.0, "loss": 0.5717, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.171875, "rewards/margins": 0.7890625, "rewards/rejected": -0.9609375, "step": 680 }, { "epoch": 0.7221350078492935, "grad_norm": 24.051036834716797, "learning_rate": 1.9624940793459055e-07, "logits/chosen": 0.3125, "logits/rejected": 0.486328125, "logps/chosen": -358.0, "logps/rejected": -318.0, "loss": 0.5578, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03173828125, "rewards/margins": 0.69921875, "rewards/rejected": -0.734375, "step": 690 }, { "epoch": 0.7326007326007326, "grad_norm": 28.70475196838379, "learning_rate": 1.8267786727263424e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.8046875, "logps/chosen": -292.0, "logps/rejected": -288.0, "loss": 0.5475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1865234375, "rewards/margins": 0.484375, "rewards/rejected": -0.671875, "step": 700 }, { "epoch": 0.7430664573521716, "grad_norm": 21.762775421142578, "learning_rate": 1.694869908812399e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.419921875, "logps/chosen": -320.0, "logps/rejected": -284.0, "loss": 0.5696, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2578125, "rewards/margins": 0.53515625, "rewards/rejected": -0.7890625, "step": 710 }, { "epoch": 0.7535321821036107, "grad_norm": 24.395294189453125, "learning_rate": 1.5669260272861422e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.33984375, "logps/chosen": -302.0, "logps/rejected": -290.0, "loss": 0.5357, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.018310546875, "rewards/margins": 0.75, "rewards/rejected": -0.73046875, "step": 720 }, { "epoch": 0.7639979068550498, "grad_norm": 25.604717254638672, "learning_rate": 1.4431005114987483e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.466796875, "logps/chosen": -360.0, "logps/rejected": -312.0, "loss": 0.5581, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.09716796875, "rewards/margins": 0.6796875, "rewards/rejected": -0.77734375, "step": 730 }, { "epoch": 0.7744636316064888, "grad_norm": 27.11602783203125, "learning_rate": 1.323541904349636e-07, "logits/chosen": 0.5234375, "logits/rejected": 0.58984375, "logps/chosen": -362.0, "logps/rejected": -288.0, "loss": 0.5973, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1513671875, "rewards/margins": 0.74609375, "rewards/rejected": -0.8984375, "step": 740 }, { "epoch": 0.7849293563579278, "grad_norm": 25.82685661315918, "learning_rate": 1.2083936300922237e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.6015625, "logps/chosen": -356.0, "logps/rejected": -344.0, "loss": 0.5742, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11083984375, "rewards/margins": 0.640625, "rewards/rejected": -0.75, "step": 750 }, { "epoch": 0.7953950811093669, "grad_norm": 24.40570068359375, "learning_rate": 1.0977938222801004e-07, "logits/chosen": 0.578125, "logits/rejected": 0.40234375, "logps/chosen": -334.0, "logps/rejected": -300.0, "loss": 0.5828, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19140625, "rewards/margins": 0.734375, "rewards/rejected": -0.92578125, "step": 760 }, { "epoch": 0.8058608058608059, "grad_norm": 22.45295524597168, "learning_rate": 9.918751580599999e-08, "logits/chosen": 0.54296875, "logits/rejected": 0.53515625, "logps/chosen": -364.0, "logps/rejected": -308.0, "loss": 0.5766, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09814453125, "rewards/margins": 0.69921875, "rewards/rejected": -0.796875, "step": 770 }, { "epoch": 0.8163265306122449, "grad_norm": 23.214521408081055, "learning_rate": 8.907646990103495e-08, "logits/chosen": 0.59375, "logits/rejected": 0.5625, "logps/chosen": -306.0, "logps/rejected": -278.0, "loss": 0.5229, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.212890625, "rewards/margins": 0.69140625, "rewards/rejected": -0.90234375, "step": 780 }, { "epoch": 0.826792255363684, "grad_norm": 21.733230590820312, "learning_rate": 7.945837387163424e-08, "logits/chosen": 0.5234375, "logits/rejected": 0.53125, "logps/chosen": -356.0, "logps/rejected": -350.0, "loss": 0.5792, "rewards/accuracies": 0.71875, "rewards/chosen": -0.216796875, "rewards/margins": 0.80078125, "rewards/rejected": -1.015625, "step": 790 }, { "epoch": 0.837257980115123, "grad_norm": 28.44561004638672, "learning_rate": 7.034476572643854e-08, "logits/chosen": 0.609375, "logits/rejected": 0.625, "logps/chosen": -324.0, "logps/rejected": -318.0, "loss": 0.5711, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.220703125, "rewards/margins": 0.625, "rewards/rejected": -0.84765625, "step": 800 }, { "epoch": 0.847723704866562, "grad_norm": 24.274005889892578, "learning_rate": 6.174657828304541e-08, "logits/chosen": 0.46875, "logits/rejected": 0.55078125, "logps/chosen": -324.0, "logps/rejected": -310.0, "loss": 0.5782, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1640625, "rewards/margins": 0.74609375, "rewards/rejected": -0.91015625, "step": 810 }, { "epoch": 0.858189429618001, "grad_norm": 18.96664810180664, "learning_rate": 5.36741260528415e-08, "logits/chosen": 0.423828125, "logits/rejected": 0.310546875, "logps/chosen": -364.0, "logps/rejected": -294.0, "loss": 0.529, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.058349609375, "rewards/margins": 0.97265625, "rewards/rejected": -1.03125, "step": 820 }, { "epoch": 0.8686551543694401, "grad_norm": 21.882705688476562, "learning_rate": 4.613709286756412e-08, "logits/chosen": 0.4921875, "logits/rejected": 0.443359375, "logps/chosen": -326.0, "logps/rejected": -284.0, "loss": 0.5432, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.07568359375, "rewards/margins": 0.85546875, "rewards/rejected": -0.9296875, "step": 830 }, { "epoch": 0.8791208791208791, "grad_norm": 16.712879180908203, "learning_rate": 3.914452026243509e-08, "logits/chosen": 0.482421875, "logits/rejected": 0.4765625, "logps/chosen": -356.0, "logps/rejected": -320.0, "loss": 0.5462, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1796875, "rewards/margins": 0.58984375, "rewards/rejected": -0.76953125, "step": 840 }, { "epoch": 0.8895866038723181, "grad_norm": 23.453189849853516, "learning_rate": 3.270479662980247e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.361328125, "logps/chosen": -328.0, "logps/rejected": -286.0, "loss": 0.5781, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0693359375, "rewards/margins": 0.703125, "rewards/rejected": -0.7734375, "step": 850 }, { "epoch": 0.9000523286237572, "grad_norm": 25.8822021484375, "learning_rate": 2.6825647156302865e-08, "logits/chosen": 0.60546875, "logits/rejected": 0.66015625, "logps/chosen": -356.0, "logps/rejected": -360.0, "loss": 0.5378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2099609375, "rewards/margins": 0.70703125, "rewards/rejected": -0.9140625, "step": 860 }, { "epoch": 0.9105180533751962, "grad_norm": 29.531282424926758, "learning_rate": 2.151412455561441e-08, "logits/chosen": 0.5234375, "logits/rejected": 0.61328125, "logps/chosen": -342.0, "logps/rejected": -298.0, "loss": 0.5749, "rewards/accuracies": 0.65625, "rewards/chosen": -0.224609375, "rewards/margins": 0.75390625, "rewards/rejected": -0.9765625, "step": 870 }, { "epoch": 0.9209837781266352, "grad_norm": 27.597761154174805, "learning_rate": 1.6776600607918356e-08, "logits/chosen": 0.5078125, "logits/rejected": 0.41015625, "logps/chosen": -328.0, "logps/rejected": -278.0, "loss": 0.5578, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2333984375, "rewards/margins": 0.71875, "rewards/rejected": -0.953125, "step": 880 }, { "epoch": 0.9314495028780743, "grad_norm": 22.400959014892578, "learning_rate": 1.2618758516218186e-08, "logits/chosen": 0.384765625, "logits/rejected": 0.443359375, "logps/chosen": -348.0, "logps/rejected": -312.0, "loss": 0.5566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09814453125, "rewards/margins": 0.7734375, "rewards/rejected": -0.87109375, "step": 890 }, { "epoch": 0.9419152276295133, "grad_norm": 23.831453323364258, "learning_rate": 9.045586088686496e-09, "logits/chosen": 0.443359375, "logits/rejected": 0.427734375, "logps/chosen": -372.0, "logps/rejected": -308.0, "loss": 0.5636, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0031890869140625, "rewards/margins": 0.65625, "rewards/rejected": -0.65234375, "step": 900 }, { "epoch": 0.9523809523809523, "grad_norm": 24.923038482666016, "learning_rate": 6.06136975521715e-09, "logits/chosen": 0.3671875, "logits/rejected": 0.40625, "logps/chosen": -304.0, "logps/rejected": -274.0, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1318359375, "rewards/margins": 0.81640625, "rewards/rejected": -0.94921875, "step": 910 }, { "epoch": 0.9628466771323915, "grad_norm": 24.579931259155273, "learning_rate": 3.6696894253614442e-09, "logits/chosen": 0.375, "logits/rejected": 0.3671875, "logps/chosen": -344.0, "logps/rejected": -312.0, "loss": 0.572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0146484375, "rewards/margins": 0.82421875, "rewards/rejected": -0.83984375, "step": 920 }, { "epoch": 0.9733124018838305, "grad_norm": 23.33355712890625, "learning_rate": 1.8734141938160918e-09, "logits/chosen": 0.640625, "logits/rejected": 0.64453125, "logps/chosen": -350.0, "logps/rejected": -324.0, "loss": 0.5611, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0311279296875, "rewards/margins": 0.796875, "rewards/rejected": -0.828125, "step": 930 }, { "epoch": 0.9837781266352695, "grad_norm": 24.21430778503418, "learning_rate": 6.746988986155999e-10, "logits/chosen": 0.50390625, "logits/rejected": 0.443359375, "logps/chosen": -358.0, "logps/rejected": -320.0, "loss": 0.5494, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07275390625, "rewards/margins": 0.77734375, "rewards/rejected": -0.8515625, "step": 940 }, { "epoch": 0.9942438513867086, "grad_norm": 23.800779342651367, "learning_rate": 7.498153615653758e-11, "logits/chosen": 0.515625, "logits/rejected": 0.447265625, "logps/chosen": -338.0, "logps/rejected": -324.0, "loss": 0.5375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15625, "rewards/margins": 0.72265625, "rewards/rejected": -0.87890625, "step": 950 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }