{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3822, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 10.351250771822766, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -12.5625, "logits/rejected": -11.6875, "logps/chosen": -430.0, "logps/rejected": -460.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 9.844286946232794, "learning_rate": 1.3054830287206267e-06, "logits/chosen": -11.3125, "logits/rejected": -11.3125, "logps/chosen": -364.0, "logps/rejected": -290.0, "loss": 0.6938, "rewards/accuracies": 0.2083333283662796, "rewards/chosen": 0.003997802734375, "rewards/margins": 0.0033111572265625, "rewards/rejected": 0.000701904296875, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 8.840907831144664, "learning_rate": 2.6109660574412534e-06, "logits/chosen": -11.0625, "logits/rejected": -11.0625, "logps/chosen": -264.0, "logps/rejected": -256.0, "loss": 0.691, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0019989013671875, "rewards/margins": 0.009521484375, "rewards/rejected": -0.01153564453125, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 10.083186641323694, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -10.375, "logits/rejected": -10.3125, "logps/chosen": -328.0, "logps/rejected": -318.0, "loss": 0.6803, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00970458984375, "rewards/margins": 0.023681640625, "rewards/rejected": -0.01397705078125, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 9.068339095588007, "learning_rate": 5.221932114882507e-06, "logits/chosen": -11.4375, "logits/rejected": -11.0625, "logps/chosen": -336.0, "logps/rejected": -312.0, "loss": 0.6719, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.032470703125, "rewards/margins": 0.030517578125, "rewards/rejected": -0.06298828125, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 9.546615633761498, "learning_rate": 6.527415143603134e-06, "logits/chosen": -12.1875, "logits/rejected": -12.0, "logps/chosen": -316.0, "logps/rejected": -314.0, "loss": 0.6523, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09912109375, "rewards/margins": 0.1376953125, "rewards/rejected": -0.2373046875, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 9.19274633645371, "learning_rate": 7.832898172323761e-06, "logits/chosen": -12.4375, "logits/rejected": -12.125, "logps/chosen": -334.0, "logps/rejected": -328.0, "loss": 0.6243, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3203125, "rewards/margins": 0.1455078125, "rewards/rejected": -0.466796875, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 10.83447468535562, "learning_rate": 9.138381201044387e-06, "logits/chosen": -13.625, "logits/rejected": -13.5625, "logps/chosen": -380.0, "logps/rejected": -340.0, "loss": 0.609, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6796875, "rewards/margins": 0.287109375, "rewards/rejected": -0.96875, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 9.13507791543036, "learning_rate": 1.0443864229765014e-05, "logits/chosen": -13.25, "logits/rejected": -13.25, "logps/chosen": -324.0, "logps/rejected": -320.0, "loss": 0.6258, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5703125, "rewards/margins": 0.283203125, "rewards/rejected": -0.8515625, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 8.742369309742912, "learning_rate": 1.174934725848564e-05, "logits/chosen": -11.875, "logits/rejected": -11.5, "logps/chosen": -296.0, "logps/rejected": -272.0, "loss": 0.6439, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.28515625, "rewards/margins": 0.208984375, "rewards/rejected": -0.494140625, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 11.89346927211031, "learning_rate": 1.3054830287206268e-05, "logits/chosen": -11.875, "logits/rejected": -10.875, "logps/chosen": -356.0, "logps/rejected": -286.0, "loss": 0.6267, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.380859375, "rewards/margins": 0.27734375, "rewards/rejected": -0.65625, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 7.722831940344756, "learning_rate": 1.4360313315926893e-05, "logits/chosen": -11.125, "logits/rejected": -10.625, "logps/chosen": -320.0, "logps/rejected": -292.0, "loss": 0.6081, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6328125, "rewards/margins": 0.443359375, "rewards/rejected": -1.078125, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 10.557403681525182, "learning_rate": 1.5665796344647522e-05, "logits/chosen": -11.6875, "logits/rejected": -11.375, "logps/chosen": -422.0, "logps/rejected": -376.0, "loss": 0.6488, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.578125, "rewards/margins": 0.28515625, "rewards/rejected": -0.86328125, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 8.52113738726324, "learning_rate": 1.6971279373368146e-05, "logits/chosen": -11.9375, "logits/rejected": -11.125, "logps/chosen": -282.0, "logps/rejected": -288.0, "loss": 0.6485, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6796875, "rewards/margins": 0.2890625, "rewards/rejected": -0.96875, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 10.818036746056942, "learning_rate": 1.8276762402088773e-05, "logits/chosen": -12.3125, "logits/rejected": -12.125, "logps/chosen": -362.0, "logps/rejected": -340.0, "loss": 0.5629, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0703125, "rewards/margins": 0.6015625, "rewards/rejected": -1.671875, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 9.701498673533864, "learning_rate": 1.95822454308094e-05, "logits/chosen": -13.125, "logits/rejected": -13.0, "logps/chosen": -358.0, "logps/rejected": -326.0, "loss": 0.6388, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.515625, "rewards/margins": 0.48046875, "rewards/rejected": -1.9921875, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 9.972948367466696, "learning_rate": 2.0887728459530027e-05, "logits/chosen": -13.3125, "logits/rejected": -13.5, "logps/chosen": -366.0, "logps/rejected": -324.0, "loss": 0.6264, "rewards/accuracies": 0.75, "rewards/chosen": -1.328125, "rewards/margins": 0.68359375, "rewards/rejected": -2.015625, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 9.17511762621532, "learning_rate": 2.2193211488250655e-05, "logits/chosen": -13.375, "logits/rejected": -13.25, "logps/chosen": -338.0, "logps/rejected": -346.0, "loss": 0.5969, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3359375, "rewards/margins": 0.1865234375, "rewards/rejected": -1.5234375, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 6.965621587975365, "learning_rate": 2.349869451697128e-05, "logits/chosen": -12.8125, "logits/rejected": -12.75, "logps/chosen": -302.0, "logps/rejected": -314.0, "loss": 0.6303, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.359375, "rewards/margins": 0.359375, "rewards/rejected": -1.71875, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 9.553188606130272, "learning_rate": 2.4804177545691905e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5625, "logps/chosen": -416.0, "logps/rejected": -356.0, "loss": 0.5885, "rewards/accuracies": 0.75, "rewards/chosen": -1.7578125, "rewards/margins": 0.62890625, "rewards/rejected": -2.390625, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 9.121837612837107, "learning_rate": 2.6109660574412536e-05, "logits/chosen": -12.8125, "logits/rejected": -12.5, "logps/chosen": -364.0, "logps/rejected": -360.0, "loss": 0.6058, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.859375, "rewards/margins": 0.38671875, "rewards/rejected": -2.25, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 8.059177993813114, "learning_rate": 2.741514360313316e-05, "logits/chosen": -13.0, "logits/rejected": -12.875, "logps/chosen": -316.0, "logps/rejected": -294.0, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": -1.796875, "rewards/margins": 0.306640625, "rewards/rejected": -2.109375, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 8.500755304845391, "learning_rate": 2.8720626631853787e-05, "logits/chosen": -13.125, "logits/rejected": -12.625, "logps/chosen": -344.0, "logps/rejected": -354.0, "loss": 0.7723, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4375, "rewards/margins": 0.578125, "rewards/rejected": -3.015625, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 9.230475840269714, "learning_rate": 3.0026109660574414e-05, "logits/chosen": -12.375, "logits/rejected": -12.125, "logps/chosen": -386.0, "logps/rejected": -422.0, "loss": 0.6046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.359375, "rewards/margins": 0.63671875, "rewards/rejected": -3.0, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 9.204392189192681, "learning_rate": 3.1331592689295045e-05, "logits/chosen": -10.875, "logits/rejected": -10.0, "logps/chosen": -328.0, "logps/rejected": -336.0, "loss": 0.6512, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9765625, "rewards/margins": 0.65234375, "rewards/rejected": -2.640625, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 9.749349255282869, "learning_rate": 3.263707571801567e-05, "logits/chosen": -11.0625, "logits/rejected": -10.4375, "logps/chosen": -388.0, "logps/rejected": -348.0, "loss": 0.6508, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0625, "rewards/margins": 0.439453125, "rewards/rejected": -2.5, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 10.257472173487507, "learning_rate": 3.394255874673629e-05, "logits/chosen": -10.9375, "logits/rejected": -10.4375, "logps/chosen": -368.0, "logps/rejected": -338.0, "loss": 0.6354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0625, "rewards/margins": 0.68359375, "rewards/rejected": -2.75, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 6.958550084405995, "learning_rate": 3.524804177545692e-05, "logits/chosen": -9.875, "logits/rejected": -9.5625, "logps/chosen": -364.0, "logps/rejected": -362.0, "loss": 0.5584, "rewards/accuracies": 0.625, "rewards/chosen": -1.8125, "rewards/margins": 0.7109375, "rewards/rejected": -2.515625, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 9.847805644914816, "learning_rate": 3.6553524804177546e-05, "logits/chosen": -11.625, "logits/rejected": -11.6875, "logps/chosen": -410.0, "logps/rejected": -382.0, "loss": 0.6862, "rewards/accuracies": 0.6875, "rewards/chosen": -2.75, "rewards/margins": 0.49609375, "rewards/rejected": -3.25, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 8.573440905919824, "learning_rate": 3.7859007832898173e-05, "logits/chosen": -12.875, "logits/rejected": -12.4375, "logps/chosen": -382.0, "logps/rejected": -336.0, "loss": 0.7394, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9609375, "rewards/margins": 0.62109375, "rewards/rejected": -2.578125, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 8.80725505375811, "learning_rate": 3.91644908616188e-05, "logits/chosen": -13.75, "logits/rejected": -13.25, "logps/chosen": -380.0, "logps/rejected": -344.0, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": -2.3125, "rewards/margins": 0.458984375, "rewards/rejected": -2.765625, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 8.968657211210374, "learning_rate": 4.046997389033943e-05, "logits/chosen": -12.25, "logits/rejected": -11.9375, "logps/chosen": -386.0, "logps/rejected": -346.0, "loss": 0.6784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4375, "rewards/margins": 0.578125, "rewards/rejected": -3.015625, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 10.194843430881596, "learning_rate": 4.1775456919060055e-05, "logits/chosen": -13.625, "logits/rejected": -13.0625, "logps/chosen": -442.0, "logps/rejected": -372.0, "loss": 0.5713, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5625, "rewards/margins": 0.76171875, "rewards/rejected": -4.3125, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 10.313402119423792, "learning_rate": 4.308093994778068e-05, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -398.0, "logps/rejected": -364.0, "loss": 0.735, "rewards/accuracies": 0.6875, "rewards/chosen": -3.984375, "rewards/margins": 0.9375, "rewards/rejected": -4.90625, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 9.173592771992633, "learning_rate": 4.438642297650131e-05, "logits/chosen": -11.1875, "logits/rejected": -10.8125, "logps/chosen": -406.0, "logps/rejected": -382.0, "loss": 0.6655, "rewards/accuracies": 0.6875, "rewards/chosen": -4.125, "rewards/margins": 0.8515625, "rewards/rejected": -4.96875, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 8.34069764210929, "learning_rate": 4.5691906005221936e-05, "logits/chosen": -11.375, "logits/rejected": -11.25, "logps/chosen": -332.0, "logps/rejected": -360.0, "loss": 0.6806, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.59375, "rewards/margins": 0.87109375, "rewards/rejected": -5.46875, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 11.451242673237575, "learning_rate": 4.699738903394256e-05, "logits/chosen": -12.8125, "logits/rejected": -12.5, "logps/chosen": -424.0, "logps/rejected": -404.0, "loss": 0.767, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.4375, "rewards/margins": 0.578125, "rewards/rejected": -5.03125, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 8.706088083363465, "learning_rate": 4.830287206266319e-05, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -380.0, "logps/rejected": -362.0, "loss": 0.7948, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.4375, "rewards/margins": 0.55078125, "rewards/rejected": -5.0, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 8.71050562327642, "learning_rate": 4.960835509138381e-05, "logits/chosen": -12.875, "logits/rejected": -12.8125, "logps/chosen": -360.0, "logps/rejected": -402.0, "loss": 0.7885, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.40625, "rewards/margins": 0.79296875, "rewards/rejected": -5.21875, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 9.631202895027702, "learning_rate": 4.9999488859837295e-05, "logits/chosen": -13.375, "logits/rejected": -13.1875, "logps/chosen": -448.0, "logps/rejected": -396.0, "loss": 0.8545, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.25, "rewards/margins": 0.93359375, "rewards/rejected": -5.1875, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 12.175473256479712, "learning_rate": 4.999698536649904e-05, "logits/chosen": -14.5, "logits/rejected": -14.5, "logps/chosen": -462.0, "logps/rejected": -392.0, "loss": 0.7412, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.8125, "rewards/margins": 0.65234375, "rewards/rejected": -5.4375, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 10.472391673320981, "learning_rate": 4.999239584575648e-05, "logits/chosen": -14.75, "logits/rejected": -14.875, "logps/chosen": -446.0, "logps/rejected": -438.0, "loss": 0.7897, "rewards/accuracies": 0.625, "rewards/chosen": -4.84375, "rewards/margins": 0.298828125, "rewards/rejected": -5.15625, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 10.549150549694106, "learning_rate": 4.9985720680610434e-05, "logits/chosen": -14.6875, "logits/rejected": -14.6875, "logps/chosen": -412.0, "logps/rejected": -390.0, "loss": 0.7813, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.625, "rewards/margins": 0.2734375, "rewards/rejected": -4.90625, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 8.610558700109573, "learning_rate": 4.997696042811118e-05, "logits/chosen": -14.625, "logits/rejected": -14.8125, "logps/chosen": -416.0, "logps/rejected": -344.0, "loss": 0.7214, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.375, "rewards/margins": 0.59765625, "rewards/rejected": -4.96875, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 9.047522311209711, "learning_rate": 4.996611581931193e-05, "logits/chosen": -14.125, "logits/rejected": -14.25, "logps/chosen": -440.0, "logps/rejected": -374.0, "loss": 0.7039, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.5625, "rewards/margins": 0.498046875, "rewards/rejected": -5.0625, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 10.207379585431303, "learning_rate": 4.995318775920787e-05, "logits/chosen": -13.5, "logits/rejected": -13.5, "logps/chosen": -384.0, "logps/rejected": -386.0, "loss": 0.7792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.71875, "rewards/margins": 0.130859375, "rewards/rejected": -4.84375, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 9.821006860178837, "learning_rate": 4.9938177326660587e-05, "logits/chosen": -13.375, "logits/rejected": -13.375, "logps/chosen": -478.0, "logps/rejected": -436.0, "loss": 0.6816, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.625, "rewards/margins": 1.046875, "rewards/rejected": -5.65625, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 10.675355174224261, "learning_rate": 4.99210857743081e-05, "logits/chosen": -12.875, "logits/rejected": -12.875, "logps/chosen": -448.0, "logps/rejected": -436.0, "loss": 0.6973, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.875, "rewards/margins": 0.72265625, "rewards/rejected": -6.59375, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 10.26484583598167, "learning_rate": 4.990191452846024e-05, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -406.0, "logps/rejected": -396.0, "loss": 0.6783, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.34375, "rewards/margins": 0.66796875, "rewards/rejected": -6.0, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 9.035937614807631, "learning_rate": 4.988066518897971e-05, "logits/chosen": -13.625, "logits/rejected": -13.3125, "logps/chosen": -464.0, "logps/rejected": -436.0, "loss": 0.6354, "rewards/accuracies": 0.6875, "rewards/chosen": -6.09375, "rewards/margins": 0.921875, "rewards/rejected": -7.0, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 6.87488260728394, "learning_rate": 4.985733952914852e-05, "logits/chosen": -15.5625, "logits/rejected": -15.375, "logps/chosen": -452.0, "logps/rejected": -438.0, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -7.0, "rewards/margins": 0.69921875, "rewards/rejected": -7.71875, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 9.363373210153325, "learning_rate": 4.983193949552002e-05, "logits/chosen": -15.625, "logits/rejected": -15.9375, "logps/chosen": -450.0, "logps/rejected": -420.0, "loss": 0.7343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -6.5, "rewards/margins": 0.458984375, "rewards/rejected": -6.9375, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 7.7281840818721, "learning_rate": 4.980446720775646e-05, "logits/chosen": -14.125, "logits/rejected": -14.25, "logps/chosen": -464.0, "logps/rejected": -474.0, "loss": 0.742, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.375, "rewards/margins": 0.58984375, "rewards/rejected": -5.9375, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 9.271525518013695, "learning_rate": 4.9774924958452084e-05, "logits/chosen": -14.5625, "logits/rejected": -14.75, "logps/chosen": -492.0, "logps/rejected": -408.0, "loss": 0.8633, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.3125, "rewards/margins": 0.265625, "rewards/rejected": -6.59375, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 11.416271168800474, "learning_rate": 4.974331521294186e-05, "logits/chosen": -14.75, "logits/rejected": -14.75, "logps/chosen": -496.0, "logps/rejected": -460.0, "loss": 0.7933, "rewards/accuracies": 0.625, "rewards/chosen": -7.3125, "rewards/margins": 0.5625, "rewards/rejected": -7.90625, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 7.787620742028088, "learning_rate": 4.97096406090957e-05, "logits/chosen": -14.3125, "logits/rejected": -14.125, "logps/chosen": -492.0, "logps/rejected": -468.0, "loss": 0.7545, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.125, "rewards/margins": 0.69921875, "rewards/rejected": -6.8125, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 8.382409522213168, "learning_rate": 4.96739039570983e-05, "logits/chosen": -14.25, "logits/rejected": -14.125, "logps/chosen": -444.0, "logps/rejected": -436.0, "loss": 0.7372, "rewards/accuracies": 0.625, "rewards/chosen": -5.6875, "rewards/margins": 0.609375, "rewards/rejected": -6.3125, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 9.779033425632107, "learning_rate": 4.963610823921471e-05, "logits/chosen": -14.625, "logits/rejected": -14.4375, "logps/chosen": -482.0, "logps/rejected": -434.0, "loss": 0.9479, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.125, "rewards/margins": -0.15625, "rewards/rejected": -6.96875, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 9.733444132976096, "learning_rate": 4.959625660954139e-05, "logits/chosen": -14.5, "logits/rejected": -14.625, "logps/chosen": -444.0, "logps/rejected": -388.0, "loss": 0.744, "rewards/accuracies": 0.625, "rewards/chosen": -6.78125, "rewards/margins": 0.65234375, "rewards/rejected": -7.40625, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 8.051810745094972, "learning_rate": 4.9554352393743045e-05, "logits/chosen": -13.125, "logits/rejected": -13.0625, "logps/chosen": -470.0, "logps/rejected": -452.0, "loss": 0.9418, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.3125, "rewards/margins": 0.64453125, "rewards/rejected": -9.0, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 5.903225463973512, "learning_rate": 4.9510399088775047e-05, "logits/chosen": -12.75, "logits/rejected": -12.75, "logps/chosen": -500.0, "logps/rejected": -502.0, "loss": 0.7086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.125, "rewards/margins": 0.50390625, "rewards/rejected": -8.5625, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 11.634410981967923, "learning_rate": 4.9464400362591644e-05, "logits/chosen": -12.1875, "logits/rejected": -12.125, "logps/chosen": -420.0, "logps/rejected": -396.0, "loss": 0.7854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -6.5625, "rewards/margins": 0.50390625, "rewards/rejected": -7.09375, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 8.791552603599007, "learning_rate": 4.941636005383986e-05, "logits/chosen": -12.375, "logits/rejected": -12.1875, "logps/chosen": -528.0, "logps/rejected": -402.0, "loss": 0.8591, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.59375, "rewards/margins": 0.86328125, "rewards/rejected": -7.4375, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 9.02432053656773, "learning_rate": 4.936628217153914e-05, "logits/chosen": -12.3125, "logits/rejected": -12.3125, "logps/chosen": -386.0, "logps/rejected": -408.0, "loss": 0.7811, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -6.40625, "rewards/margins": 0.4296875, "rewards/rejected": -6.8125, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 9.394934456092882, "learning_rate": 4.931417089474682e-05, "logits/chosen": -13.625, "logits/rejected": -13.6875, "logps/chosen": -462.0, "logps/rejected": -438.0, "loss": 0.6975, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -6.5625, "rewards/margins": 0.796875, "rewards/rejected": -7.375, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 7.439435296736013, "learning_rate": 4.926003057220935e-05, "logits/chosen": -15.3125, "logits/rejected": -15.4375, "logps/chosen": -470.0, "logps/rejected": -434.0, "loss": 0.7351, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.0, "rewards/margins": 0.609375, "rewards/rejected": -7.625, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 9.162980300519617, "learning_rate": 4.92038657219994e-05, "logits/chosen": -16.375, "logits/rejected": -16.5, "logps/chosen": -402.0, "logps/rejected": -392.0, "loss": 0.743, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.53125, "rewards/margins": 0.671875, "rewards/rejected": -7.1875, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 11.704890362462061, "learning_rate": 4.914568103113882e-05, "logits/chosen": -15.6875, "logits/rejected": -16.0, "logps/chosen": -442.0, "logps/rejected": -418.0, "loss": 0.7136, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -6.59375, "rewards/margins": 0.671875, "rewards/rejected": -7.28125, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 7.227855368315395, "learning_rate": 4.908548135520752e-05, "logits/chosen": -14.8125, "logits/rejected": -14.9375, "logps/chosen": -456.0, "logps/rejected": -416.0, "loss": 0.6655, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.9375, "rewards/margins": 0.5390625, "rewards/rejected": -7.5, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 9.146985437020582, "learning_rate": 4.9023271717938224e-05, "logits/chosen": -14.125, "logits/rejected": -14.0625, "logps/chosen": -528.0, "logps/rejected": -486.0, "loss": 0.7974, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -9.0625, "rewards/margins": 0.53515625, "rewards/rejected": -9.5625, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 9.278505608910281, "learning_rate": 4.8959057310797286e-05, "logits/chosen": -14.0625, "logits/rejected": -14.3125, "logps/chosen": -486.0, "logps/rejected": -428.0, "loss": 0.7751, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.90625, "rewards/margins": 0.427734375, "rewards/rejected": -7.3125, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 9.712228745007655, "learning_rate": 4.889284349255141e-05, "logits/chosen": -14.6875, "logits/rejected": -14.6875, "logps/chosen": -506.0, "logps/rejected": -474.0, "loss": 0.6715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.46875, "rewards/margins": 1.1328125, "rewards/rejected": -8.5625, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 8.521168690100787, "learning_rate": 4.8824635788820475e-05, "logits/chosen": -14.375, "logits/rejected": -14.25, "logps/chosen": -448.0, "logps/rejected": -446.0, "loss": 0.8019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.875, "rewards/margins": 0.453125, "rewards/rejected": -8.3125, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 11.10196898625787, "learning_rate": 4.8754439891616434e-05, "logits/chosen": -14.1875, "logits/rejected": -14.1875, "logps/chosen": -472.0, "logps/rejected": -458.0, "loss": 0.8141, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -7.34375, "rewards/margins": 0.60546875, "rewards/rejected": -7.9375, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 10.792601975369012, "learning_rate": 4.8682261658868264e-05, "logits/chosen": -14.25, "logits/rejected": -14.375, "logps/chosen": -450.0, "logps/rejected": -414.0, "loss": 0.6468, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.34375, "rewards/margins": 0.8828125, "rewards/rejected": -8.25, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 55.736192989145714, "learning_rate": 4.860810711393317e-05, "logits/chosen": -14.75, "logits/rejected": -14.5625, "logps/chosen": -480.0, "logps/rejected": -502.0, "loss": 1.0953, "rewards/accuracies": 0.625, "rewards/chosen": -8.5625, "rewards/margins": 0.93359375, "rewards/rejected": -9.5, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 8.990314887586853, "learning_rate": 4.853198244509386e-05, "logits/chosen": -14.625, "logits/rejected": -14.625, "logps/chosen": -450.0, "logps/rejected": -434.0, "loss": 0.7577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.78125, "rewards/margins": 0.6640625, "rewards/rejected": -8.4375, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 7.965840471202787, "learning_rate": 4.845389400504221e-05, "logits/chosen": -14.375, "logits/rejected": -14.25, "logps/chosen": -488.0, "logps/rejected": -490.0, "loss": 0.7896, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.5625, "rewards/margins": 0.484375, "rewards/rejected": -8.0625, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 11.497968950154975, "learning_rate": 4.837384831034905e-05, "logits/chosen": -13.875, "logits/rejected": -13.75, "logps/chosen": -466.0, "logps/rejected": -438.0, "loss": 0.6823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.15625, "rewards/margins": 0.6875, "rewards/rejected": -7.84375, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 9.66137517497451, "learning_rate": 4.829185204092039e-05, "logits/chosen": -14.5625, "logits/rejected": -14.25, "logps/chosen": -416.0, "logps/rejected": -442.0, "loss": 0.7498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.0, "rewards/margins": 1.15625, "rewards/rejected": -8.125, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 10.37574761233887, "learning_rate": 4.8207912039439964e-05, "logits/chosen": -15.4375, "logits/rejected": -15.625, "logps/chosen": -504.0, "logps/rejected": -466.0, "loss": 0.8479, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.875, "rewards/margins": 0.29296875, "rewards/rejected": -7.1875, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 10.351543599017008, "learning_rate": 4.812203531079819e-05, "logits/chosen": -15.25, "logits/rejected": -15.375, "logps/chosen": -476.0, "logps/rejected": -454.0, "loss": 0.7395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.09375, "rewards/margins": 0.75, "rewards/rejected": -7.84375, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 8.622095411540833, "learning_rate": 4.803422902150762e-05, "logits/chosen": -14.375, "logits/rejected": -14.4375, "logps/chosen": -494.0, "logps/rejected": -456.0, "loss": 0.7025, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.6875, "rewards/margins": 0.6875, "rewards/rejected": -9.375, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 9.264123822708422, "learning_rate": 4.794450049910487e-05, "logits/chosen": -12.9375, "logits/rejected": -12.9375, "logps/chosen": -454.0, "logps/rejected": -438.0, "loss": 0.8018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.125, "rewards/margins": 0.369140625, "rewards/rejected": -8.5, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 7.9655618610185845, "learning_rate": 4.785285723153915e-05, "logits/chosen": -11.625, "logits/rejected": -12.0, "logps/chosen": -536.0, "logps/rejected": -472.0, "loss": 0.7406, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -7.53125, "rewards/margins": 0.6953125, "rewards/rejected": -8.25, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 10.645509151597812, "learning_rate": 4.775930686654738e-05, "logits/chosen": -12.0625, "logits/rejected": -12.125, "logps/chosen": -470.0, "logps/rejected": -440.0, "loss": 0.733, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.125, "rewards/margins": 1.0078125, "rewards/rejected": -9.125, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 9.095410131374702, "learning_rate": 4.7663857211015936e-05, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -434.0, "logps/rejected": -468.0, "loss": 0.7619, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.53125, "rewards/margins": 0.5546875, "rewards/rejected": -7.0625, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 7.213273555139093, "learning_rate": 4.756651623032922e-05, "logits/chosen": -12.625, "logits/rejected": -12.75, "logps/chosen": -458.0, "logps/rejected": -412.0, "loss": 0.7308, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.40625, "rewards/margins": 0.3359375, "rewards/rejected": -6.75, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 10.25034124783594, "learning_rate": 4.746729204770491e-05, "logits/chosen": -12.25, "logits/rejected": -12.125, "logps/chosen": -532.0, "logps/rejected": -470.0, "loss": 0.688, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.6875, "rewards/margins": 0.9140625, "rewards/rejected": -9.625, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 9.164696578171599, "learning_rate": 4.736619294351607e-05, "logits/chosen": -11.4375, "logits/rejected": -11.25, "logps/chosen": -556.0, "logps/rejected": -516.0, "loss": 0.7735, "rewards/accuracies": 0.625, "rewards/chosen": -9.8125, "rewards/margins": 0.68359375, "rewards/rejected": -10.5, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 9.176482738323408, "learning_rate": 4.726322735460012e-05, "logits/chosen": -11.75, "logits/rejected": -11.5, "logps/chosen": -476.0, "logps/rejected": -510.0, "loss": 0.761, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -7.875, "rewards/margins": 1.3984375, "rewards/rejected": -9.25, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 12.033840542792827, "learning_rate": 4.715840387355481e-05, "logits/chosen": -12.0625, "logits/rejected": -11.875, "logps/chosen": -452.0, "logps/rejected": -440.0, "loss": 0.84, "rewards/accuracies": 0.625, "rewards/chosen": -8.75, "rewards/margins": 0.83203125, "rewards/rejected": -9.5625, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 9.951231450685535, "learning_rate": 4.705173124802114e-05, "logits/chosen": -12.125, "logits/rejected": -12.0625, "logps/chosen": -528.0, "logps/rejected": -500.0, "loss": 0.6771, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -9.4375, "rewards/margins": 0.451171875, "rewards/rejected": -9.875, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 5.982719210494255, "learning_rate": 4.694321837995337e-05, "logits/chosen": -12.0625, "logits/rejected": -12.125, "logps/chosen": -516.0, "logps/rejected": -482.0, "loss": 0.6545, "rewards/accuracies": 0.6875, "rewards/chosen": -9.0625, "rewards/margins": 0.9296875, "rewards/rejected": -10.0, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 7.732935041516918, "learning_rate": 4.683287432487612e-05, "logits/chosen": -12.4375, "logits/rejected": -12.3125, "logps/chosen": -520.0, "logps/rejected": -482.0, "loss": 0.6515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -9.3125, "rewards/margins": 1.125, "rewards/rejected": -10.4375, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 11.192514507830417, "learning_rate": 4.672070829112868e-05, "logits/chosen": -12.6875, "logits/rejected": -12.8125, "logps/chosen": -498.0, "logps/rejected": -488.0, "loss": 0.6869, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.8125, "rewards/margins": 1.0390625, "rewards/rejected": -9.875, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 9.893002359130774, "learning_rate": 4.6606729639096606e-05, "logits/chosen": -12.25, "logits/rejected": -12.1875, "logps/chosen": -520.0, "logps/rejected": -520.0, "loss": 0.6144, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -10.4375, "rewards/margins": 1.484375, "rewards/rejected": -11.9375, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 11.075478903393348, "learning_rate": 4.6490947880430515e-05, "logits/chosen": -11.5, "logits/rejected": -11.0, "logps/chosen": -584.0, "logps/rejected": -520.0, "loss": 0.7253, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -10.75, "rewards/margins": 1.4453125, "rewards/rejected": -12.1875, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 7.207931474384908, "learning_rate": 4.637337267725239e-05, "logits/chosen": -12.0625, "logits/rejected": -11.8125, "logps/chosen": -584.0, "logps/rejected": -536.0, "loss": 0.6751, "rewards/accuracies": 0.6875, "rewards/chosen": -10.25, "rewards/margins": 1.5078125, "rewards/rejected": -11.75, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 10.223265502840688, "learning_rate": 4.625401384134921e-05, "logits/chosen": -12.75, "logits/rejected": -12.6875, "logps/chosen": -504.0, "logps/rejected": -466.0, "loss": 0.6907, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.5625, "rewards/margins": 1.0, "rewards/rejected": -10.5625, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 9.366756020201832, "learning_rate": 4.613288133335418e-05, "logits/chosen": -12.1875, "logits/rejected": -12.0625, "logps/chosen": -490.0, "logps/rejected": -472.0, "loss": 0.7333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.625, "rewards/margins": 1.0, "rewards/rejected": -10.625, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 9.668457676351762, "learning_rate": 4.600998526191553e-05, "logits/chosen": -12.6875, "logits/rejected": -12.5625, "logps/chosen": -540.0, "logps/rejected": -544.0, "loss": 0.8312, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.9375, "rewards/margins": 0.73828125, "rewards/rejected": -10.6875, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 9.169851106004764, "learning_rate": 4.588533588285287e-05, "logits/chosen": -12.1875, "logits/rejected": -12.1875, "logps/chosen": -540.0, "logps/rejected": -502.0, "loss": 0.6978, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.375, "rewards/margins": 1.3046875, "rewards/rejected": -11.6875, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 6.76805241291535, "learning_rate": 4.5758943598301354e-05, "logits/chosen": -11.4375, "logits/rejected": -11.1875, "logps/chosen": -556.0, "logps/rejected": -492.0, "loss": 0.7095, "rewards/accuracies": 0.625, "rewards/chosen": -10.875, "rewards/margins": 0.66796875, "rewards/rejected": -11.5, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 7.48260799269685, "learning_rate": 4.5630818955843646e-05, "logits/chosen": -12.0, "logits/rejected": -11.8125, "logps/chosen": -506.0, "logps/rejected": -536.0, "loss": 0.7073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -9.9375, "rewards/margins": 1.03125, "rewards/rejected": -10.9375, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 9.671760718133113, "learning_rate": 4.550097264762968e-05, "logits/chosen": -12.625, "logits/rejected": -12.625, "logps/chosen": -492.0, "logps/rejected": -492.0, "loss": 0.8316, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.875, "rewards/margins": 0.71875, "rewards/rejected": -9.5625, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 7.232272637524744, "learning_rate": 4.536941550948439e-05, "logits/chosen": -13.3125, "logits/rejected": -13.25, "logps/chosen": -512.0, "logps/rejected": -482.0, "loss": 0.7443, "rewards/accuracies": 0.6875, "rewards/chosen": -9.25, "rewards/margins": 0.84765625, "rewards/rejected": -10.0625, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 7.573253707315908, "learning_rate": 4.5236158520003444e-05, "logits/chosen": -12.9375, "logits/rejected": -13.0, "logps/chosen": -536.0, "logps/rejected": -502.0, "loss": 0.6025, "rewards/accuracies": 0.6875, "rewards/chosen": -8.1875, "rewards/margins": 1.0390625, "rewards/rejected": -9.25, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 7.888840261037634, "learning_rate": 4.510121279963709e-05, "logits/chosen": -12.1875, "logits/rejected": -12.3125, "logps/chosen": -528.0, "logps/rejected": -496.0, "loss": 0.7204, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -10.0625, "rewards/margins": 1.03125, "rewards/rejected": -11.0625, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 13.762939010968964, "learning_rate": 4.4964589609762095e-05, "logits/chosen": -11.9375, "logits/rejected": -12.125, "logps/chosen": -564.0, "logps/rejected": -494.0, "loss": 0.8346, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -11.4375, "rewards/margins": 0.7734375, "rewards/rejected": -12.1875, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 9.56155800775074, "learning_rate": 4.482630035174205e-05, "logits/chosen": -12.0, "logits/rejected": -11.9375, "logps/chosen": -512.0, "logps/rejected": -492.0, "loss": 0.6975, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.8125, "rewards/margins": 0.828125, "rewards/rejected": -11.625, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 9.650034622926128, "learning_rate": 4.468635656597582e-05, "logits/chosen": -12.0, "logits/rejected": -11.9375, "logps/chosen": -490.0, "logps/rejected": -488.0, "loss": 0.7859, "rewards/accuracies": 0.625, "rewards/chosen": -9.75, "rewards/margins": 0.77734375, "rewards/rejected": -10.5, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 9.009994906294855, "learning_rate": 4.454476993093454e-05, "logits/chosen": -11.4375, "logits/rejected": -11.375, "logps/chosen": -580.0, "logps/rejected": -524.0, "loss": 0.9287, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.375, "rewards/margins": 0.58203125, "rewards/rejected": -10.9375, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 10.042283135138778, "learning_rate": 4.440155226218703e-05, "logits/chosen": -11.375, "logits/rejected": -11.1875, "logps/chosen": -496.0, "logps/rejected": -504.0, "loss": 0.8404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.25, "rewards/margins": 0.66015625, "rewards/rejected": -9.875, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 8.418582913270656, "learning_rate": 4.425671551141376e-05, "logits/chosen": -11.125, "logits/rejected": -10.875, "logps/chosen": -564.0, "logps/rejected": -520.0, "loss": 0.6583, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -10.5625, "rewards/margins": 0.8671875, "rewards/rejected": -11.5, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 9.416691885986982, "learning_rate": 4.411027176540948e-05, "logits/chosen": -10.75, "logits/rejected": -10.5625, "logps/chosen": -524.0, "logps/rejected": -510.0, "loss": 0.6848, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -11.0, "rewards/margins": 1.234375, "rewards/rejected": -12.1875, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 10.865786644213799, "learning_rate": 4.396223324507454e-05, "logits/chosen": -10.8125, "logits/rejected": -10.75, "logps/chosen": -524.0, "logps/rejected": -454.0, "loss": 0.9435, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.4375, "rewards/margins": 0.640625, "rewards/rejected": -12.125, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 12.184578875756303, "learning_rate": 4.3812612304395046e-05, "logits/chosen": -11.25, "logits/rejected": -11.125, "logps/chosen": -556.0, "logps/rejected": -564.0, "loss": 0.7482, "rewards/accuracies": 0.6875, "rewards/chosen": -11.375, "rewards/margins": 0.76953125, "rewards/rejected": -12.125, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 7.183055929786656, "learning_rate": 4.366142142941195e-05, "logits/chosen": -10.6875, "logits/rejected": -10.375, "logps/chosen": -552.0, "logps/rejected": -516.0, "loss": 0.711, "rewards/accuracies": 0.625, "rewards/chosen": -11.125, "rewards/margins": 0.8515625, "rewards/rejected": -12.0, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 7.359506759653038, "learning_rate": 4.350867323717902e-05, "logits/chosen": -10.625, "logits/rejected": -10.375, "logps/chosen": -528.0, "logps/rejected": -510.0, "loss": 0.5868, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.6875, "rewards/margins": 0.7890625, "rewards/rejected": -11.4375, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 6.501759078401181, "learning_rate": 4.335438047470996e-05, "logits/chosen": -10.75, "logits/rejected": -10.5, "logps/chosen": -528.0, "logps/rejected": -536.0, "loss": 0.6786, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -10.625, "rewards/margins": 1.4296875, "rewards/rejected": -12.0625, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 7.620878974669834, "learning_rate": 4.3198556017914635e-05, "logits/chosen": -11.1875, "logits/rejected": -10.875, "logps/chosen": -588.0, "logps/rejected": -524.0, "loss": 0.7357, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -11.125, "rewards/margins": 1.40625, "rewards/rejected": -12.5625, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 9.34056834357026, "learning_rate": 4.30412128705246e-05, "logits/chosen": -11.4375, "logits/rejected": -11.3125, "logps/chosen": -564.0, "logps/rejected": -520.0, "loss": 0.7723, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.375, "rewards/margins": 1.109375, "rewards/rejected": -11.4375, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 7.674870336983743, "learning_rate": 4.28823641630079e-05, "logits/chosen": -11.375, "logits/rejected": -11.1875, "logps/chosen": -568.0, "logps/rejected": -516.0, "loss": 0.7292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.125, "rewards/margins": 1.5078125, "rewards/rejected": -11.625, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 7.40544549406474, "learning_rate": 4.2722023151473294e-05, "logits/chosen": -10.9375, "logits/rejected": -10.9375, "logps/chosen": -486.0, "logps/rejected": -492.0, "loss": 0.7212, "rewards/accuracies": 0.625, "rewards/chosen": -10.75, "rewards/margins": 1.1171875, "rewards/rejected": -11.875, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 9.53123578774151, "learning_rate": 4.256020321656405e-05, "logits/chosen": -10.625, "logits/rejected": -10.375, "logps/chosen": -560.0, "logps/rejected": -552.0, "loss": 0.7306, "rewards/accuracies": 0.625, "rewards/chosen": -12.3125, "rewards/margins": 0.95703125, "rewards/rejected": -13.25, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 6.161183952229396, "learning_rate": 4.239691786234133e-05, "logits/chosen": -11.0, "logits/rejected": -10.9375, "logps/chosen": -544.0, "logps/rejected": -488.0, "loss": 0.6762, "rewards/accuracies": 0.6875, "rewards/chosen": -10.9375, "rewards/margins": 1.140625, "rewards/rejected": -12.125, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 8.172666817720033, "learning_rate": 4.223218071515721e-05, "logits/chosen": -11.0, "logits/rejected": -10.875, "logps/chosen": -544.0, "logps/rejected": -516.0, "loss": 0.6903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.0625, "rewards/margins": 0.99609375, "rewards/rejected": -12.0625, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 12.188827913123875, "learning_rate": 4.206600552251756e-05, "logits/chosen": -11.25, "logits/rejected": -11.125, "logps/chosen": -524.0, "logps/rejected": -486.0, "loss": 0.79, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.5, "rewards/margins": 0.9609375, "rewards/rejected": -11.4375, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 18.826047916539064, "learning_rate": 4.189840615193486e-05, "logits/chosen": -11.6875, "logits/rejected": -11.375, "logps/chosen": -498.0, "logps/rejected": -520.0, "loss": 0.7081, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -9.8125, "rewards/margins": 0.96875, "rewards/rejected": -10.8125, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 6.969800701972501, "learning_rate": 4.172939658977084e-05, "logits/chosen": -11.1875, "logits/rejected": -11.0, "logps/chosen": -528.0, "logps/rejected": -544.0, "loss": 0.7148, "rewards/accuracies": 0.625, "rewards/chosen": -10.625, "rewards/margins": 0.921875, "rewards/rejected": -11.5625, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 14.859556755269326, "learning_rate": 4.155899094006938e-05, "logits/chosen": -10.9375, "logits/rejected": -10.8125, "logps/chosen": -564.0, "logps/rejected": -528.0, "loss": 0.7416, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.875, "rewards/margins": 0.9609375, "rewards/rejected": -11.875, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 6.957089873593148, "learning_rate": 4.138720342337947e-05, "logits/chosen": -11.125, "logits/rejected": -11.0, "logps/chosen": -564.0, "logps/rejected": -552.0, "loss": 0.6879, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -11.75, "rewards/margins": 1.0859375, "rewards/rejected": -12.8125, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 9.707692830662324, "learning_rate": 4.121404837556851e-05, "logits/chosen": -11.9375, "logits/rejected": -11.5625, "logps/chosen": -580.0, "logps/rejected": -616.0, "loss": 0.6995, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.5625, "rewards/margins": 1.546875, "rewards/rejected": -13.0625, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 8.770735737960996, "learning_rate": 4.103954024662594e-05, "logits/chosen": -12.375, "logits/rejected": -12.4375, "logps/chosen": -568.0, "logps/rejected": -536.0, "loss": 0.719, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -11.25, "rewards/margins": 0.640625, "rewards/rejected": -11.875, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 7.1593445389385515, "learning_rate": 4.086369359945743e-05, "logits/chosen": -12.5, "logits/rejected": -12.4375, "logps/chosen": -576.0, "logps/rejected": -564.0, "loss": 0.7039, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.5625, "rewards/margins": 1.0546875, "rewards/rejected": -11.625, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 9.243196425255588, "learning_rate": 4.0686523108669496e-05, "logits/chosen": -12.5625, "logits/rejected": -12.625, "logps/chosen": -600.0, "logps/rejected": -572.0, "loss": 0.8775, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.9375, "rewards/margins": 1.125, "rewards/rejected": -12.0625, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 11.110076974797884, "learning_rate": 4.050804355934498e-05, "logits/chosen": -11.25, "logits/rejected": -11.1875, "logps/chosen": -568.0, "logps/rejected": -524.0, "loss": 0.6187, "rewards/accuracies": 0.625, "rewards/chosen": -11.4375, "rewards/margins": 0.9375, "rewards/rejected": -12.375, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 6.00630186538614, "learning_rate": 4.032826984580914e-05, "logits/chosen": -12.4375, "logits/rejected": -12.125, "logps/chosen": -512.0, "logps/rejected": -484.0, "loss": 0.6801, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.3125, "rewards/margins": 1.078125, "rewards/rejected": -11.375, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 8.977754421118522, "learning_rate": 4.014721697038678e-05, "logits/chosen": -11.6875, "logits/rejected": -11.4375, "logps/chosen": -490.0, "logps/rejected": -482.0, "loss": 0.6641, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.625, "rewards/margins": 1.1015625, "rewards/rejected": -10.75, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 9.068303784048501, "learning_rate": 3.996490004215021e-05, "logits/chosen": -11.8125, "logits/rejected": -11.6875, "logps/chosen": -512.0, "logps/rejected": -498.0, "loss": 0.6787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.3125, "rewards/margins": 1.125, "rewards/rejected": -11.4375, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 10.091388237018714, "learning_rate": 3.978133427565842e-05, "logits/chosen": -11.4375, "logits/rejected": -11.125, "logps/chosen": -524.0, "logps/rejected": -516.0, "loss": 0.7733, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.25, "rewards/margins": 0.84375, "rewards/rejected": -12.125, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 9.639237375339137, "learning_rate": 3.9596534989687416e-05, "logits/chosen": -12.4375, "logits/rejected": -12.1875, "logps/chosen": -506.0, "logps/rejected": -504.0, "loss": 0.7797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.0, "rewards/margins": 1.8671875, "rewards/rejected": -11.875, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 6.806097018429323, "learning_rate": 3.9410517605951824e-05, "logits/chosen": -13.125, "logits/rejected": -12.9375, "logps/chosen": -532.0, "logps/rejected": -488.0, "loss": 0.6814, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.6875, "rewards/margins": 0.486328125, "rewards/rejected": -11.1875, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 9.16956662251956, "learning_rate": 3.922329764781793e-05, "logits/chosen": -12.375, "logits/rejected": -12.125, "logps/chosen": -588.0, "logps/rejected": -544.0, "loss": 0.7421, "rewards/accuracies": 0.6875, "rewards/chosen": -11.0, "rewards/margins": 1.1015625, "rewards/rejected": -12.125, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 7.958218718062474, "learning_rate": 3.903489073900828e-05, "logits/chosen": -11.5625, "logits/rejected": -11.5, "logps/chosen": -552.0, "logps/rejected": -532.0, "loss": 0.7735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.5625, "rewards/margins": 0.8203125, "rewards/rejected": -12.375, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 8.365628327112017, "learning_rate": 3.884531260229778e-05, "logits/chosen": -11.75, "logits/rejected": -11.625, "logps/chosen": -536.0, "logps/rejected": -490.0, "loss": 0.7042, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.1875, "rewards/margins": 0.5390625, "rewards/rejected": -11.75, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 7.681754910048477, "learning_rate": 3.8654579058201704e-05, "logits/chosen": -11.5625, "logits/rejected": -11.3125, "logps/chosen": -516.0, "logps/rejected": -492.0, "loss": 0.6215, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -10.3125, "rewards/margins": 1.109375, "rewards/rejected": -11.4375, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 8.699313776529005, "learning_rate": 3.8462706023655404e-05, "logits/chosen": -10.9375, "logits/rejected": -11.0, "logps/chosen": -516.0, "logps/rejected": -520.0, "loss": 0.7719, "rewards/accuracies": 0.625, "rewards/chosen": -11.5625, "rewards/margins": 0.89453125, "rewards/rejected": -12.4375, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 10.20603489800159, "learning_rate": 3.8269709510686005e-05, "logits/chosen": -10.9375, "logits/rejected": -10.8125, "logps/chosen": -536.0, "logps/rejected": -520.0, "loss": 0.7257, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.1875, "rewards/margins": 0.7265625, "rewards/rejected": -11.9375, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 6.371725891794927, "learning_rate": 3.807560562507624e-05, "logits/chosen": -11.4375, "logits/rejected": -11.0, "logps/chosen": -478.0, "logps/rejected": -516.0, "loss": 0.6052, "rewards/accuracies": 0.6875, "rewards/chosen": -10.8125, "rewards/margins": 1.390625, "rewards/rejected": -12.1875, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 8.044514431743275, "learning_rate": 3.7880410565020366e-05, "logits/chosen": -11.9375, "logits/rejected": -11.625, "logps/chosen": -532.0, "logps/rejected": -552.0, "loss": 0.722, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -11.4375, "rewards/margins": 0.9765625, "rewards/rejected": -12.4375, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 6.3419982155290855, "learning_rate": 3.76841406197724e-05, "logits/chosen": -12.25, "logits/rejected": -12.125, "logps/chosen": -468.0, "logps/rejected": -460.0, "loss": 0.693, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.6875, "rewards/margins": 1.0390625, "rewards/rejected": -10.6875, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 13.822774811604797, "learning_rate": 3.748681216828678e-05, "logits/chosen": -12.1875, "logits/rejected": -11.75, "logps/chosen": -572.0, "logps/rejected": -644.0, "loss": 0.7042, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.4375, "rewards/margins": 0.76953125, "rewards/rejected": -12.25, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 8.295919264278089, "learning_rate": 3.728844167785151e-05, "logits/chosen": -11.875, "logits/rejected": -11.625, "logps/chosen": -584.0, "logps/rejected": -572.0, "loss": 0.7166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.1875, "rewards/margins": 0.8359375, "rewards/rejected": -12.0, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 8.520816598792539, "learning_rate": 3.7089045702713976e-05, "logits/chosen": -12.0625, "logits/rejected": -12.25, "logps/chosen": -584.0, "logps/rejected": -510.0, "loss": 0.7012, "rewards/accuracies": 0.625, "rewards/chosen": -10.3125, "rewards/margins": 1.0390625, "rewards/rejected": -11.375, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 8.142573842126938, "learning_rate": 3.6888640882699425e-05, "logits/chosen": -11.4375, "logits/rejected": -11.3125, "logps/chosen": -584.0, "logps/rejected": -568.0, "loss": 0.7397, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.625, "rewards/margins": 0.89453125, "rewards/rejected": -12.5, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 7.481987186912928, "learning_rate": 3.668724394182239e-05, "logits/chosen": -11.375, "logits/rejected": -10.9375, "logps/chosen": -528.0, "logps/rejected": -524.0, "loss": 0.7039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.9375, "rewards/margins": 1.1484375, "rewards/rejected": -13.0625, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 8.042620411635573, "learning_rate": 3.648487168689104e-05, "logits/chosen": -11.5625, "logits/rejected": -11.375, "logps/chosen": -604.0, "logps/rejected": -536.0, "loss": 0.7544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.25, "rewards/margins": 0.69140625, "rewards/rejected": -11.9375, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 6.330392212337275, "learning_rate": 3.628154100610463e-05, "logits/chosen": -11.875, "logits/rejected": -11.5, "logps/chosen": -528.0, "logps/rejected": -496.0, "loss": 0.7201, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.9375, "rewards/margins": 1.0625, "rewards/rejected": -11.0, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 9.495665962948788, "learning_rate": 3.607726886764415e-05, "logits/chosen": -12.0625, "logits/rejected": -11.875, "logps/chosen": -506.0, "logps/rejected": -544.0, "loss": 0.7175, "rewards/accuracies": 0.6875, "rewards/chosen": -10.3125, "rewards/margins": 0.96484375, "rewards/rejected": -11.25, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 8.88992804946847, "learning_rate": 3.5872072318256375e-05, "logits/chosen": -11.9375, "logits/rejected": -11.75, "logps/chosen": -596.0, "logps/rejected": -552.0, "loss": 0.7063, "rewards/accuracies": 0.6875, "rewards/chosen": -10.75, "rewards/margins": 1.25, "rewards/rejected": -12.0, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 6.549366426211227, "learning_rate": 3.566596848183117e-05, "logits/chosen": -11.375, "logits/rejected": -11.3125, "logps/chosen": -528.0, "logps/rejected": -488.0, "loss": 0.7635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.4375, "rewards/margins": 0.74609375, "rewards/rejected": -12.1875, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 8.965681557356943, "learning_rate": 3.54589745579726e-05, "logits/chosen": -12.0, "logits/rejected": -11.5625, "logps/chosen": -560.0, "logps/rejected": -564.0, "loss": 0.7312, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -11.125, "rewards/margins": 0.96875, "rewards/rejected": -12.0625, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 7.597359211830405, "learning_rate": 3.5251107820563565e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5625, "logps/chosen": -580.0, "logps/rejected": -540.0, "loss": 0.7307, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.875, "rewards/margins": 0.94140625, "rewards/rejected": -10.8125, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 16.306018432981723, "learning_rate": 3.504238561632424e-05, "logits/chosen": -12.75, "logits/rejected": -12.6875, "logps/chosen": -512.0, "logps/rejected": -516.0, "loss": 0.788, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.8125, "rewards/margins": 0.9375, "rewards/rejected": -10.75, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 6.41824790740516, "learning_rate": 3.483282536336451e-05, "logits/chosen": -12.75, "logits/rejected": -12.6875, "logps/chosen": -468.0, "logps/rejected": -464.0, "loss": 0.7286, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.125, "rewards/margins": 1.0546875, "rewards/rejected": -10.125, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 8.997202419771773, "learning_rate": 3.46224445497304e-05, "logits/chosen": -11.8125, "logits/rejected": -11.8125, "logps/chosen": -552.0, "logps/rejected": -532.0, "loss": 0.6613, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -10.1875, "rewards/margins": 1.53125, "rewards/rejected": -11.75, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 8.646781069565186, "learning_rate": 3.441126073194468e-05, "logits/chosen": -12.1875, "logits/rejected": -11.75, "logps/chosen": -504.0, "logps/rejected": -532.0, "loss": 0.6581, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -9.9375, "rewards/margins": 1.546875, "rewards/rejected": -11.5, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 7.113786319184486, "learning_rate": 3.4199291533541735e-05, "logits/chosen": -11.625, "logits/rejected": -11.5, "logps/chosen": -520.0, "logps/rejected": -494.0, "loss": 0.6574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.4375, "rewards/margins": 0.8671875, "rewards/rejected": -11.3125, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 9.146215241563787, "learning_rate": 3.398655464359687e-05, "logits/chosen": -11.9375, "logits/rejected": -11.8125, "logps/chosen": -604.0, "logps/rejected": -484.0, "loss": 1.2266, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -12.8125, "rewards/margins": -1.1484375, "rewards/rejected": -11.6875, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 7.301197069927249, "learning_rate": 3.377306781525015e-05, "logits/chosen": -12.0, "logits/rejected": -11.875, "logps/chosen": -560.0, "logps/rejected": -540.0, "loss": 0.7462, "rewards/accuracies": 0.6875, "rewards/chosen": -10.9375, "rewards/margins": 1.0234375, "rewards/rejected": -11.9375, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 7.026924797144021, "learning_rate": 3.3558848864224876e-05, "logits/chosen": -12.25, "logits/rejected": -12.1875, "logps/chosen": -540.0, "logps/rejected": -506.0, "loss": 0.6618, "rewards/accuracies": 0.6875, "rewards/chosen": -10.4375, "rewards/margins": 0.85546875, "rewards/rejected": -11.3125, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 11.941983652096013, "learning_rate": 3.334391566734082e-05, "logits/chosen": -12.0, "logits/rejected": -12.0, "logps/chosen": -492.0, "logps/rejected": -494.0, "loss": 0.7589, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.375, "rewards/margins": 1.1484375, "rewards/rejected": -10.5, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 8.065484818366723, "learning_rate": 3.3128286161022394e-05, "logits/chosen": -11.6875, "logits/rejected": -11.75, "logps/chosen": -552.0, "logps/rejected": -510.0, "loss": 0.7712, "rewards/accuracies": 0.625, "rewards/chosen": -10.5, "rewards/margins": 0.984375, "rewards/rejected": -11.4375, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 8.045324511262809, "learning_rate": 3.2911978339801855e-05, "logits/chosen": -11.9375, "logits/rejected": -12.0, "logps/chosen": -584.0, "logps/rejected": -568.0, "loss": 0.7152, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.375, "rewards/margins": 1.2890625, "rewards/rejected": -11.6875, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 10.25223828631299, "learning_rate": 3.269501025481763e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -528.0, "logps/rejected": -560.0, "loss": 0.6604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.125, "rewards/margins": 0.89453125, "rewards/rejected": -11.0, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 6.658643984408585, "learning_rate": 3.2477400012307885e-05, "logits/chosen": -12.25, "logits/rejected": -12.25, "logps/chosen": -540.0, "logps/rejected": -540.0, "loss": 0.7548, "rewards/accuracies": 0.6875, "rewards/chosen": -9.75, "rewards/margins": 1.0, "rewards/rejected": -10.75, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 7.583843351290834, "learning_rate": 3.2259165772099644e-05, "logits/chosen": -12.8125, "logits/rejected": -12.9375, "logps/chosen": -540.0, "logps/rejected": -528.0, "loss": 0.6979, "rewards/accuracies": 0.6875, "rewards/chosen": -9.75, "rewards/margins": 1.1171875, "rewards/rejected": -10.875, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 6.695330181972035, "learning_rate": 3.204032574609318e-05, "logits/chosen": -12.25, "logits/rejected": -12.375, "logps/chosen": -576.0, "logps/rejected": -568.0, "loss": 0.7396, "rewards/accuracies": 0.6875, "rewards/chosen": -10.5, "rewards/margins": 1.046875, "rewards/rejected": -11.5, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 7.41109532240638, "learning_rate": 3.1820898196742335e-05, "logits/chosen": -12.0, "logits/rejected": -12.0, "logps/chosen": -520.0, "logps/rejected": -548.0, "loss": 0.7853, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -11.0625, "rewards/margins": 0.8203125, "rewards/rejected": -11.875, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 7.998757902945492, "learning_rate": 3.160090143553049e-05, "logits/chosen": -11.875, "logits/rejected": -11.625, "logps/chosen": -476.0, "logps/rejected": -476.0, "loss": 0.6803, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.375, "rewards/margins": 1.015625, "rewards/rejected": -11.4375, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 7.080219482241533, "learning_rate": 3.1380353821442354e-05, "logits/chosen": -12.0, "logits/rejected": -11.8125, "logps/chosen": -448.0, "logps/rejected": -446.0, "loss": 0.7265, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.75, "rewards/margins": 0.76171875, "rewards/rejected": -10.5, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 10.03679929204167, "learning_rate": 3.1159273759431964e-05, "logits/chosen": -11.5625, "logits/rejected": -11.5, "logps/chosen": -544.0, "logps/rejected": -516.0, "loss": 0.6867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.125, "rewards/margins": 1.0703125, "rewards/rejected": -12.25, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 6.830881800921508, "learning_rate": 3.0937679698886786e-05, "logits/chosen": -12.0625, "logits/rejected": -11.875, "logps/chosen": -560.0, "logps/rejected": -540.0, "loss": 0.5938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.125, "rewards/margins": 1.3125, "rewards/rejected": -12.4375, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 8.529607259118398, "learning_rate": 3.071559013208801e-05, "logits/chosen": -11.75, "logits/rejected": -11.5625, "logps/chosen": -628.0, "logps/rejected": -568.0, "loss": 0.7147, "rewards/accuracies": 0.6875, "rewards/chosen": -11.4375, "rewards/margins": 1.2109375, "rewards/rejected": -12.625, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 6.915342277139341, "learning_rate": 3.0493023592667446e-05, "logits/chosen": -11.4375, "logits/rejected": -11.25, "logps/chosen": -540.0, "logps/rejected": -500.0, "loss": 0.7208, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.625, "rewards/margins": 0.76953125, "rewards/rejected": -11.375, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 9.778948710217877, "learning_rate": 3.0269998654060788e-05, "logits/chosen": -12.125, "logits/rejected": -12.0, "logps/chosen": -502.0, "logps/rejected": -462.0, "loss": 0.6835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.75, "rewards/margins": 0.7421875, "rewards/rejected": -10.5, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 5.655349340296246, "learning_rate": 3.0046533927957677e-05, "logits/chosen": -11.5625, "logits/rejected": -11.3125, "logps/chosen": -520.0, "logps/rejected": -540.0, "loss": 0.6201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.25, "rewards/margins": 1.0, "rewards/rejected": -11.25, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 10.130846611986492, "learning_rate": 2.9822648062748536e-05, "logits/chosen": -12.375, "logits/rejected": -12.375, "logps/chosen": -612.0, "logps/rejected": -600.0, "loss": 0.679, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.5625, "rewards/margins": 1.125, "rewards/rejected": -11.6875, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 7.921271665865128, "learning_rate": 2.959835974196836e-05, "logits/chosen": -11.6875, "logits/rejected": -11.5625, "logps/chosen": -516.0, "logps/rejected": -520.0, "loss": 0.6907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.375, "rewards/margins": 1.0625, "rewards/rejected": -11.4375, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 8.904276645319571, "learning_rate": 2.9373687682737484e-05, "logits/chosen": -12.0625, "logits/rejected": -12.0, "logps/chosen": -588.0, "logps/rejected": -544.0, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.8125, "rewards/margins": 0.83203125, "rewards/rejected": -11.625, "step": 1910 }, { "epoch": 1.0, "eval_logits/chosen": -12.6875, "eval_logits/rejected": -12.5625, "eval_logps/chosen": -536.0, "eval_logps/rejected": -528.0, "eval_loss": 0.7097968459129333, "eval_rewards/accuracies": 0.69921875, "eval_rewards/chosen": -10.8125, "eval_rewards/margins": 1.1328125, "eval_rewards/rejected": -11.9375, "eval_runtime": 47.5241, "eval_samples_per_second": 42.084, "eval_steps_per_second": 0.673, "step": 1911 }, { "epoch": 1.0047095761381475, "grad_norm": 4.728380640009898, "learning_rate": 2.9148650634199674e-05, "logits/chosen": -12.25, "logits/rejected": -12.1875, "logps/chosen": -472.0, "logps/rejected": -516.0, "loss": 0.2767, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.5625, "rewards/margins": 3.671875, "rewards/rejected": -13.25, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 1.024609812555977, "learning_rate": 2.892326737595751e-05, "logits/chosen": -12.25, "logits/rejected": -11.8125, "logps/chosen": -576.0, "logps/rejected": -612.0, "loss": 0.1663, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.375, "rewards/margins": 6.4375, "rewards/rejected": -15.8125, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 3.373687792077228, "learning_rate": 2.869755671650512e-05, "logits/chosen": -11.5625, "logits/rejected": -11.25, "logps/chosen": -540.0, "logps/rejected": -660.0, "loss": 0.1126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.0, "rewards/margins": 6.8125, "rewards/rejected": -16.875, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 2.368031581220674, "learning_rate": 2.847153749165869e-05, "logits/chosen": -9.75, "logits/rejected": -9.375, "logps/chosen": -524.0, "logps/rejected": -596.0, "loss": 0.1652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.75, "rewards/margins": 4.5625, "rewards/rejected": -16.375, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 1.63741968428215, "learning_rate": 2.8245228562984516e-05, "logits/chosen": -10.8125, "logits/rejected": -10.125, "logps/chosen": -536.0, "logps/rejected": -588.0, "loss": 0.1023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.75, "rewards/margins": 6.4375, "rewards/rejected": -17.25, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 2.551100302367526, "learning_rate": 2.8018648816225025e-05, "logits/chosen": -11.5, "logits/rejected": -11.0, "logps/chosen": -556.0, "logps/rejected": -640.0, "loss": 0.1294, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.96875, "rewards/margins": 8.4375, "rewards/rejected": -16.375, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 2.6749422209678073, "learning_rate": 2.7791817159722726e-05, "logits/chosen": -10.6875, "logits/rejected": -10.375, "logps/chosen": -470.0, "logps/rejected": -552.0, "loss": 0.1154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.0625, "rewards/margins": 5.46875, "rewards/rejected": -14.5625, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 2.655504829474548, "learning_rate": 2.756475252284229e-05, "logits/chosen": -11.25, "logits/rejected": -10.9375, "logps/chosen": -556.0, "logps/rejected": -656.0, "loss": 0.1274, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.1875, "rewards/margins": 6.71875, "rewards/rejected": -15.875, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 2.5200245472186418, "learning_rate": 2.7337473854390865e-05, "logits/chosen": -11.5625, "logits/rejected": -11.375, "logps/chosen": -516.0, "logps/rejected": -584.0, "loss": 0.164, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.0, "rewards/margins": 6.25, "rewards/rejected": -15.25, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 4.499576388025018, "learning_rate": 2.7110000121036793e-05, "logits/chosen": -11.5625, "logits/rejected": -11.25, "logps/chosen": -494.0, "logps/rejected": -600.0, "loss": 0.0866, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.9375, "rewards/margins": 7.875, "rewards/rejected": -16.75, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 3.108328168380246, "learning_rate": 2.688235030572679e-05, "logits/chosen": -11.9375, "logits/rejected": -11.75, "logps/chosen": -512.0, "logps/rejected": -584.0, "loss": 0.1356, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.3125, "rewards/margins": 7.0625, "rewards/rejected": -16.375, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 2.7373393989725763, "learning_rate": 2.6654543406101833e-05, "logits/chosen": -12.125, "logits/rejected": -11.6875, "logps/chosen": -520.0, "logps/rejected": -656.0, "loss": 0.1524, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.125, "rewards/margins": 7.28125, "rewards/rejected": -17.375, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 2.6186672086656926, "learning_rate": 2.6426598432911763e-05, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -576.0, "logps/rejected": -668.0, "loss": 0.1237, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.25, "rewards/margins": 7.53125, "rewards/rejected": -16.75, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 1.4275150273517287, "learning_rate": 2.6198534408428804e-05, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -532.0, "logps/rejected": -628.0, "loss": 0.0866, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.5, "rewards/margins": 8.4375, "rewards/rejected": -17.0, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 4.626435577549045, "learning_rate": 2.5970370364860176e-05, "logits/chosen": -12.5625, "logits/rejected": -12.25, "logps/chosen": -478.0, "logps/rejected": -636.0, "loss": 0.1145, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.0625, "rewards/margins": 8.625, "rewards/rejected": -17.625, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 21.753848528929037, "learning_rate": 2.574212534275978e-05, "logits/chosen": -11.75, "logits/rejected": -11.3125, "logps/chosen": -464.0, "logps/rejected": -552.0, "loss": 0.1502, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.875, "rewards/margins": 6.15625, "rewards/rejected": -16.0, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 2.4608658219211628, "learning_rate": 2.5513818389439304e-05, "logits/chosen": -12.1875, "logits/rejected": -11.6875, "logps/chosen": -556.0, "logps/rejected": -656.0, "loss": 0.1992, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.375, "rewards/margins": 7.34375, "rewards/rejected": -17.75, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 2.691085082609704, "learning_rate": 2.5285468557378616e-05, "logits/chosen": -11.75, "logits/rejected": -11.5, "logps/chosen": -476.0, "logps/rejected": -608.0, "loss": 0.1161, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.625, "rewards/margins": 7.5625, "rewards/rejected": -17.25, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 2.6635752913927853, "learning_rate": 2.5057094902635918e-05, "logits/chosen": -11.75, "logits/rejected": -11.4375, "logps/chosen": -528.0, "logps/rejected": -616.0, "loss": 0.0969, "rewards/accuracies": 0.9375, "rewards/chosen": -9.8125, "rewards/margins": 7.4375, "rewards/rejected": -17.25, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 1.736617765978122, "learning_rate": 2.4828716483257418e-05, "logits/chosen": -12.25, "logits/rejected": -11.625, "logps/chosen": -548.0, "logps/rejected": -636.0, "loss": 0.1133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.875, "rewards/margins": 8.125, "rewards/rejected": -18.0, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 1.226880604327556, "learning_rate": 2.460035235768692e-05, "logits/chosen": -12.3125, "logits/rejected": -12.0, "logps/chosen": -500.0, "logps/rejected": -644.0, "loss": 0.1001, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.1875, "rewards/margins": 8.0, "rewards/rejected": -18.25, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 5.772340271804208, "learning_rate": 2.4372021583175446e-05, "logits/chosen": -12.25, "logits/rejected": -12.125, "logps/chosen": -536.0, "logps/rejected": -568.0, "loss": 0.1159, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.875, "rewards/margins": 6.0, "rewards/rejected": -15.875, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 4.404511520229935, "learning_rate": 2.4143743214190778e-05, "logits/chosen": -12.8125, "logits/rejected": -12.4375, "logps/chosen": -510.0, "logps/rejected": -640.0, "loss": 0.1329, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.8125, "rewards/margins": 7.09375, "rewards/rejected": -16.875, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 2.5920340144560723, "learning_rate": 2.3915536300827414e-05, "logits/chosen": -13.375, "logits/rejected": -13.3125, "logps/chosen": -458.0, "logps/rejected": -632.0, "loss": 0.1462, "rewards/accuracies": 0.9375, "rewards/chosen": -9.1875, "rewards/margins": 7.28125, "rewards/rejected": -16.5, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 1.9315423437420907, "learning_rate": 2.3687419887216825e-05, "logits/chosen": -13.25, "logits/rejected": -12.9375, "logps/chosen": -520.0, "logps/rejected": -592.0, "loss": 0.1172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.25, "rewards/margins": 6.5625, "rewards/rejected": -15.8125, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 4.9387988320363085, "learning_rate": 2.345941300993812e-05, "logits/chosen": -13.625, "logits/rejected": -13.25, "logps/chosen": -524.0, "logps/rejected": -628.0, "loss": 0.1262, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.125, "rewards/margins": 7.5, "rewards/rejected": -16.625, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 2.6490159969730476, "learning_rate": 2.3231534696429533e-05, "logits/chosen": -13.125, "logits/rejected": -12.625, "logps/chosen": -500.0, "logps/rejected": -640.0, "loss": 0.0858, "rewards/accuracies": 0.9375, "rewards/chosen": -9.8125, "rewards/margins": 7.40625, "rewards/rejected": -17.25, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 1.8591406460035929, "learning_rate": 2.3003803963400468e-05, "logits/chosen": -13.625, "logits/rejected": -13.25, "logps/chosen": -504.0, "logps/rejected": -632.0, "loss": 0.1339, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.625, "rewards/margins": 7.1875, "rewards/rejected": -16.75, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 3.440279490788567, "learning_rate": 2.2776239815244543e-05, "logits/chosen": -12.75, "logits/rejected": -12.5625, "logps/chosen": -536.0, "logps/rejected": -648.0, "loss": 0.1023, "rewards/accuracies": 0.9375, "rewards/chosen": -8.8125, "rewards/margins": 8.8125, "rewards/rejected": -17.625, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 2.3561174482239413, "learning_rate": 2.2548861242453742e-05, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -528.0, "logps/rejected": -608.0, "loss": 0.0903, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.8125, "rewards/margins": 7.125, "rewards/rejected": -16.875, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 3.129509901044873, "learning_rate": 2.2321687220033523e-05, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -498.0, "logps/rejected": -652.0, "loss": 0.1041, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.5, "rewards/margins": 8.8125, "rewards/rejected": -18.375, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 6.073604137970551, "learning_rate": 2.2094736705919368e-05, "logits/chosen": -13.0625, "logits/rejected": -12.8125, "logps/chosen": -568.0, "logps/rejected": -672.0, "loss": 0.1124, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.75, "rewards/margins": 8.25, "rewards/rejected": -18.0, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 4.837786173506918, "learning_rate": 2.186802863939477e-05, "logits/chosen": -12.9375, "logits/rejected": -12.5, "logps/chosen": -502.0, "logps/rejected": -664.0, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.625, "rewards/margins": 8.625, "rewards/rejected": -19.25, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 8.171951988640048, "learning_rate": 2.1641581939510667e-05, "logits/chosen": -13.4375, "logits/rejected": -13.1875, "logps/chosen": -552.0, "logps/rejected": -672.0, "loss": 0.0996, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.5, "rewards/margins": 7.1875, "rewards/rejected": -17.75, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 1.7116523821577394, "learning_rate": 2.1415415503506653e-05, "logits/chosen": -13.4375, "logits/rejected": -13.1875, "logps/chosen": -548.0, "logps/rejected": -676.0, "loss": 0.09, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.5625, "rewards/margins": 7.9375, "rewards/rejected": -18.5, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 2.8123145023686416, "learning_rate": 2.1189548205233975e-05, "logits/chosen": -13.625, "logits/rejected": -13.25, "logps/chosen": -584.0, "logps/rejected": -728.0, "loss": 0.0911, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.375, "rewards/margins": 9.3125, "rewards/rejected": -19.75, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 1.3758247246151873, "learning_rate": 2.0963998893580487e-05, "logits/chosen": -13.375, "logits/rejected": -13.125, "logps/chosen": -544.0, "logps/rejected": -672.0, "loss": 0.1232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.875, "rewards/margins": 9.0625, "rewards/rejected": -19.0, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 2.8815572728674486, "learning_rate": 2.0738786390897696e-05, "logits/chosen": -13.75, "logits/rejected": -13.4375, "logps/chosen": -504.0, "logps/rejected": -680.0, "loss": 0.0962, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.625, "rewards/margins": 8.875, "rewards/rejected": -18.625, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 2.0943865882237183, "learning_rate": 2.0513929491430006e-05, "logits/chosen": -14.125, "logits/rejected": -13.8125, "logps/chosen": -516.0, "logps/rejected": -660.0, "loss": 0.0908, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.125, "rewards/margins": 8.375, "rewards/rejected": -17.5, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 4.687232667728419, "learning_rate": 2.028944695974633e-05, "logits/chosen": -14.375, "logits/rejected": -13.8125, "logps/chosen": -488.0, "logps/rejected": -596.0, "loss": 0.1219, "rewards/accuracies": 0.9375, "rewards/chosen": -9.5625, "rewards/margins": 7.28125, "rewards/rejected": -16.875, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 6.733912452591703, "learning_rate": 2.006535752917414e-05, "logits/chosen": -14.375, "logits/rejected": -14.0, "logps/chosen": -536.0, "logps/rejected": -628.0, "loss": 0.1353, "rewards/accuracies": 0.9375, "rewards/chosen": -9.75, "rewards/margins": 7.6875, "rewards/rejected": -17.375, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 2.3990026267001263, "learning_rate": 1.9841679900236167e-05, "logits/chosen": -13.6875, "logits/rejected": -13.5625, "logps/chosen": -528.0, "logps/rejected": -624.0, "loss": 0.1295, "rewards/accuracies": 0.9375, "rewards/chosen": -9.25, "rewards/margins": 7.59375, "rewards/rejected": -16.875, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 1.5565379549024234, "learning_rate": 1.9618432739089843e-05, "logits/chosen": -13.8125, "logits/rejected": -13.5625, "logps/chosen": -456.0, "logps/rejected": -568.0, "loss": 0.1056, "rewards/accuracies": 1.0, "rewards/chosen": -8.5, "rewards/margins": 7.28125, "rewards/rejected": -15.8125, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 2.385154086116954, "learning_rate": 1.9395634675969525e-05, "logits/chosen": -13.75, "logits/rejected": -13.3125, "logps/chosen": -504.0, "logps/rejected": -612.0, "loss": 0.1503, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.125, "rewards/margins": 7.46875, "rewards/rejected": -16.5, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 0.7231966984051394, "learning_rate": 1.9173304303631848e-05, "logits/chosen": -13.125, "logits/rejected": -12.875, "logps/chosen": -504.0, "logps/rejected": -608.0, "loss": 0.0951, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.8125, "rewards/margins": 7.46875, "rewards/rejected": -16.25, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 3.595599348043387, "learning_rate": 1.8951460175804104e-05, "logits/chosen": -13.625, "logits/rejected": -13.3125, "logps/chosen": -548.0, "logps/rejected": -648.0, "loss": 0.1048, "rewards/accuracies": 0.9375, "rewards/chosen": -9.8125, "rewards/margins": 7.875, "rewards/rejected": -17.625, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 2.0288579413034777, "learning_rate": 1.87301208056359e-05, "logits/chosen": -12.625, "logits/rejected": -12.375, "logps/chosen": -512.0, "logps/rejected": -616.0, "loss": 0.104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.375, "rewards/margins": 7.90625, "rewards/rejected": -17.25, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 2.3526669326687464, "learning_rate": 1.8509304664154255e-05, "logits/chosen": -13.0625, "logits/rejected": -12.625, "logps/chosen": -604.0, "logps/rejected": -716.0, "loss": 0.1063, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.375, "rewards/margins": 7.84375, "rewards/rejected": -18.25, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 3.112985785125725, "learning_rate": 1.8289030178722132e-05, "logits/chosen": -13.0, "logits/rejected": -12.5, "logps/chosen": -540.0, "logps/rejected": -676.0, "loss": 0.1041, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.8125, "rewards/margins": 9.625, "rewards/rejected": -18.5, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 1.2581184168231971, "learning_rate": 1.8069315731500666e-05, "logits/chosen": -13.75, "logits/rejected": -13.3125, "logps/chosen": -576.0, "logps/rejected": -692.0, "loss": 0.1438, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.3125, "rewards/margins": 8.5, "rewards/rejected": -17.75, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 4.762221728293621, "learning_rate": 1.7850179657915183e-05, "logits/chosen": -13.25, "logits/rejected": -12.875, "logps/chosen": -600.0, "logps/rejected": -616.0, "loss": 0.1127, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.0, "rewards/margins": 7.53125, "rewards/rejected": -17.5, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 3.447175858247355, "learning_rate": 1.7631640245125015e-05, "logits/chosen": -12.625, "logits/rejected": -12.125, "logps/chosen": -548.0, "logps/rejected": -624.0, "loss": 0.1399, "rewards/accuracies": 0.9375, "rewards/chosen": -10.8125, "rewards/margins": 7.4375, "rewards/rejected": -18.25, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 4.061321817953982, "learning_rate": 1.7413715730497494e-05, "logits/chosen": -13.0625, "logits/rejected": -12.875, "logps/chosen": -516.0, "logps/rejected": -668.0, "loss": 0.0983, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.5, "rewards/margins": 9.0625, "rewards/rejected": -18.625, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 4.489903011834727, "learning_rate": 1.7196424300085978e-05, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -520.0, "logps/rejected": -604.0, "loss": 0.0957, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.5625, "rewards/margins": 6.75, "rewards/rejected": -16.375, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 3.4346300831848433, "learning_rate": 1.6979784087112188e-05, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -468.0, "logps/rejected": -572.0, "loss": 0.1555, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.4375, "rewards/margins": 7.03125, "rewards/rejected": -16.5, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 2.4661294584896933, "learning_rate": 1.6763813170453044e-05, "logits/chosen": -13.0, "logits/rejected": -12.5625, "logps/chosen": -474.0, "logps/rejected": -608.0, "loss": 0.0976, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.0, "rewards/margins": 7.09375, "rewards/rejected": -16.0, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 3.149223335476965, "learning_rate": 1.6548529573131876e-05, "logits/chosen": -13.125, "logits/rejected": -12.625, "logps/chosen": -510.0, "logps/rejected": -608.0, "loss": 0.1061, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.625, "rewards/margins": 7.21875, "rewards/rejected": -16.875, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 2.862506339722175, "learning_rate": 1.6333951260814413e-05, "logits/chosen": -13.25, "logits/rejected": -12.875, "logps/chosen": -592.0, "logps/rejected": -640.0, "loss": 0.0693, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.3125, "rewards/margins": 8.0625, "rewards/rejected": -17.375, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 2.7248445532753016, "learning_rate": 1.6120096140309572e-05, "logits/chosen": -13.125, "logits/rejected": -12.8125, "logps/chosen": -460.0, "logps/rejected": -592.0, "loss": 0.1068, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.875, "rewards/margins": 7.25, "rewards/rejected": -16.125, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 1.4186556225169547, "learning_rate": 1.5906982058075038e-05, "logits/chosen": -12.6875, "logits/rejected": -12.3125, "logps/chosen": -478.0, "logps/rejected": -656.0, "loss": 0.0886, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.625, "rewards/margins": 7.5, "rewards/rejected": -17.125, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 2.9852405667865174, "learning_rate": 1.569462679872801e-05, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -528.0, "logps/rejected": -600.0, "loss": 0.1124, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.125, "rewards/margins": 6.9375, "rewards/rejected": -17.0, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 3.666318609283611, "learning_rate": 1.5483048083561036e-05, "logits/chosen": -13.5625, "logits/rejected": -13.0, "logps/chosen": -540.0, "logps/rejected": -676.0, "loss": 0.1064, "rewards/accuracies": 0.9375, "rewards/chosen": -9.1875, "rewards/margins": 9.3125, "rewards/rejected": -18.5, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 2.9887842389088215, "learning_rate": 1.527226356906314e-05, "logits/chosen": -12.9375, "logits/rejected": -12.625, "logps/chosen": -458.0, "logps/rejected": -588.0, "loss": 0.1584, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.0625, "rewards/margins": 7.3125, "rewards/rejected": -16.375, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 4.22908382077193, "learning_rate": 1.5062290845446403e-05, "logits/chosen": -12.625, "logits/rejected": -11.9375, "logps/chosen": -540.0, "logps/rejected": -636.0, "loss": 0.1073, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.75, "rewards/margins": 7.25, "rewards/rejected": -18.0, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 2.546115179849579, "learning_rate": 1.4853147435177992e-05, "logits/chosen": -12.625, "logits/rejected": -12.0625, "logps/chosen": -524.0, "logps/rejected": -632.0, "loss": 0.1146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.375, "rewards/margins": 8.75, "rewards/rejected": -18.125, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 2.964138623891713, "learning_rate": 1.4644850791517933e-05, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -528.0, "logps/rejected": -640.0, "loss": 0.0923, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.125, "rewards/margins": 8.1875, "rewards/rejected": -17.25, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 2.833859042991735, "learning_rate": 1.4437418297062589e-05, "logits/chosen": -12.1875, "logits/rejected": -12.0625, "logps/chosen": -504.0, "logps/rejected": -624.0, "loss": 0.1154, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.0, "rewards/margins": 7.875, "rewards/rejected": -16.875, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 4.611332803169445, "learning_rate": 1.4230867262294045e-05, "logits/chosen": -13.0, "logits/rejected": -12.5625, "logps/chosen": -532.0, "logps/rejected": -700.0, "loss": 0.1234, "rewards/accuracies": 0.9375, "rewards/chosen": -8.75, "rewards/margins": 8.75, "rewards/rejected": -17.5, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 2.585933332197896, "learning_rate": 1.4025214924135616e-05, "logits/chosen": -12.5, "logits/rejected": -12.0, "logps/chosen": -464.0, "logps/rejected": -584.0, "loss": 0.1002, "rewards/accuracies": 0.9375, "rewards/chosen": -9.0625, "rewards/margins": 6.875, "rewards/rejected": -15.9375, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 5.004771837552434, "learning_rate": 1.3820478444513288e-05, "logits/chosen": -12.9375, "logits/rejected": -12.25, "logps/chosen": -540.0, "logps/rejected": -636.0, "loss": 0.1361, "rewards/accuracies": 0.9375, "rewards/chosen": -9.3125, "rewards/margins": 7.71875, "rewards/rejected": -17.0, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 3.3721942115741483, "learning_rate": 1.3616674908923585e-05, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -502.0, "logps/rejected": -604.0, "loss": 0.1074, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.0625, "rewards/margins": 7.75, "rewards/rejected": -16.875, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 1.926096891230754, "learning_rate": 1.3413821325007834e-05, "logits/chosen": -12.375, "logits/rejected": -12.125, "logps/chosen": -460.0, "logps/rejected": -608.0, "loss": 0.0734, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.4375, "rewards/margins": 7.25, "rewards/rejected": -16.75, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 2.6811569703958438, "learning_rate": 1.321193462113272e-05, "logits/chosen": -12.125, "logits/rejected": -11.5, "logps/chosen": -500.0, "logps/rejected": -584.0, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.3125, "rewards/margins": 6.65625, "rewards/rejected": -16.0, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 1.5488117167812219, "learning_rate": 1.3011031644977716e-05, "logits/chosen": -12.5, "logits/rejected": -12.0, "logps/chosen": -470.0, "logps/rejected": -636.0, "loss": 0.1135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.5, "rewards/margins": 7.875, "rewards/rejected": -17.375, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 5.7062514391226555, "learning_rate": 1.2811129162129065e-05, "logits/chosen": -12.375, "logits/rejected": -12.125, "logps/chosen": -548.0, "logps/rejected": -740.0, "loss": 0.0949, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.375, "rewards/margins": 9.0625, "rewards/rejected": -18.5, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 1.7643119117081274, "learning_rate": 1.261224385468066e-05, "logits/chosen": -12.3125, "logits/rejected": -11.3125, "logps/chosen": -528.0, "logps/rejected": -604.0, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.25, "rewards/margins": 7.0, "rewards/rejected": -17.25, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 4.394449609840489, "learning_rate": 1.2414392319841957e-05, "logits/chosen": -12.3125, "logits/rejected": -11.875, "logps/chosen": -548.0, "logps/rejected": -724.0, "loss": 0.1152, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.1875, "rewards/margins": 11.1875, "rewards/rejected": -20.375, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 5.0757669192975685, "learning_rate": 1.2217591068552894e-05, "logits/chosen": -12.625, "logits/rejected": -12.1875, "logps/chosen": -456.0, "logps/rejected": -632.0, "loss": 0.0827, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.25, "rewards/margins": 7.90625, "rewards/rejected": -17.125, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 1.2128787751321914, "learning_rate": 1.2021856524105992e-05, "logits/chosen": -13.0, "logits/rejected": -12.375, "logps/chosen": -556.0, "logps/rejected": -652.0, "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.5, "rewards/margins": 8.75, "rewards/rejected": -18.25, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 4.371148246096117, "learning_rate": 1.1827205020775881e-05, "logits/chosen": -12.5, "logits/rejected": -12.125, "logps/chosen": -528.0, "logps/rejected": -624.0, "loss": 0.1347, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.25, "rewards/margins": 8.0, "rewards/rejected": -17.25, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 1.6427572152583667, "learning_rate": 1.163365280245615e-05, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -480.0, "logps/rejected": -592.0, "loss": 0.1309, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.5, "rewards/margins": 7.34375, "rewards/rejected": -15.875, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 6.792621163838377, "learning_rate": 1.1441216021303777e-05, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -494.0, "logps/rejected": -608.0, "loss": 0.1526, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.1875, "rewards/margins": 7.3125, "rewards/rejected": -16.5, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 7.768597274251902, "learning_rate": 1.1249910736391203e-05, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -496.0, "logps/rejected": -616.0, "loss": 0.1477, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.0625, "rewards/margins": 8.0, "rewards/rejected": -17.125, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 1.5634511255756072, "learning_rate": 1.1059752912366217e-05, "logits/chosen": -13.1875, "logits/rejected": -12.75, "logps/chosen": -462.0, "logps/rejected": -660.0, "loss": 0.1114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.375, "rewards/margins": 8.75, "rewards/rejected": -17.125, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 5.473079860013893, "learning_rate": 1.0870758418119659e-05, "logits/chosen": -13.1875, "logits/rejected": -12.6875, "logps/chosen": -540.0, "logps/rejected": -584.0, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.3125, "rewards/margins": 7.875, "rewards/rejected": -17.25, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 3.4164753814709665, "learning_rate": 1.0682943025461136e-05, "logits/chosen": -13.3125, "logits/rejected": -13.0, "logps/chosen": -544.0, "logps/rejected": -628.0, "loss": 0.1494, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.25, "rewards/margins": 7.4375, "rewards/rejected": -16.625, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 4.0004197012792835, "learning_rate": 1.049632240780288e-05, "logits/chosen": -12.625, "logits/rejected": -11.875, "logps/chosen": -486.0, "logps/rejected": -556.0, "loss": 0.1041, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.1875, "rewards/margins": 6.46875, "rewards/rejected": -15.6875, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 2.937844502391212, "learning_rate": 1.0310912138851769e-05, "logits/chosen": -12.9375, "logits/rejected": -12.4375, "logps/chosen": -520.0, "logps/rejected": -688.0, "loss": 0.0971, "rewards/accuracies": 0.9375, "rewards/chosen": -9.5625, "rewards/margins": 8.0625, "rewards/rejected": -17.625, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 4.339889798014685, "learning_rate": 1.0126727691309638e-05, "logits/chosen": -13.125, "logits/rejected": -12.9375, "logps/chosen": -552.0, "logps/rejected": -704.0, "loss": 0.1164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.375, "rewards/margins": 8.75, "rewards/rejected": -18.125, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 1.8102160895635062, "learning_rate": 9.943784435582166e-06, "logits/chosen": -12.9375, "logits/rejected": -12.25, "logps/chosen": -516.0, "logps/rejected": -660.0, "loss": 0.0842, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.3125, "rewards/margins": 8.1875, "rewards/rejected": -17.5, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 1.7389222823704746, "learning_rate": 9.76209763849609e-06, "logits/chosen": -12.5625, "logits/rejected": -11.9375, "logps/chosen": -478.0, "logps/rejected": -620.0, "loss": 0.0995, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.0, "rewards/margins": 8.25, "rewards/rejected": -17.25, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 1.3162427246282047, "learning_rate": 9.581682462025215e-06, "logits/chosen": -12.9375, "logits/rejected": -12.875, "logps/chosen": -494.0, "logps/rejected": -624.0, "loss": 0.0881, "rewards/accuracies": 0.9375, "rewards/chosen": -9.0625, "rewards/margins": 7.9375, "rewards/rejected": -17.0, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 2.015607418663913, "learning_rate": 9.40255396202518e-06, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -568.0, "logps/rejected": -660.0, "loss": 0.1589, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.4375, "rewards/margins": 7.84375, "rewards/rejected": -17.25, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 1.0089278154821486, "learning_rate": 9.22472708697692e-06, "logits/chosen": -12.9375, "logits/rejected": -12.3125, "logps/chosen": -516.0, "logps/rejected": -624.0, "loss": 0.1199, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.9375, "rewards/margins": 8.3125, "rewards/rejected": -17.25, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 7.8442767175126145, "learning_rate": 9.048216676739295e-06, "logits/chosen": -13.0625, "logits/rejected": -12.875, "logps/chosen": -528.0, "logps/rejected": -716.0, "loss": 0.1101, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.4375, "rewards/margins": 10.375, "rewards/rejected": -18.875, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 0.8512762475654531, "learning_rate": 8.87303746131066e-06, "logits/chosen": -12.75, "logits/rejected": -12.125, "logps/chosen": -536.0, "logps/rejected": -672.0, "loss": 0.1099, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.375, "rewards/margins": 8.5625, "rewards/rejected": -18.0, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 1.5593140512431443, "learning_rate": 8.699204059599578e-06, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -536.0, "logps/rejected": -660.0, "loss": 0.1107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.5, "rewards/margins": 8.625, "rewards/rejected": -18.125, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 3.0135057825122633, "learning_rate": 8.526730978204933e-06, "logits/chosen": -12.5, "logits/rejected": -12.3125, "logps/chosen": -572.0, "logps/rejected": -680.0, "loss": 0.097, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.625, "rewards/margins": 9.3125, "rewards/rejected": -18.875, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 4.884151124038764, "learning_rate": 8.35563261020529e-06, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -548.0, "logps/rejected": -728.0, "loss": 0.0774, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.9375, "rewards/margins": 10.75, "rewards/rejected": -19.75, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 5.5540339317729295, "learning_rate": 8.185923233957802e-06, "logits/chosen": -12.9375, "logits/rejected": -12.125, "logps/chosen": -486.0, "logps/rejected": -652.0, "loss": 0.0961, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.1875, "rewards/margins": 9.0, "rewards/rejected": -18.125, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 3.6558685861938636, "learning_rate": 8.017617011906618e-06, "logits/chosen": -12.75, "logits/rejected": -12.3125, "logps/chosen": -548.0, "logps/rejected": -704.0, "loss": 0.08, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.75, "rewards/margins": 7.8125, "rewards/rejected": -17.5, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 3.6345000092419517, "learning_rate": 7.850727989401064e-06, "logits/chosen": -13.0, "logits/rejected": -12.75, "logps/chosen": -568.0, "logps/rejected": -740.0, "loss": 0.0925, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.625, "rewards/margins": 8.8125, "rewards/rejected": -18.375, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 1.68599240926016, "learning_rate": 7.685270093523534e-06, "logits/chosen": -13.125, "logits/rejected": -12.625, "logps/chosen": -532.0, "logps/rejected": -704.0, "loss": 0.0917, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.0, "rewards/margins": 10.125, "rewards/rejected": -19.125, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 4.5784671678935895, "learning_rate": 7.521257131927212e-06, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -508.0, "logps/rejected": -572.0, "loss": 0.1385, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.0, "rewards/margins": 7.28125, "rewards/rejected": -17.25, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 3.1832934865765763, "learning_rate": 7.358702791683869e-06, "logits/chosen": -13.25, "logits/rejected": -12.875, "logps/chosen": -478.0, "logps/rejected": -600.0, "loss": 0.0871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.0625, "rewards/margins": 8.0625, "rewards/rejected": -17.125, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 5.1042748038245565, "learning_rate": 7.197620638141633e-06, "logits/chosen": -13.3125, "logits/rejected": -13.1875, "logps/chosen": -494.0, "logps/rejected": -616.0, "loss": 0.0872, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.625, "rewards/margins": 8.4375, "rewards/rejected": -17.125, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 4.261194324303468, "learning_rate": 7.038024113792921e-06, "logits/chosen": -13.5, "logits/rejected": -12.875, "logps/chosen": -544.0, "logps/rejected": -660.0, "loss": 0.0934, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.75, "rewards/margins": 8.75, "rewards/rejected": -17.5, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 5.838668015412529, "learning_rate": 6.879926537152695e-06, "logits/chosen": -12.9375, "logits/rejected": -12.375, "logps/chosen": -478.0, "logps/rejected": -684.0, "loss": 0.1091, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.625, "rewards/margins": 9.75, "rewards/rejected": -18.375, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 6.215751805578572, "learning_rate": 6.723341101646993e-06, "logits/chosen": -13.5, "logits/rejected": -13.1875, "logps/chosen": -544.0, "logps/rejected": -680.0, "loss": 0.0991, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.375, "rewards/margins": 9.3125, "rewards/rejected": -17.75, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 6.288421625487859, "learning_rate": 6.568280874511904e-06, "logits/chosen": -13.375, "logits/rejected": -13.0, "logps/chosen": -544.0, "logps/rejected": -704.0, "loss": 0.0995, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.75, "rewards/margins": 9.1875, "rewards/rejected": -17.875, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 7.0546142777757845, "learning_rate": 6.414758795703122e-06, "logits/chosen": -13.375, "logits/rejected": -12.9375, "logps/chosen": -520.0, "logps/rejected": -624.0, "loss": 0.1143, "rewards/accuracies": 0.9375, "rewards/chosen": -9.125, "rewards/margins": 7.59375, "rewards/rejected": -16.75, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 4.589578434412954, "learning_rate": 6.262787676816093e-06, "logits/chosen": -13.0625, "logits/rejected": -12.625, "logps/chosen": -552.0, "logps/rejected": -648.0, "loss": 0.1142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.5, "rewards/margins": 7.9375, "rewards/rejected": -17.375, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 4.762993940765846, "learning_rate": 6.112380200016832e-06, "logits/chosen": -13.6875, "logits/rejected": -13.1875, "logps/chosen": -496.0, "logps/rejected": -640.0, "loss": 0.1374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.625, "rewards/margins": 7.875, "rewards/rejected": -16.5, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 5.107564915010418, "learning_rate": 5.963548916983627e-06, "logits/chosen": -12.875, "logits/rejected": -12.375, "logps/chosen": -452.0, "logps/rejected": -552.0, "loss": 0.0917, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.5, "rewards/margins": 7.71875, "rewards/rejected": -16.25, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 2.54635356048562, "learning_rate": 5.816306247859571e-06, "logits/chosen": -13.3125, "logits/rejected": -12.875, "logps/chosen": -552.0, "logps/rejected": -640.0, "loss": 0.0855, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.1875, "rewards/margins": 9.375, "rewards/rejected": -17.5, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 3.298880234845822, "learning_rate": 5.670664480216087e-06, "logits/chosen": -13.25, "logits/rejected": -12.75, "logps/chosen": -516.0, "logps/rejected": -676.0, "loss": 0.0884, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.0, "rewards/margins": 9.8125, "rewards/rejected": -17.75, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 4.353379277791028, "learning_rate": 5.526635768027489e-06, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -528.0, "logps/rejected": -620.0, "loss": 0.1106, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.5, "rewards/margins": 8.875, "rewards/rejected": -17.375, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 1.7350907393275037, "learning_rate": 5.384232130656772e-06, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -536.0, "logps/rejected": -704.0, "loss": 0.0954, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.25, "rewards/margins": 9.875, "rewards/rejected": -18.125, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 1.9702905557486132, "learning_rate": 5.243465451852547e-06, "logits/chosen": -12.9375, "logits/rejected": -12.25, "logps/chosen": -512.0, "logps/rejected": -660.0, "loss": 0.1501, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.5625, "rewards/margins": 8.625, "rewards/rejected": -17.25, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 2.119534360879486, "learning_rate": 5.104347478757313e-06, "logits/chosen": -12.8125, "logits/rejected": -12.25, "logps/chosen": -480.0, "logps/rejected": -636.0, "loss": 0.1065, "rewards/accuracies": 0.9375, "rewards/chosen": -8.75, "rewards/margins": 8.3125, "rewards/rejected": -17.125, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 5.305348928368352, "learning_rate": 4.9668898209272094e-06, "logits/chosen": -13.625, "logits/rejected": -13.0625, "logps/chosen": -544.0, "logps/rejected": -648.0, "loss": 0.1039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.0, "rewards/margins": 9.6875, "rewards/rejected": -17.75, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 3.865588511769586, "learning_rate": 4.831103949363103e-06, "logits/chosen": -13.0, "logits/rejected": -12.4375, "logps/chosen": -486.0, "logps/rejected": -576.0, "loss": 0.1176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.1875, "rewards/margins": 7.9375, "rewards/rejected": -16.125, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 1.1087995031281264, "learning_rate": 4.697001195553366e-06, "logits/chosen": -13.3125, "logits/rejected": -12.875, "logps/chosen": -476.0, "logps/rejected": -628.0, "loss": 0.0754, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.0625, "rewards/margins": 8.5, "rewards/rejected": -17.5, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 9.607813501681845, "learning_rate": 4.564592750528271e-06, "logits/chosen": -13.25, "logits/rejected": -12.75, "logps/chosen": -506.0, "logps/rejected": -584.0, "loss": 0.1084, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.9375, "rewards/margins": 7.5625, "rewards/rejected": -16.5, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 2.166356498800773, "learning_rate": 4.4338896639260276e-06, "logits/chosen": -13.4375, "logits/rejected": -13.0, "logps/chosen": -508.0, "logps/rejected": -632.0, "loss": 0.0882, "rewards/accuracies": 0.9375, "rewards/chosen": -8.375, "rewards/margins": 8.0, "rewards/rejected": -16.375, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 1.3202863612431788, "learning_rate": 4.304902843070701e-06, "logits/chosen": -13.375, "logits/rejected": -12.75, "logps/chosen": -494.0, "logps/rejected": -580.0, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": -8.5, "rewards/margins": 7.5625, "rewards/rejected": -16.0, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 6.022229933522954, "learning_rate": 4.177643052062039e-06, "logits/chosen": -13.375, "logits/rejected": -12.75, "logps/chosen": -510.0, "logps/rejected": -656.0, "loss": 0.0964, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.9375, "rewards/margins": 8.0, "rewards/rejected": -16.875, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 4.697489562167333, "learning_rate": 4.0521209108770945e-06, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -506.0, "logps/rejected": -640.0, "loss": 0.1347, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.0, "rewards/margins": 8.5, "rewards/rejected": -17.5, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 7.606755469102715, "learning_rate": 3.928346894484056e-06, "logits/chosen": -13.0625, "logits/rejected": -12.9375, "logps/chosen": -552.0, "logps/rejected": -600.0, "loss": 0.1338, "rewards/accuracies": 0.9375, "rewards/chosen": -9.375, "rewards/margins": 6.28125, "rewards/rejected": -15.625, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 6.996228868642036, "learning_rate": 3.8063313319680686e-06, "logits/chosen": -12.6875, "logits/rejected": -12.25, "logps/chosen": -544.0, "logps/rejected": -680.0, "loss": 0.1061, "rewards/accuracies": 0.9375, "rewards/chosen": -9.0625, "rewards/margins": 8.6875, "rewards/rejected": -17.75, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 2.1894497750464494, "learning_rate": 3.686084405669249e-06, "logits/chosen": -13.4375, "logits/rejected": -13.0625, "logps/chosen": -540.0, "logps/rejected": -712.0, "loss": 0.1001, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.4375, "rewards/margins": 10.25, "rewards/rejected": -18.75, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 3.29711389402712, "learning_rate": 3.567616150332992e-06, "logits/chosen": -12.9375, "logits/rejected": -12.6875, "logps/chosen": -504.0, "logps/rejected": -588.0, "loss": 0.1136, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.625, "rewards/margins": 7.40625, "rewards/rejected": -16.0, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 3.5590799179096213, "learning_rate": 3.450936452272524e-06, "logits/chosen": -13.25, "logits/rejected": -12.8125, "logps/chosen": -490.0, "logps/rejected": -612.0, "loss": 0.1291, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.4375, "rewards/margins": 7.8125, "rewards/rejected": -16.25, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 2.1632084987538476, "learning_rate": 3.3360550485439067e-06, "logits/chosen": -13.5625, "logits/rejected": -13.0625, "logps/chosen": -454.0, "logps/rejected": -636.0, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -7.71875, "rewards/margins": 9.375, "rewards/rejected": -17.125, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 6.079477309011048, "learning_rate": 3.222981526133434e-06, "logits/chosen": -13.1875, "logits/rejected": -12.9375, "logps/chosen": -486.0, "logps/rejected": -600.0, "loss": 0.1236, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.0, "rewards/margins": 7.5, "rewards/rejected": -16.5, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 1.5499240661013363, "learning_rate": 3.111725321157627e-06, "logits/chosen": -13.4375, "logits/rejected": -13.1875, "logps/chosen": -516.0, "logps/rejected": -764.0, "loss": 0.0815, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.1875, "rewards/margins": 12.0625, "rewards/rejected": -19.25, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 3.7623282869444963, "learning_rate": 3.002295718075762e-06, "logits/chosen": -13.1875, "logits/rejected": -12.8125, "logps/chosen": -520.0, "logps/rejected": -680.0, "loss": 0.0881, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.25, "rewards/margins": 10.0625, "rewards/rejected": -18.375, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 7.519232363887075, "learning_rate": 2.8947018489150517e-06, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -502.0, "logps/rejected": -652.0, "loss": 0.0775, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.6875, "rewards/margins": 8.5, "rewards/rejected": -17.125, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 5.31099596157704, "learning_rate": 2.7889526925085978e-06, "logits/chosen": -12.75, "logits/rejected": -12.8125, "logps/chosen": -516.0, "logps/rejected": -644.0, "loss": 0.1053, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.0625, "rewards/margins": 9.4375, "rewards/rejected": -17.5, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 7.1196364277612165, "learning_rate": 2.6850570737460916e-06, "logits/chosen": -13.125, "logits/rejected": -12.8125, "logps/chosen": -482.0, "logps/rejected": -600.0, "loss": 0.1152, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.125, "rewards/margins": 8.75, "rewards/rejected": -16.875, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 2.680324074975716, "learning_rate": 2.5830236628373363e-06, "logits/chosen": -13.1875, "logits/rejected": -13.125, "logps/chosen": -502.0, "logps/rejected": -676.0, "loss": 0.0778, "rewards/accuracies": 0.9375, "rewards/chosen": -9.0, "rewards/margins": 9.0, "rewards/rejected": -18.0, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 2.2573623813902244, "learning_rate": 2.482860974588755e-06, "logits/chosen": -13.125, "logits/rejected": -12.5, "logps/chosen": -532.0, "logps/rejected": -652.0, "loss": 0.0792, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.125, "rewards/margins": 8.9375, "rewards/rejected": -18.125, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 2.3988175015656794, "learning_rate": 2.3845773676927863e-06, "logits/chosen": -13.375, "logits/rejected": -12.9375, "logps/chosen": -500.0, "logps/rejected": -644.0, "loss": 0.0737, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.0625, "rewards/margins": 8.625, "rewards/rejected": -17.75, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 2.499191138157377, "learning_rate": 2.288181044030341e-06, "logits/chosen": -13.3125, "logits/rejected": -12.8125, "logps/chosen": -454.0, "logps/rejected": -592.0, "loss": 0.1102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.1875, "rewards/margins": 9.0, "rewards/rejected": -17.125, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 1.7297577696934656, "learning_rate": 2.193680047986385e-06, "logits/chosen": -12.9375, "logits/rejected": -12.75, "logps/chosen": -478.0, "logps/rejected": -608.0, "loss": 0.1039, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.3125, "rewards/margins": 7.375, "rewards/rejected": -16.625, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 1.3844986076148504, "learning_rate": 2.1010822657785673e-06, "logits/chosen": -13.25, "logits/rejected": -13.0625, "logps/chosen": -552.0, "logps/rejected": -624.0, "loss": 0.0855, "rewards/accuracies": 0.9375, "rewards/chosen": -8.1875, "rewards/margins": 8.625, "rewards/rejected": -16.75, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 2.5009715621795063, "learning_rate": 2.0103954247991525e-06, "logits/chosen": -13.125, "logits/rejected": -12.5625, "logps/chosen": -488.0, "logps/rejected": -612.0, "loss": 0.09, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.625, "rewards/margins": 8.6875, "rewards/rejected": -17.25, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 1.8945829950156903, "learning_rate": 1.9216270929701407e-06, "logits/chosen": -12.9375, "logits/rejected": -12.625, "logps/chosen": -474.0, "logps/rejected": -652.0, "loss": 0.0723, "rewards/accuracies": 0.9375, "rewards/chosen": -8.75, "rewards/margins": 8.0625, "rewards/rejected": -16.75, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 5.069862077253051, "learning_rate": 1.8347846781117201e-06, "logits/chosen": -13.0625, "logits/rejected": -12.625, "logps/chosen": -512.0, "logps/rejected": -616.0, "loss": 0.1063, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.8125, "rewards/margins": 9.0625, "rewards/rejected": -17.875, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 1.3102672583456336, "learning_rate": 1.7498754273240713e-06, "logits/chosen": -13.0625, "logits/rejected": -12.5, "logps/chosen": -516.0, "logps/rejected": -656.0, "loss": 0.102, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.375, "rewards/margins": 9.5, "rewards/rejected": -17.875, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 5.748063923557189, "learning_rate": 1.6669064263826028e-06, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -572.0, "logps/rejected": -608.0, "loss": 0.1097, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.8125, "rewards/margins": 8.125, "rewards/rejected": -17.0, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 4.829587618568215, "learning_rate": 1.5858845991466088e-06, "logits/chosen": -13.0, "logits/rejected": -12.875, "logps/chosen": -472.0, "logps/rejected": -576.0, "loss": 0.1007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.25, "rewards/margins": 6.65625, "rewards/rejected": -15.875, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 0.92994359502391, "learning_rate": 1.5068167069814926e-06, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -560.0, "logps/rejected": -692.0, "loss": 0.0844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.3125, "rewards/margins": 9.9375, "rewards/rejected": -18.25, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 2.1999437018442163, "learning_rate": 1.4297093481945106e-06, "logits/chosen": -13.0625, "logits/rejected": -12.75, "logps/chosen": -506.0, "logps/rejected": -608.0, "loss": 0.0948, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.125, "rewards/margins": 7.59375, "rewards/rejected": -16.75, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 4.960781699983769, "learning_rate": 1.3545689574841342e-06, "logits/chosen": -13.125, "logits/rejected": -12.5625, "logps/chosen": -544.0, "logps/rejected": -652.0, "loss": 0.0968, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.0625, "rewards/margins": 8.3125, "rewards/rejected": -17.375, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 3.66785793431449, "learning_rate": 1.2814018054030623e-06, "logits/chosen": -13.25, "logits/rejected": -12.75, "logps/chosen": -520.0, "logps/rejected": -624.0, "loss": 0.0943, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.875, "rewards/margins": 8.8125, "rewards/rejected": -17.75, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 4.803515943576416, "learning_rate": 1.2102139978349497e-06, "logits/chosen": -12.75, "logits/rejected": -12.25, "logps/chosen": -560.0, "logps/rejected": -692.0, "loss": 0.1218, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.625, "rewards/margins": 9.0, "rewards/rejected": -18.625, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 1.3014751486404437, "learning_rate": 1.14101147548486e-06, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -528.0, "logps/rejected": -640.0, "loss": 0.0868, "rewards/accuracies": 0.9375, "rewards/chosen": -8.5625, "rewards/margins": 9.3125, "rewards/rejected": -17.875, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 7.2868142451084, "learning_rate": 1.0738000133834969e-06, "logits/chosen": -13.4375, "logits/rejected": -12.9375, "logps/chosen": -532.0, "logps/rejected": -616.0, "loss": 0.1025, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.75, "rewards/margins": 7.0625, "rewards/rejected": -16.75, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 2.182109722434071, "learning_rate": 1.008585220405278e-06, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -462.0, "logps/rejected": -612.0, "loss": 0.0964, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.5, "rewards/margins": 6.96875, "rewards/rejected": -16.5, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 3.0642067374107125, "learning_rate": 9.453725388002821e-07, "logits/chosen": -13.25, "logits/rejected": -13.0, "logps/chosen": -540.0, "logps/rejected": -680.0, "loss": 0.1071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.0, "rewards/margins": 10.0, "rewards/rejected": -19.0, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 2.211282968905268, "learning_rate": 8.841672437400528e-07, "logits/chosen": -12.6875, "logits/rejected": -12.4375, "logps/chosen": -496.0, "logps/rejected": -644.0, "loss": 0.0976, "rewards/accuracies": 0.9375, "rewards/chosen": -8.875, "rewards/margins": 8.25, "rewards/rejected": -17.125, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 2.306667763441843, "learning_rate": 8.249744428774103e-07, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -576.0, "logps/rejected": -652.0, "loss": 0.0807, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.75, "rewards/margins": 9.125, "rewards/rejected": -17.875, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 5.435125332604937, "learning_rate": 7.677990759202086e-07, "logits/chosen": -12.6875, "logits/rejected": -11.9375, "logps/chosen": -572.0, "logps/rejected": -708.0, "loss": 0.0971, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.875, "rewards/margins": 9.375, "rewards/rejected": -18.25, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 3.3348576248427326, "learning_rate": 7.126459142190844e-07, "logits/chosen": -12.6875, "logits/rejected": -12.25, "logps/chosen": -532.0, "logps/rejected": -688.0, "loss": 0.1057, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.125, "rewards/margins": 9.0, "rewards/rejected": -18.125, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 2.5808442085804644, "learning_rate": 6.595195603693205e-07, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -488.0, "logps/rejected": -604.0, "loss": 0.066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.75, "rewards/margins": 7.1875, "rewards/rejected": -15.9375, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 2.7672202913255366, "learning_rate": 6.084244478267248e-07, "logits/chosen": -12.8125, "logits/rejected": -12.6875, "logps/chosen": -520.0, "logps/rejected": -628.0, "loss": 0.1047, "rewards/accuracies": 0.9375, "rewards/chosen": -8.75, "rewards/margins": 8.5, "rewards/rejected": -17.25, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 2.02922718371571, "learning_rate": 5.593648405376711e-07, "logits/chosen": -12.9375, "logits/rejected": -12.6875, "logps/chosen": -524.0, "logps/rejected": -660.0, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": -9.5, "rewards/margins": 9.0, "rewards/rejected": -18.5, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 2.046652559740739, "learning_rate": 5.123448325832475e-07, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -472.0, "logps/rejected": -580.0, "loss": 0.1085, "rewards/accuracies": 0.9375, "rewards/chosen": -9.1875, "rewards/margins": 7.53125, "rewards/rejected": -16.75, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 5.589958818337675, "learning_rate": 4.6736834783762397e-07, "logits/chosen": -12.5, "logits/rejected": -12.0625, "logps/chosen": -484.0, "logps/rejected": -612.0, "loss": 0.1086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.25, "rewards/margins": 8.0625, "rewards/rejected": -17.25, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 2.1397763501401443, "learning_rate": 4.24439139640595e-07, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -476.0, "logps/rejected": -612.0, "loss": 0.1402, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -8.8125, "rewards/margins": 7.75, "rewards/rejected": -16.5, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 4.234807002383162, "learning_rate": 3.835607904843358e-07, "logits/chosen": -13.25, "logits/rejected": -12.875, "logps/chosen": -496.0, "logps/rejected": -640.0, "loss": 0.099, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.625, "rewards/margins": 7.4375, "rewards/rejected": -17.125, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 5.447862432508138, "learning_rate": 3.4473671171447174e-07, "logits/chosen": -12.8125, "logits/rejected": -12.4375, "logps/chosen": -488.0, "logps/rejected": -596.0, "loss": 0.0801, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.5625, "rewards/margins": 8.875, "rewards/rejected": -17.375, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 3.7446251130949766, "learning_rate": 3.079701432453841e-07, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -460.0, "logps/rejected": -676.0, "loss": 0.0963, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.9375, "rewards/margins": 8.5625, "rewards/rejected": -17.5, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 3.968745889768715, "learning_rate": 2.7326415328982056e-07, "logits/chosen": -12.75, "logits/rejected": -12.25, "logps/chosen": -502.0, "logps/rejected": -616.0, "loss": 0.1276, "rewards/accuracies": 0.9375, "rewards/chosen": -10.125, "rewards/margins": 6.875, "rewards/rejected": -17.0, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 4.006875799984734, "learning_rate": 2.4062163810288365e-07, "logits/chosen": -13.0625, "logits/rejected": -12.6875, "logps/chosen": -536.0, "logps/rejected": -632.0, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.875, "rewards/margins": 8.4375, "rewards/rejected": -17.25, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 5.953542104090957, "learning_rate": 2.100453217402959e-07, "logits/chosen": -13.0625, "logits/rejected": -12.5625, "logps/chosen": -560.0, "logps/rejected": -660.0, "loss": 0.1176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.625, "rewards/margins": 8.125, "rewards/rejected": -17.75, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 5.239490240155921, "learning_rate": 1.8153775583110156e-07, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -458.0, "logps/rejected": -604.0, "loss": 0.0844, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.625, "rewards/margins": 8.0, "rewards/rejected": -16.625, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 4.377024347096592, "learning_rate": 1.5510131936472273e-07, "logits/chosen": -13.3125, "logits/rejected": -12.625, "logps/chosen": -544.0, "logps/rejected": -672.0, "loss": 0.0835, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.875, "rewards/margins": 9.3125, "rewards/rejected": -18.25, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 1.5159821172630095, "learning_rate": 1.307382184924266e-07, "logits/chosen": -13.375, "logits/rejected": -12.8125, "logps/chosen": -478.0, "logps/rejected": -600.0, "loss": 0.0899, "rewards/accuracies": 0.9375, "rewards/chosen": -9.25, "rewards/margins": 7.40625, "rewards/rejected": -16.625, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 4.469755228752353, "learning_rate": 1.0845048634321731e-07, "logits/chosen": -12.5625, "logits/rejected": -12.25, "logps/chosen": -516.0, "logps/rejected": -588.0, "loss": 0.1217, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.8125, "rewards/margins": 7.78125, "rewards/rejected": -16.625, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 3.3123314260717414, "learning_rate": 8.823998285418522e-08, "logits/chosen": -12.9375, "logits/rejected": -12.625, "logps/chosen": -544.0, "logps/rejected": -648.0, "loss": 0.086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.5625, "rewards/margins": 9.125, "rewards/rejected": -17.75, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 2.731361823113672, "learning_rate": 7.010839461526752e-08, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -544.0, "logps/rejected": -680.0, "loss": 0.0895, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.0, "rewards/margins": 10.4375, "rewards/rejected": -18.375, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 3.243299191382827, "learning_rate": 5.4057234728521756e-08, "logits/chosen": -12.9375, "logits/rejected": -12.75, "logps/chosen": -498.0, "logps/rejected": -620.0, "loss": 0.1018, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.625, "rewards/margins": 8.1875, "rewards/rejected": -16.75, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 5.643560413836883, "learning_rate": 4.0087842681846286e-08, "logits/chosen": -12.25, "logits/rejected": -11.9375, "logps/chosen": -528.0, "logps/rejected": -604.0, "loss": 0.1314, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.4375, "rewards/margins": 7.6875, "rewards/rejected": -17.125, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 1.3002574150876818, "learning_rate": 2.820138423720309e-08, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -476.0, "logps/rejected": -596.0, "loss": 0.0943, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.125, "rewards/margins": 7.78125, "rewards/rejected": -16.875, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 5.260548583290011, "learning_rate": 1.839885133332053e-08, "logits/chosen": -12.8125, "logits/rejected": -12.25, "logps/chosen": -544.0, "logps/rejected": -696.0, "loss": 0.1208, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.125, "rewards/margins": 9.875, "rewards/rejected": -19.0, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 5.112452952032154, "learning_rate": 1.0681062002940167e-08, "logits/chosen": -13.0625, "logits/rejected": -12.75, "logps/chosen": -548.0, "logps/rejected": -684.0, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": -8.4375, "rewards/margins": 9.5, "rewards/rejected": -18.0, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 0.9856562812739469, "learning_rate": 5.048660304524111e-09, "logits/chosen": -12.875, "logits/rejected": -12.4375, "logps/chosen": -512.0, "logps/rejected": -660.0, "loss": 0.1136, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.5, "rewards/margins": 8.8125, "rewards/rejected": -18.25, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 1.084985961639873, "learning_rate": 1.502116268523035e-09, "logits/chosen": -13.3125, "logits/rejected": -13.0625, "logps/chosen": -536.0, "logps/rejected": -648.0, "loss": 0.1045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.75, "rewards/margins": 9.0, "rewards/rejected": -17.75, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 1.7332241383915472, "learning_rate": 4.172585814643526e-11, "logits/chosen": -13.4375, "logits/rejected": -12.9375, "logps/chosen": -508.0, "logps/rejected": -628.0, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -8.5625, "rewards/margins": 9.0625, "rewards/rejected": -17.625, "step": 3820 }, { "epoch": 2.0, "eval_logits/chosen": -13.3125, "eval_logits/rejected": -13.0625, "eval_logps/chosen": -556.0, "eval_logps/rejected": -564.0, "eval_loss": 0.7793359160423279, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -11.75, "eval_rewards/margins": 1.9140625, "eval_rewards/rejected": -13.6875, "eval_runtime": 46.77, "eval_samples_per_second": 42.762, "eval_steps_per_second": 0.684, "step": 3822 }, { "epoch": 2.0, "step": 3822, "total_flos": 0.0, "train_loss": 0.41369995401518184, "train_runtime": 7266.4225, "train_samples_per_second": 16.826, "train_steps_per_second": 0.526 } ], "logging_steps": 10, "max_steps": 3822, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }