diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5819 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 3822, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005232862375719519, + "grad_norm": 10.351250771822766, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -12.5625, + "logits/rejected": -11.6875, + "logps/chosen": -430.0, + "logps/rejected": -460.0, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0052328623757195184, + "grad_norm": 9.844286946232794, + "learning_rate": 1.3054830287206267e-06, + "logits/chosen": -11.3125, + "logits/rejected": -11.3125, + "logps/chosen": -364.0, + "logps/rejected": -290.0, + "loss": 0.6938, + "rewards/accuracies": 0.2083333283662796, + "rewards/chosen": 0.003997802734375, + "rewards/margins": 0.0033111572265625, + "rewards/rejected": 0.000701904296875, + "step": 10 + }, + { + "epoch": 0.010465724751439037, + "grad_norm": 8.840907831144664, + "learning_rate": 2.6109660574412534e-06, + "logits/chosen": -11.0625, + "logits/rejected": -11.0625, + "logps/chosen": -264.0, + "logps/rejected": -256.0, + "loss": 0.691, + "rewards/accuracies": 0.2750000059604645, + "rewards/chosen": -0.0019989013671875, + "rewards/margins": 0.009521484375, + "rewards/rejected": -0.01153564453125, + "step": 20 + }, + { + "epoch": 0.015698587127158554, + "grad_norm": 10.083186641323694, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -10.375, + "logits/rejected": -10.3125, + "logps/chosen": -328.0, + "logps/rejected": -318.0, + "loss": 0.6803, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.00970458984375, + "rewards/margins": 0.023681640625, + "rewards/rejected": -0.01397705078125, + "step": 30 + }, + { + "epoch": 0.020931449502878074, + "grad_norm": 9.068339095588007, + "learning_rate": 5.221932114882507e-06, + "logits/chosen": -11.4375, + "logits/rejected": -11.0625, + "logps/chosen": -336.0, + "logps/rejected": -312.0, + "loss": 0.6719, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.032470703125, + "rewards/margins": 0.030517578125, + "rewards/rejected": -0.06298828125, + "step": 40 + }, + { + "epoch": 0.026164311878597593, + "grad_norm": 9.546615633761498, + "learning_rate": 6.527415143603134e-06, + "logits/chosen": -12.1875, + "logits/rejected": -12.0, + "logps/chosen": -316.0, + "logps/rejected": -314.0, + "loss": 0.6523, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09912109375, + "rewards/margins": 0.1376953125, + "rewards/rejected": -0.2373046875, + "step": 50 + }, + { + "epoch": 0.03139717425431711, + "grad_norm": 9.19274633645371, + "learning_rate": 7.832898172323761e-06, + "logits/chosen": -12.4375, + "logits/rejected": -12.125, + "logps/chosen": -334.0, + "logps/rejected": -328.0, + "loss": 0.6243, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3203125, + "rewards/margins": 0.1455078125, + "rewards/rejected": -0.466796875, + "step": 60 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 10.83447468535562, + "learning_rate": 9.138381201044387e-06, + "logits/chosen": -13.625, + "logits/rejected": -13.5625, + "logps/chosen": -380.0, + "logps/rejected": -340.0, + "loss": 0.609, + "rewards/accuracies": 0.5625, + 
"rewards/chosen": -0.6796875, + "rewards/margins": 0.287109375, + "rewards/rejected": -0.96875, + "step": 70 + }, + { + "epoch": 0.04186289900575615, + "grad_norm": 9.13507791543036, + "learning_rate": 1.0443864229765014e-05, + "logits/chosen": -13.25, + "logits/rejected": -13.25, + "logps/chosen": -324.0, + "logps/rejected": -320.0, + "loss": 0.6258, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5703125, + "rewards/margins": 0.283203125, + "rewards/rejected": -0.8515625, + "step": 80 + }, + { + "epoch": 0.04709576138147567, + "grad_norm": 8.742369309742912, + "learning_rate": 1.174934725848564e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.5, + "logps/chosen": -296.0, + "logps/rejected": -272.0, + "loss": 0.6439, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.28515625, + "rewards/margins": 0.208984375, + "rewards/rejected": -0.494140625, + "step": 90 + }, + { + "epoch": 0.052328623757195186, + "grad_norm": 11.89346927211031, + "learning_rate": 1.3054830287206268e-05, + "logits/chosen": -11.875, + "logits/rejected": -10.875, + "logps/chosen": -356.0, + "logps/rejected": -286.0, + "loss": 0.6267, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.380859375, + "rewards/margins": 0.27734375, + "rewards/rejected": -0.65625, + "step": 100 + }, + { + "epoch": 0.0575614861329147, + "grad_norm": 7.722831940344756, + "learning_rate": 1.4360313315926893e-05, + "logits/chosen": -11.125, + "logits/rejected": -10.625, + "logps/chosen": -320.0, + "logps/rejected": -292.0, + "loss": 0.6081, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6328125, + "rewards/margins": 0.443359375, + "rewards/rejected": -1.078125, + "step": 110 + }, + { + "epoch": 0.06279434850863422, + "grad_norm": 10.557403681525182, + "learning_rate": 1.5665796344647522e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.375, + "logps/chosen": -422.0, + "logps/rejected": -376.0, + "loss": 0.6488, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.578125, + "rewards/margins": 0.28515625, + "rewards/rejected": -0.86328125, + "step": 120 + }, + { + "epoch": 0.06802721088435375, + "grad_norm": 8.52113738726324, + "learning_rate": 1.6971279373368146e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.125, + "logps/chosen": -282.0, + "logps/rejected": -288.0, + "loss": 0.6485, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6796875, + "rewards/margins": 0.2890625, + "rewards/rejected": -0.96875, + "step": 130 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 10.818036746056942, + "learning_rate": 1.8276762402088773e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.125, + "logps/chosen": -362.0, + "logps/rejected": -340.0, + "loss": 0.5629, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0703125, + "rewards/margins": 0.6015625, + "rewards/rejected": -1.671875, + "step": 140 + }, + { + "epoch": 0.07849293563579278, + "grad_norm": 9.701498673533864, + "learning_rate": 1.95822454308094e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.0, + "logps/chosen": -358.0, + "logps/rejected": -326.0, + "loss": 0.6388, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.515625, + "rewards/margins": 0.48046875, + "rewards/rejected": -1.9921875, + "step": 150 + }, + { + "epoch": 0.0837257980115123, + "grad_norm": 9.972948367466696, + "learning_rate": 2.0887728459530027e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.5, + "logps/chosen": -366.0, + "logps/rejected": -324.0, 
+ "loss": 0.6264, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328125, + "rewards/margins": 0.68359375, + "rewards/rejected": -2.015625, + "step": 160 + }, + { + "epoch": 0.08895866038723181, + "grad_norm": 9.17511762621532, + "learning_rate": 2.2193211488250655e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.25, + "logps/chosen": -338.0, + "logps/rejected": -346.0, + "loss": 0.5969, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3359375, + "rewards/margins": 0.1865234375, + "rewards/rejected": -1.5234375, + "step": 170 + }, + { + "epoch": 0.09419152276295134, + "grad_norm": 6.965621587975365, + "learning_rate": 2.349869451697128e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.75, + "logps/chosen": -302.0, + "logps/rejected": -314.0, + "loss": 0.6303, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.359375, + "rewards/margins": 0.359375, + "rewards/rejected": -1.71875, + "step": 180 + }, + { + "epoch": 0.09942438513867086, + "grad_norm": 9.553188606130272, + "learning_rate": 2.4804177545691905e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5625, + "logps/chosen": -416.0, + "logps/rejected": -356.0, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7578125, + "rewards/margins": 0.62890625, + "rewards/rejected": -2.390625, + "step": 190 + }, + { + "epoch": 0.10465724751439037, + "grad_norm": 9.121837612837107, + "learning_rate": 2.6109660574412536e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.5, + "logps/chosen": -364.0, + "logps/rejected": -360.0, + "loss": 0.6058, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.859375, + "rewards/margins": 0.38671875, + "rewards/rejected": -2.25, + "step": 200 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 8.059177993813114, + "learning_rate": 2.741514360313316e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.875, + "logps/chosen": -316.0, + "logps/rejected": -294.0, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.796875, + "rewards/margins": 0.306640625, + "rewards/rejected": -2.109375, + "step": 210 + }, + { + "epoch": 0.1151229722658294, + "grad_norm": 8.500755304845391, + "learning_rate": 2.8720626631853787e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.625, + "logps/chosen": -344.0, + "logps/rejected": -354.0, + "loss": 0.7723, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4375, + "rewards/margins": 0.578125, + "rewards/rejected": -3.015625, + "step": 220 + }, + { + "epoch": 0.12035583464154893, + "grad_norm": 9.230475840269714, + "learning_rate": 3.0026109660574414e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.125, + "logps/chosen": -386.0, + "logps/rejected": -422.0, + "loss": 0.6046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.359375, + "rewards/margins": 0.63671875, + "rewards/rejected": -3.0, + "step": 230 + }, + { + "epoch": 0.12558869701726844, + "grad_norm": 9.204392189192681, + "learning_rate": 3.1331592689295045e-05, + "logits/chosen": -10.875, + "logits/rejected": -10.0, + "logps/chosen": -328.0, + "logps/rejected": -336.0, + "loss": 0.6512, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9765625, + "rewards/margins": 0.65234375, + "rewards/rejected": -2.640625, + "step": 240 + }, + { + "epoch": 0.13082155939298795, + "grad_norm": 9.749349255282869, + "learning_rate": 3.263707571801567e-05, + "logits/chosen": -11.0625, + "logits/rejected": -10.4375, + "logps/chosen": -388.0, + 
"logps/rejected": -348.0, + "loss": 0.6508, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0625, + "rewards/margins": 0.439453125, + "rewards/rejected": -2.5, + "step": 250 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 10.257472173487507, + "learning_rate": 3.394255874673629e-05, + "logits/chosen": -10.9375, + "logits/rejected": -10.4375, + "logps/chosen": -368.0, + "logps/rejected": -338.0, + "loss": 0.6354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0625, + "rewards/margins": 0.68359375, + "rewards/rejected": -2.75, + "step": 260 + }, + { + "epoch": 0.141287284144427, + "grad_norm": 6.958550084405995, + "learning_rate": 3.524804177545692e-05, + "logits/chosen": -9.875, + "logits/rejected": -9.5625, + "logps/chosen": -364.0, + "logps/rejected": -362.0, + "loss": 0.5584, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8125, + "rewards/margins": 0.7109375, + "rewards/rejected": -2.515625, + "step": 270 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 9.847805644914816, + "learning_rate": 3.6553524804177546e-05, + "logits/chosen": -11.625, + "logits/rejected": -11.6875, + "logps/chosen": -410.0, + "logps/rejected": -382.0, + "loss": 0.6862, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.75, + "rewards/margins": 0.49609375, + "rewards/rejected": -3.25, + "step": 280 + }, + { + "epoch": 0.15175300889586604, + "grad_norm": 8.573440905919824, + "learning_rate": 3.7859007832898173e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.4375, + "logps/chosen": -382.0, + "logps/rejected": -336.0, + "loss": 0.7394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9609375, + "rewards/margins": 0.62109375, + "rewards/rejected": -2.578125, + "step": 290 + }, + { + "epoch": 0.15698587127158556, + "grad_norm": 8.80725505375811, + "learning_rate": 3.91644908616188e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.25, + "logps/chosen": -380.0, + "logps/rejected": -344.0, + "loss": 0.6646, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3125, + "rewards/margins": 0.458984375, + "rewards/rejected": -2.765625, + "step": 300 + }, + { + "epoch": 0.16221873364730507, + "grad_norm": 8.968657211210374, + "learning_rate": 4.046997389033943e-05, + "logits/chosen": -12.25, + "logits/rejected": -11.9375, + "logps/chosen": -386.0, + "logps/rejected": -346.0, + "loss": 0.6784, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4375, + "rewards/margins": 0.578125, + "rewards/rejected": -3.015625, + "step": 310 + }, + { + "epoch": 0.1674515960230246, + "grad_norm": 10.194843430881596, + "learning_rate": 4.1775456919060055e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.0625, + "logps/chosen": -442.0, + "logps/rejected": -372.0, + "loss": 0.5713, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.5625, + "rewards/margins": 0.76171875, + "rewards/rejected": -4.3125, + "step": 320 + }, + { + "epoch": 0.1726844583987441, + "grad_norm": 10.313402119423792, + "learning_rate": 4.308093994778068e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -398.0, + "logps/rejected": -364.0, + "loss": 0.735, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.984375, + "rewards/margins": 0.9375, + "rewards/rejected": -4.90625, + "step": 330 + }, + { + "epoch": 0.17791732077446362, + "grad_norm": 9.173592771992633, + "learning_rate": 4.438642297650131e-05, + "logits/chosen": -11.1875, + "logits/rejected": -10.8125, + "logps/chosen": -406.0, + "logps/rejected": 
-382.0, + "loss": 0.6655, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.125, + "rewards/margins": 0.8515625, + "rewards/rejected": -4.96875, + "step": 340 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 8.34069764210929, + "learning_rate": 4.5691906005221936e-05, + "logits/chosen": -11.375, + "logits/rejected": -11.25, + "logps/chosen": -332.0, + "logps/rejected": -360.0, + "loss": 0.6806, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.59375, + "rewards/margins": 0.87109375, + "rewards/rejected": -5.46875, + "step": 350 + }, + { + "epoch": 0.18838304552590268, + "grad_norm": 11.451242673237575, + "learning_rate": 4.699738903394256e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.5, + "logps/chosen": -424.0, + "logps/rejected": -404.0, + "loss": 0.767, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.4375, + "rewards/margins": 0.578125, + "rewards/rejected": -5.03125, + "step": 360 + }, + { + "epoch": 0.1936159079016222, + "grad_norm": 8.706088083363465, + "learning_rate": 4.830287206266319e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -380.0, + "logps/rejected": -362.0, + "loss": 0.7948, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.4375, + "rewards/margins": 0.55078125, + "rewards/rejected": -5.0, + "step": 370 + }, + { + "epoch": 0.1988487702773417, + "grad_norm": 8.71050562327642, + "learning_rate": 4.960835509138381e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.8125, + "logps/chosen": -360.0, + "logps/rejected": -402.0, + "loss": 0.7885, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.40625, + "rewards/margins": 0.79296875, + "rewards/rejected": -5.21875, + "step": 380 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 9.631202895027702, + "learning_rate": 4.9999488859837295e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.1875, + "logps/chosen": -448.0, + "logps/rejected": -396.0, + "loss": 0.8545, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.25, + "rewards/margins": 0.93359375, + "rewards/rejected": -5.1875, + "step": 390 + }, + { + "epoch": 0.20931449502878074, + "grad_norm": 12.175473256479712, + "learning_rate": 4.999698536649904e-05, + "logits/chosen": -14.5, + "logits/rejected": -14.5, + "logps/chosen": -462.0, + "logps/rejected": -392.0, + "loss": 0.7412, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.8125, + "rewards/margins": 0.65234375, + "rewards/rejected": -5.4375, + "step": 400 + }, + { + "epoch": 0.21454735740450026, + "grad_norm": 10.472391673320981, + "learning_rate": 4.999239584575648e-05, + "logits/chosen": -14.75, + "logits/rejected": -14.875, + "logps/chosen": -446.0, + "logps/rejected": -438.0, + "loss": 0.7897, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.84375, + "rewards/margins": 0.298828125, + "rewards/rejected": -5.15625, + "step": 410 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 10.549150549694106, + "learning_rate": 4.9985720680610434e-05, + "logits/chosen": -14.6875, + "logits/rejected": -14.6875, + "logps/chosen": -412.0, + "logps/rejected": -390.0, + "loss": 0.7813, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.625, + "rewards/margins": 0.2734375, + "rewards/rejected": -4.90625, + "step": 420 + }, + { + "epoch": 0.2250130821559393, + "grad_norm": 8.610558700109573, + "learning_rate": 4.997696042811118e-05, + "logits/chosen": -14.625, + "logits/rejected": -14.8125, + "logps/chosen": -416.0, + "logps/rejected": 
-344.0, + "loss": 0.7214, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.375, + "rewards/margins": 0.59765625, + "rewards/rejected": -4.96875, + "step": 430 + }, + { + "epoch": 0.2302459445316588, + "grad_norm": 9.047522311209711, + "learning_rate": 4.996611581931193e-05, + "logits/chosen": -14.125, + "logits/rejected": -14.25, + "logps/chosen": -440.0, + "logps/rejected": -374.0, + "loss": 0.7039, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.5625, + "rewards/margins": 0.498046875, + "rewards/rejected": -5.0625, + "step": 440 + }, + { + "epoch": 0.23547880690737832, + "grad_norm": 10.207379585431303, + "learning_rate": 4.995318775920787e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.5, + "logps/chosen": -384.0, + "logps/rejected": -386.0, + "loss": 0.7792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.71875, + "rewards/margins": 0.130859375, + "rewards/rejected": -4.84375, + "step": 450 + }, + { + "epoch": 0.24071166928309787, + "grad_norm": 9.821006860178837, + "learning_rate": 4.9938177326660587e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.375, + "logps/chosen": -478.0, + "logps/rejected": -436.0, + "loss": 0.6816, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.625, + "rewards/margins": 1.046875, + "rewards/rejected": -5.65625, + "step": 460 + }, + { + "epoch": 0.24594453165881738, + "grad_norm": 10.675355174224261, + "learning_rate": 4.99210857743081e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.875, + "logps/chosen": -448.0, + "logps/rejected": -436.0, + "loss": 0.6973, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -5.875, + "rewards/margins": 0.72265625, + "rewards/rejected": -6.59375, + "step": 470 + }, + { + "epoch": 0.25117739403453687, + "grad_norm": 10.26484583598167, + "learning_rate": 4.990191452846024e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -406.0, + "logps/rejected": -396.0, + "loss": 0.6783, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -5.34375, + "rewards/margins": 0.66796875, + "rewards/rejected": -6.0, + "step": 480 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 9.035937614807631, + "learning_rate": 4.988066518897971e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.3125, + "logps/chosen": -464.0, + "logps/rejected": -436.0, + "loss": 0.6354, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.09375, + "rewards/margins": 0.921875, + "rewards/rejected": -7.0, + "step": 490 + }, + { + "epoch": 0.2616431187859759, + "grad_norm": 6.87488260728394, + "learning_rate": 4.985733952914852e-05, + "logits/chosen": -15.5625, + "logits/rejected": -15.375, + "logps/chosen": -452.0, + "logps/rejected": -438.0, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.0, + "rewards/margins": 0.69921875, + "rewards/rejected": -7.71875, + "step": 500 + }, + { + "epoch": 0.2668759811616955, + "grad_norm": 9.363373210153325, + "learning_rate": 4.983193949552002e-05, + "logits/chosen": -15.625, + "logits/rejected": -15.9375, + "logps/chosen": -450.0, + "logps/rejected": -420.0, + "loss": 0.7343, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -6.5, + "rewards/margins": 0.458984375, + "rewards/rejected": -6.9375, + "step": 510 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 7.7281840818721, + "learning_rate": 4.980446720775646e-05, + "logits/chosen": -14.125, + "logits/rejected": -14.25, + "logps/chosen": -464.0, + "logps/rejected": -474.0, + "loss": 
0.742, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -5.375, + "rewards/margins": 0.58984375, + "rewards/rejected": -5.9375, + "step": 520 + }, + { + "epoch": 0.2773417059131345, + "grad_norm": 9.271525518013695, + "learning_rate": 4.9774924958452084e-05, + "logits/chosen": -14.5625, + "logits/rejected": -14.75, + "logps/chosen": -492.0, + "logps/rejected": -408.0, + "loss": 0.8633, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 0.265625, + "rewards/rejected": -6.59375, + "step": 530 + }, + { + "epoch": 0.282574568288854, + "grad_norm": 11.416271168800474, + "learning_rate": 4.974331521294186e-05, + "logits/chosen": -14.75, + "logits/rejected": -14.75, + "logps/chosen": -496.0, + "logps/rejected": -460.0, + "loss": 0.7933, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.3125, + "rewards/margins": 0.5625, + "rewards/rejected": -7.90625, + "step": 540 + }, + { + "epoch": 0.28780743066457354, + "grad_norm": 7.787620742028088, + "learning_rate": 4.97096406090957e-05, + "logits/chosen": -14.3125, + "logits/rejected": -14.125, + "logps/chosen": -492.0, + "logps/rejected": -468.0, + "loss": 0.7545, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 0.69921875, + "rewards/rejected": -6.8125, + "step": 550 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 8.382409522213168, + "learning_rate": 4.96739039570983e-05, + "logits/chosen": -14.25, + "logits/rejected": -14.125, + "logps/chosen": -444.0, + "logps/rejected": -436.0, + "loss": 0.7372, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.6875, + "rewards/margins": 0.609375, + "rewards/rejected": -6.3125, + "step": 560 + }, + { + "epoch": 0.29827315541601257, + "grad_norm": 9.779033425632107, + "learning_rate": 4.963610823921471e-05, + "logits/chosen": -14.625, + "logits/rejected": -14.4375, + "logps/chosen": -482.0, + "logps/rejected": -434.0, + "loss": 0.9479, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -7.125, + "rewards/margins": -0.15625, + "rewards/rejected": -6.96875, + "step": 570 + }, + { + "epoch": 0.3035060177917321, + "grad_norm": 9.733444132976096, + "learning_rate": 4.959625660954139e-05, + "logits/chosen": -14.5, + "logits/rejected": -14.625, + "logps/chosen": -444.0, + "logps/rejected": -388.0, + "loss": 0.744, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.78125, + "rewards/margins": 0.65234375, + "rewards/rejected": -7.40625, + "step": 580 + }, + { + "epoch": 0.3087388801674516, + "grad_norm": 8.051810745094972, + "learning_rate": 4.9554352393743045e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.0625, + "logps/chosen": -470.0, + "logps/rejected": -452.0, + "loss": 0.9418, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -8.3125, + "rewards/margins": 0.64453125, + "rewards/rejected": -9.0, + "step": 590 + }, + { + "epoch": 0.3139717425431711, + "grad_norm": 5.903225463973512, + "learning_rate": 4.9510399088775047e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.75, + "logps/chosen": -500.0, + "logps/rejected": -502.0, + "loss": 0.7086, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -8.125, + "rewards/margins": 0.50390625, + "rewards/rejected": -8.5625, + "step": 600 + }, + { + "epoch": 0.31920460491889063, + "grad_norm": 11.634410981967923, + "learning_rate": 4.9464400362591644e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.125, + "logps/chosen": -420.0, + "logps/rejected": -396.0, + "loss": 0.7854, + "rewards/accuracies": 
0.550000011920929, + "rewards/chosen": -6.5625, + "rewards/margins": 0.50390625, + "rewards/rejected": -7.09375, + "step": 610 + }, + { + "epoch": 0.32443746729461015, + "grad_norm": 8.791552603599007, + "learning_rate": 4.941636005383986e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.1875, + "logps/chosen": -528.0, + "logps/rejected": -402.0, + "loss": 0.8591, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.59375, + "rewards/margins": 0.86328125, + "rewards/rejected": -7.4375, + "step": 620 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 9.02432053656773, + "learning_rate": 4.936628217153914e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.3125, + "logps/chosen": -386.0, + "logps/rejected": -408.0, + "loss": 0.7811, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -6.40625, + "rewards/margins": 0.4296875, + "rewards/rejected": -6.8125, + "step": 630 + }, + { + "epoch": 0.3349031920460492, + "grad_norm": 9.394934456092882, + "learning_rate": 4.931417089474682e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.6875, + "logps/chosen": -462.0, + "logps/rejected": -438.0, + "loss": 0.6975, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -6.5625, + "rewards/margins": 0.796875, + "rewards/rejected": -7.375, + "step": 640 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 7.439435296736013, + "learning_rate": 4.926003057220935e-05, + "logits/chosen": -15.3125, + "logits/rejected": -15.4375, + "logps/chosen": -470.0, + "logps/rejected": -434.0, + "loss": 0.7351, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.0, + "rewards/margins": 0.609375, + "rewards/rejected": -7.625, + "step": 650 + }, + { + "epoch": 0.3453689167974882, + "grad_norm": 9.162980300519617, + "learning_rate": 4.92038657219994e-05, + "logits/chosen": -16.375, + "logits/rejected": -16.5, + "logps/chosen": -402.0, + "logps/rejected": -392.0, + "loss": 0.743, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.53125, + "rewards/margins": 0.671875, + "rewards/rejected": -7.1875, + "step": 660 + }, + { + "epoch": 0.35060177917320773, + "grad_norm": 11.704890362462061, + "learning_rate": 4.914568103113882e-05, + "logits/chosen": -15.6875, + "logits/rejected": -16.0, + "logps/chosen": -442.0, + "logps/rejected": -418.0, + "loss": 0.7136, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -6.59375, + "rewards/margins": 0.671875, + "rewards/rejected": -7.28125, + "step": 670 + }, + { + "epoch": 0.35583464154892724, + "grad_norm": 7.227855368315395, + "learning_rate": 4.908548135520752e-05, + "logits/chosen": -14.8125, + "logits/rejected": -14.9375, + "logps/chosen": -456.0, + "logps/rejected": -416.0, + "loss": 0.6655, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.9375, + "rewards/margins": 0.5390625, + "rewards/rejected": -7.5, + "step": 680 + }, + { + "epoch": 0.36106750392464676, + "grad_norm": 9.146985437020582, + "learning_rate": 4.9023271717938224e-05, + "logits/chosen": -14.125, + "logits/rejected": -14.0625, + "logps/chosen": -528.0, + "logps/rejected": -486.0, + "loss": 0.7974, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -9.0625, + "rewards/margins": 0.53515625, + "rewards/rejected": -9.5625, + "step": 690 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 9.278505608910281, + "learning_rate": 4.8959057310797286e-05, + "logits/chosen": -14.0625, + "logits/rejected": -14.3125, + "logps/chosen": -486.0, + "logps/rejected": -428.0, + "loss": 0.7751, + 
"rewards/accuracies": 0.574999988079071, + "rewards/chosen": -6.90625, + "rewards/margins": 0.427734375, + "rewards/rejected": -7.3125, + "step": 700 + }, + { + "epoch": 0.3715332286760858, + "grad_norm": 9.712228745007655, + "learning_rate": 4.889284349255141e-05, + "logits/chosen": -14.6875, + "logits/rejected": -14.6875, + "logps/chosen": -506.0, + "logps/rejected": -474.0, + "loss": 0.6715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.46875, + "rewards/margins": 1.1328125, + "rewards/rejected": -8.5625, + "step": 710 + }, + { + "epoch": 0.37676609105180536, + "grad_norm": 8.521168690100787, + "learning_rate": 4.8824635788820475e-05, + "logits/chosen": -14.375, + "logits/rejected": -14.25, + "logps/chosen": -448.0, + "logps/rejected": -446.0, + "loss": 0.8019, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -7.875, + "rewards/margins": 0.453125, + "rewards/rejected": -8.3125, + "step": 720 + }, + { + "epoch": 0.3819989534275249, + "grad_norm": 11.10196898625787, + "learning_rate": 4.8754439891616434e-05, + "logits/chosen": -14.1875, + "logits/rejected": -14.1875, + "logps/chosen": -472.0, + "logps/rejected": -458.0, + "loss": 0.8141, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -7.34375, + "rewards/margins": 0.60546875, + "rewards/rejected": -7.9375, + "step": 730 + }, + { + "epoch": 0.3872318158032444, + "grad_norm": 10.792601975369012, + "learning_rate": 4.8682261658868264e-05, + "logits/chosen": -14.25, + "logits/rejected": -14.375, + "logps/chosen": -450.0, + "logps/rejected": -414.0, + "loss": 0.6468, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.34375, + "rewards/margins": 0.8828125, + "rewards/rejected": -8.25, + "step": 740 + }, + { + "epoch": 0.3924646781789639, + "grad_norm": 55.736192989145714, + "learning_rate": 4.860810711393317e-05, + "logits/chosen": -14.75, + "logits/rejected": -14.5625, + "logps/chosen": -480.0, + "logps/rejected": -502.0, + "loss": 1.0953, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.5625, + "rewards/margins": 0.93359375, + "rewards/rejected": -9.5, + "step": 750 + }, + { + "epoch": 0.3976975405546834, + "grad_norm": 8.990314887586853, + "learning_rate": 4.853198244509386e-05, + "logits/chosen": -14.625, + "logits/rejected": -14.625, + "logps/chosen": -450.0, + "logps/rejected": -434.0, + "loss": 0.7577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -7.78125, + "rewards/margins": 0.6640625, + "rewards/rejected": -8.4375, + "step": 760 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 7.965840471202787, + "learning_rate": 4.845389400504221e-05, + "logits/chosen": -14.375, + "logits/rejected": -14.25, + "logps/chosen": -488.0, + "logps/rejected": -490.0, + "loss": 0.7896, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -7.5625, + "rewards/margins": 0.484375, + "rewards/rejected": -8.0625, + "step": 770 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 11.497968950154975, + "learning_rate": 4.837384831034905e-05, + "logits/chosen": -13.875, + "logits/rejected": -13.75, + "logps/chosen": -466.0, + "logps/rejected": -438.0, + "loss": 0.6823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -7.15625, + "rewards/margins": 0.6875, + "rewards/rejected": -7.84375, + "step": 780 + }, + { + "epoch": 0.413396127681842, + "grad_norm": 9.66137517497451, + "learning_rate": 4.829185204092039e-05, + "logits/chosen": -14.5625, + "logits/rejected": -14.25, + "logps/chosen": -416.0, + "logps/rejected": -442.0, + "loss": 0.7498, 
+ "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.0, + "rewards/margins": 1.15625, + "rewards/rejected": -8.125, + "step": 790 + }, + { + "epoch": 0.4186289900575615, + "grad_norm": 10.37574761233887, + "learning_rate": 4.8207912039439964e-05, + "logits/chosen": -15.4375, + "logits/rejected": -15.625, + "logps/chosen": -504.0, + "logps/rejected": -466.0, + "loss": 0.8479, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.875, + "rewards/margins": 0.29296875, + "rewards/rejected": -7.1875, + "step": 800 + }, + { + "epoch": 0.423861852433281, + "grad_norm": 10.351543599017008, + "learning_rate": 4.812203531079819e-05, + "logits/chosen": -15.25, + "logits/rejected": -15.375, + "logps/chosen": -476.0, + "logps/rejected": -454.0, + "loss": 0.7395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.09375, + "rewards/margins": 0.75, + "rewards/rejected": -7.84375, + "step": 810 + }, + { + "epoch": 0.4290947148090005, + "grad_norm": 8.622095411540833, + "learning_rate": 4.803422902150762e-05, + "logits/chosen": -14.375, + "logits/rejected": -14.4375, + "logps/chosen": -494.0, + "logps/rejected": -456.0, + "loss": 0.7025, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -8.6875, + "rewards/margins": 0.6875, + "rewards/rejected": -9.375, + "step": 820 + }, + { + "epoch": 0.43432757718472004, + "grad_norm": 9.264123822708422, + "learning_rate": 4.794450049910487e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.9375, + "logps/chosen": -454.0, + "logps/rejected": -438.0, + "loss": 0.8018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.125, + "rewards/margins": 0.369140625, + "rewards/rejected": -8.5, + "step": 830 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 7.9655618610185845, + "learning_rate": 4.785285723153915e-05, + "logits/chosen": -11.625, + "logits/rejected": -12.0, + "logps/chosen": -536.0, + "logps/rejected": -472.0, + "loss": 0.7406, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -7.53125, + "rewards/margins": 0.6953125, + "rewards/rejected": -8.25, + "step": 840 + }, + { + "epoch": 0.44479330193615907, + "grad_norm": 10.645509151597812, + "learning_rate": 4.775930686654738e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.125, + "logps/chosen": -470.0, + "logps/rejected": -440.0, + "loss": 0.733, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.125, + "rewards/margins": 1.0078125, + "rewards/rejected": -9.125, + "step": 850 + }, + { + "epoch": 0.4500261643118786, + "grad_norm": 9.095410131374702, + "learning_rate": 4.7663857211015936e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -434.0, + "logps/rejected": -468.0, + "loss": 0.7619, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -6.53125, + "rewards/margins": 0.5546875, + "rewards/rejected": -7.0625, + "step": 860 + }, + { + "epoch": 0.4552590266875981, + "grad_norm": 7.213273555139093, + "learning_rate": 4.756651623032922e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.75, + "logps/chosen": -458.0, + "logps/rejected": -412.0, + "loss": 0.7308, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -6.40625, + "rewards/margins": 0.3359375, + "rewards/rejected": -6.75, + "step": 870 + }, + { + "epoch": 0.4604918890633176, + "grad_norm": 10.25034124783594, + "learning_rate": 4.746729204770491e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.125, + "logps/chosen": -532.0, + "logps/rejected": -470.0, + "loss": 0.688, + 
"rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.6875, + "rewards/margins": 0.9140625, + "rewards/rejected": -9.625, + "step": 880 + }, + { + "epoch": 0.46572475143903713, + "grad_norm": 9.164696578171599, + "learning_rate": 4.736619294351607e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.25, + "logps/chosen": -556.0, + "logps/rejected": -516.0, + "loss": 0.7735, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.8125, + "rewards/margins": 0.68359375, + "rewards/rejected": -10.5, + "step": 890 + }, + { + "epoch": 0.47095761381475665, + "grad_norm": 9.176482738323408, + "learning_rate": 4.726322735460012e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.5, + "logps/chosen": -476.0, + "logps/rejected": -510.0, + "loss": 0.761, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.875, + "rewards/margins": 1.3984375, + "rewards/rejected": -9.25, + "step": 900 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 12.033840542792827, + "learning_rate": 4.715840387355481e-05, + "logits/chosen": -12.0625, + "logits/rejected": -11.875, + "logps/chosen": -452.0, + "logps/rejected": -440.0, + "loss": 0.84, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.75, + "rewards/margins": 0.83203125, + "rewards/rejected": -9.5625, + "step": 910 + }, + { + "epoch": 0.48142333856619574, + "grad_norm": 9.951231450685535, + "learning_rate": 4.705173124802114e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.0625, + "logps/chosen": -528.0, + "logps/rejected": -500.0, + "loss": 0.6771, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -9.4375, + "rewards/margins": 0.451171875, + "rewards/rejected": -9.875, + "step": 920 + }, + { + "epoch": 0.48665620094191525, + "grad_norm": 5.982719210494255, + "learning_rate": 4.694321837995337e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.125, + "logps/chosen": -516.0, + "logps/rejected": -482.0, + "loss": 0.6545, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.0625, + "rewards/margins": 0.9296875, + "rewards/rejected": -10.0, + "step": 930 + }, + { + "epoch": 0.49188906331763477, + "grad_norm": 7.732935041516918, + "learning_rate": 4.683287432487612e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.3125, + "logps/chosen": -520.0, + "logps/rejected": -482.0, + "loss": 0.6515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -9.3125, + "rewards/margins": 1.125, + "rewards/rejected": -10.4375, + "step": 940 + }, + { + "epoch": 0.4971219256933543, + "grad_norm": 11.192514507830417, + "learning_rate": 4.672070829112868e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.8125, + "logps/chosen": -498.0, + "logps/rejected": -488.0, + "loss": 0.6869, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.8125, + "rewards/margins": 1.0390625, + "rewards/rejected": -9.875, + "step": 950 + }, + { + "epoch": 0.5023547880690737, + "grad_norm": 9.893002359130774, + "learning_rate": 4.6606729639096606e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.1875, + "logps/chosen": -520.0, + "logps/rejected": -520.0, + "loss": 0.6144, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -10.4375, + "rewards/margins": 1.484375, + "rewards/rejected": -11.9375, + "step": 960 + }, + { + "epoch": 0.5075876504447933, + "grad_norm": 11.075478903393348, + "learning_rate": 4.6490947880430515e-05, + "logits/chosen": -11.5, + "logits/rejected": -11.0, + "logps/chosen": -584.0, + "logps/rejected": -520.0, + "loss": 0.7253, + "rewards/accuracies": 
0.737500011920929, + "rewards/chosen": -10.75, + "rewards/margins": 1.4453125, + "rewards/rejected": -12.1875, + "step": 970 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 7.207931474384908, + "learning_rate": 4.637337267725239e-05, + "logits/chosen": -12.0625, + "logits/rejected": -11.8125, + "logps/chosen": -584.0, + "logps/rejected": -536.0, + "loss": 0.6751, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.25, + "rewards/margins": 1.5078125, + "rewards/rejected": -11.75, + "step": 980 + }, + { + "epoch": 0.5180533751962323, + "grad_norm": 10.223265502840688, + "learning_rate": 4.625401384134921e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.6875, + "logps/chosen": -504.0, + "logps/rejected": -466.0, + "loss": 0.6907, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.5625, + "rewards/margins": 1.0, + "rewards/rejected": -10.5625, + "step": 990 + }, + { + "epoch": 0.5232862375719518, + "grad_norm": 9.366756020201832, + "learning_rate": 4.613288133335418e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.0625, + "logps/chosen": -490.0, + "logps/rejected": -472.0, + "loss": 0.7333, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.625, + "rewards/margins": 1.0, + "rewards/rejected": -10.625, + "step": 1000 + }, + { + "epoch": 0.5285190999476713, + "grad_norm": 9.668457676351762, + "learning_rate": 4.600998526191553e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.5625, + "logps/chosen": -540.0, + "logps/rejected": -544.0, + "loss": 0.8312, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.9375, + "rewards/margins": 0.73828125, + "rewards/rejected": -10.6875, + "step": 1010 + }, + { + "epoch": 0.533751962323391, + "grad_norm": 9.169851106004764, + "learning_rate": 4.588533588285287e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.1875, + "logps/chosen": -540.0, + "logps/rejected": -502.0, + "loss": 0.6978, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.375, + "rewards/margins": 1.3046875, + "rewards/rejected": -11.6875, + "step": 1020 + }, + { + "epoch": 0.5389848246991105, + "grad_norm": 6.76805241291535, + "learning_rate": 4.5758943598301354e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.1875, + "logps/chosen": -556.0, + "logps/rejected": -492.0, + "loss": 0.7095, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.875, + "rewards/margins": 0.66796875, + "rewards/rejected": -11.5, + "step": 1030 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 7.48260799269685, + "learning_rate": 4.5630818955843646e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.8125, + "logps/chosen": -506.0, + "logps/rejected": -536.0, + "loss": 0.7073, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -9.9375, + "rewards/margins": 1.03125, + "rewards/rejected": -10.9375, + "step": 1040 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 9.671760718133113, + "learning_rate": 4.550097264762968e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.625, + "logps/chosen": -492.0, + "logps/rejected": -492.0, + "loss": 0.8316, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -8.875, + "rewards/margins": 0.71875, + "rewards/rejected": -9.5625, + "step": 1050 + }, + { + "epoch": 0.554683411826269, + "grad_norm": 7.232272637524744, + "learning_rate": 4.536941550948439e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.25, + "logps/chosen": -512.0, + "logps/rejected": -482.0, + "loss": 0.7443, + "rewards/accuracies": 0.6875, + 
"rewards/chosen": -9.25, + "rewards/margins": 0.84765625, + "rewards/rejected": -10.0625, + "step": 1060 + }, + { + "epoch": 0.5599162742019885, + "grad_norm": 7.573253707315908, + "learning_rate": 4.5236158520003444e-05, + "logits/chosen": -12.9375, + "logits/rejected": -13.0, + "logps/chosen": -536.0, + "logps/rejected": -502.0, + "loss": 0.6025, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.1875, + "rewards/margins": 1.0390625, + "rewards/rejected": -9.25, + "step": 1070 + }, + { + "epoch": 0.565149136577708, + "grad_norm": 7.888840261037634, + "learning_rate": 4.510121279963709e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.3125, + "logps/chosen": -528.0, + "logps/rejected": -496.0, + "loss": 0.7204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -10.0625, + "rewards/margins": 1.03125, + "rewards/rejected": -11.0625, + "step": 1080 + }, + { + "epoch": 0.5703819989534276, + "grad_norm": 13.762939010968964, + "learning_rate": 4.4964589609762095e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.125, + "logps/chosen": -564.0, + "logps/rejected": -494.0, + "loss": 0.8346, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -11.4375, + "rewards/margins": 0.7734375, + "rewards/rejected": -12.1875, + "step": 1090 + }, + { + "epoch": 0.5756148613291471, + "grad_norm": 9.56155800775074, + "learning_rate": 4.482630035174205e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.9375, + "logps/chosen": -512.0, + "logps/rejected": -492.0, + "loss": 0.6975, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.8125, + "rewards/margins": 0.828125, + "rewards/rejected": -11.625, + "step": 1100 + }, + { + "epoch": 0.5808477237048666, + "grad_norm": 9.650034622926128, + "learning_rate": 4.468635656597582e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.9375, + "logps/chosen": -490.0, + "logps/rejected": -488.0, + "loss": 0.7859, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.75, + "rewards/margins": 0.77734375, + "rewards/rejected": -10.5, + "step": 1110 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 9.009994906294855, + "learning_rate": 4.454476993093454e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.375, + "logps/chosen": -580.0, + "logps/rejected": -524.0, + "loss": 0.9287, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.375, + "rewards/margins": 0.58203125, + "rewards/rejected": -10.9375, + "step": 1120 + }, + { + "epoch": 0.5913134484563056, + "grad_norm": 10.042283135138778, + "learning_rate": 4.440155226218703e-05, + "logits/chosen": -11.375, + "logits/rejected": -11.1875, + "logps/chosen": -496.0, + "logps/rejected": -504.0, + "loss": 0.8404, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -9.25, + "rewards/margins": 0.66015625, + "rewards/rejected": -9.875, + "step": 1130 + }, + { + "epoch": 0.5965463108320251, + "grad_norm": 8.418582913270656, + "learning_rate": 4.425671551141376e-05, + "logits/chosen": -11.125, + "logits/rejected": -10.875, + "logps/chosen": -564.0, + "logps/rejected": -520.0, + "loss": 0.6583, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -10.5625, + "rewards/margins": 0.8671875, + "rewards/rejected": -11.5, + "step": 1140 + }, + { + "epoch": 0.6017791732077447, + "grad_norm": 9.416691885986982, + "learning_rate": 4.411027176540948e-05, + "logits/chosen": -10.75, + "logits/rejected": -10.5625, + "logps/chosen": -524.0, + "logps/rejected": -510.0, + "loss": 0.6848, + "rewards/accuracies": 0.7250000238418579, + 
"rewards/chosen": -11.0, + "rewards/margins": 1.234375, + "rewards/rejected": -12.1875, + "step": 1150 + }, + { + "epoch": 0.6070120355834642, + "grad_norm": 10.865786644213799, + "learning_rate": 4.396223324507454e-05, + "logits/chosen": -10.8125, + "logits/rejected": -10.75, + "logps/chosen": -524.0, + "logps/rejected": -454.0, + "loss": 0.9435, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.4375, + "rewards/margins": 0.640625, + "rewards/rejected": -12.125, + "step": 1160 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 12.184578875756303, + "learning_rate": 4.3812612304395046e-05, + "logits/chosen": -11.25, + "logits/rejected": -11.125, + "logps/chosen": -556.0, + "logps/rejected": -564.0, + "loss": 0.7482, + "rewards/accuracies": 0.6875, + "rewards/chosen": -11.375, + "rewards/margins": 0.76953125, + "rewards/rejected": -12.125, + "step": 1170 + }, + { + "epoch": 0.6174777603349032, + "grad_norm": 7.183055929786656, + "learning_rate": 4.366142142941195e-05, + "logits/chosen": -10.6875, + "logits/rejected": -10.375, + "logps/chosen": -552.0, + "logps/rejected": -516.0, + "loss": 0.711, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.125, + "rewards/margins": 0.8515625, + "rewards/rejected": -12.0, + "step": 1180 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 7.359506759653038, + "learning_rate": 4.350867323717902e-05, + "logits/chosen": -10.625, + "logits/rejected": -10.375, + "logps/chosen": -528.0, + "logps/rejected": -510.0, + "loss": 0.5868, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.6875, + "rewards/margins": 0.7890625, + "rewards/rejected": -11.4375, + "step": 1190 + }, + { + "epoch": 0.6279434850863422, + "grad_norm": 6.501759078401181, + "learning_rate": 4.335438047470996e-05, + "logits/chosen": -10.75, + "logits/rejected": -10.5, + "logps/chosen": -528.0, + "logps/rejected": -536.0, + "loss": 0.6786, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -10.625, + "rewards/margins": 1.4296875, + "rewards/rejected": -12.0625, + "step": 1200 + }, + { + "epoch": 0.6331763474620618, + "grad_norm": 7.620878974669834, + "learning_rate": 4.3198556017914635e-05, + "logits/chosen": -11.1875, + "logits/rejected": -10.875, + "logps/chosen": -588.0, + "logps/rejected": -524.0, + "loss": 0.7357, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -11.125, + "rewards/margins": 1.40625, + "rewards/rejected": -12.5625, + "step": 1210 + }, + { + "epoch": 0.6384092098377813, + "grad_norm": 9.34056834357026, + "learning_rate": 4.30412128705246e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.3125, + "logps/chosen": -564.0, + "logps/rejected": -520.0, + "loss": 0.7723, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.375, + "rewards/margins": 1.109375, + "rewards/rejected": -11.4375, + "step": 1220 + }, + { + "epoch": 0.6436420722135008, + "grad_norm": 7.674870336983743, + "learning_rate": 4.28823641630079e-05, + "logits/chosen": -11.375, + "logits/rejected": -11.1875, + "logps/chosen": -568.0, + "logps/rejected": -516.0, + "loss": 0.7292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.125, + "rewards/margins": 1.5078125, + "rewards/rejected": -11.625, + "step": 1230 + }, + { + "epoch": 0.6488749345892203, + "grad_norm": 7.40544549406474, + "learning_rate": 4.2722023151473294e-05, + "logits/chosen": -10.9375, + "logits/rejected": -10.9375, + "logps/chosen": -486.0, + "logps/rejected": -492.0, + "loss": 0.7212, + "rewards/accuracies": 0.625, + "rewards/chosen": 
-10.75, + "rewards/margins": 1.1171875, + "rewards/rejected": -11.875, + "step": 1240 + }, + { + "epoch": 0.6541077969649398, + "grad_norm": 9.53123578774151, + "learning_rate": 4.256020321656405e-05, + "logits/chosen": -10.625, + "logits/rejected": -10.375, + "logps/chosen": -560.0, + "logps/rejected": -552.0, + "loss": 0.7306, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.3125, + "rewards/margins": 0.95703125, + "rewards/rejected": -13.25, + "step": 1250 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 6.161183952229396, + "learning_rate": 4.239691786234133e-05, + "logits/chosen": -11.0, + "logits/rejected": -10.9375, + "logps/chosen": -544.0, + "logps/rejected": -488.0, + "loss": 0.6762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.9375, + "rewards/margins": 1.140625, + "rewards/rejected": -12.125, + "step": 1260 + }, + { + "epoch": 0.6645735217163788, + "grad_norm": 8.172666817720033, + "learning_rate": 4.223218071515721e-05, + "logits/chosen": -11.0, + "logits/rejected": -10.875, + "logps/chosen": -544.0, + "logps/rejected": -516.0, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.0625, + "rewards/margins": 0.99609375, + "rewards/rejected": -12.0625, + "step": 1270 + }, + { + "epoch": 0.6698063840920984, + "grad_norm": 12.188827913123875, + "learning_rate": 4.206600552251756e-05, + "logits/chosen": -11.25, + "logits/rejected": -11.125, + "logps/chosen": -524.0, + "logps/rejected": -486.0, + "loss": 0.79, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.5, + "rewards/margins": 0.9609375, + "rewards/rejected": -11.4375, + "step": 1280 + }, + { + "epoch": 0.6750392464678179, + "grad_norm": 18.826047916539064, + "learning_rate": 4.189840615193486e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.375, + "logps/chosen": -498.0, + "logps/rejected": -520.0, + "loss": 0.7081, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -9.8125, + "rewards/margins": 0.96875, + "rewards/rejected": -10.8125, + "step": 1290 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 6.969800701972501, + "learning_rate": 4.172939658977084e-05, + "logits/chosen": -11.1875, + "logits/rejected": -11.0, + "logps/chosen": -528.0, + "logps/rejected": -544.0, + "loss": 0.7148, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.625, + "rewards/margins": 0.921875, + "rewards/rejected": -11.5625, + "step": 1300 + }, + { + "epoch": 0.6855049712192569, + "grad_norm": 14.859556755269326, + "learning_rate": 4.155899094006938e-05, + "logits/chosen": -10.9375, + "logits/rejected": -10.8125, + "logps/chosen": -564.0, + "logps/rejected": -528.0, + "loss": 0.7416, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.875, + "rewards/margins": 0.9609375, + "rewards/rejected": -11.875, + "step": 1310 + }, + { + "epoch": 0.6907378335949764, + "grad_norm": 6.957089873593148, + "learning_rate": 4.138720342337947e-05, + "logits/chosen": -11.125, + "logits/rejected": -11.0, + "logps/chosen": -564.0, + "logps/rejected": -552.0, + "loss": 0.6879, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -11.75, + "rewards/margins": 1.0859375, + "rewards/rejected": -12.8125, + "step": 1320 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 9.707692830662324, + "learning_rate": 4.121404837556851e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.5625, + "logps/chosen": -580.0, + "logps/rejected": -616.0, + "loss": 0.6995, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.5625, + 
"rewards/margins": 1.546875, + "rewards/rejected": -13.0625, + "step": 1330 + }, + { + "epoch": 0.7012035583464155, + "grad_norm": 8.770735737960996, + "learning_rate": 4.103954024662594e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.4375, + "logps/chosen": -568.0, + "logps/rejected": -536.0, + "loss": 0.719, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -11.25, + "rewards/margins": 0.640625, + "rewards/rejected": -11.875, + "step": 1340 + }, + { + "epoch": 0.706436420722135, + "grad_norm": 7.1593445389385515, + "learning_rate": 4.086369359945743e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.4375, + "logps/chosen": -576.0, + "logps/rejected": -564.0, + "loss": 0.7039, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.5625, + "rewards/margins": 1.0546875, + "rewards/rejected": -11.625, + "step": 1350 + }, + { + "epoch": 0.7116692830978545, + "grad_norm": 9.243196425255588, + "learning_rate": 4.0686523108669496e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.625, + "logps/chosen": -600.0, + "logps/rejected": -572.0, + "loss": 0.8775, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.9375, + "rewards/margins": 1.125, + "rewards/rejected": -12.0625, + "step": 1360 + }, + { + "epoch": 0.716902145473574, + "grad_norm": 11.110076974797884, + "learning_rate": 4.050804355934498e-05, + "logits/chosen": -11.25, + "logits/rejected": -11.1875, + "logps/chosen": -568.0, + "logps/rejected": -524.0, + "loss": 0.6187, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.4375, + "rewards/margins": 0.9375, + "rewards/rejected": -12.375, + "step": 1370 + }, + { + "epoch": 0.7221350078492935, + "grad_norm": 6.00630186538614, + "learning_rate": 4.032826984580914e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.125, + "logps/chosen": -512.0, + "logps/rejected": -484.0, + "loss": 0.6801, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.3125, + "rewards/margins": 1.078125, + "rewards/rejected": -11.375, + "step": 1380 + }, + { + "epoch": 0.727367870225013, + "grad_norm": 8.977754421118522, + "learning_rate": 4.014721697038678e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.4375, + "logps/chosen": -490.0, + "logps/rejected": -482.0, + "loss": 0.6641, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.625, + "rewards/margins": 1.1015625, + "rewards/rejected": -10.75, + "step": 1390 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 9.068303784048501, + "learning_rate": 3.996490004215021e-05, + "logits/chosen": -11.8125, + "logits/rejected": -11.6875, + "logps/chosen": -512.0, + "logps/rejected": -498.0, + "loss": 0.6787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.3125, + "rewards/margins": 1.125, + "rewards/rejected": -11.4375, + "step": 1400 + }, + { + "epoch": 0.7378335949764521, + "grad_norm": 10.091388237018714, + "learning_rate": 3.978133427565842e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.125, + "logps/chosen": -524.0, + "logps/rejected": -516.0, + "loss": 0.7733, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.25, + "rewards/margins": 0.84375, + "rewards/rejected": -12.125, + "step": 1410 + }, + { + "epoch": 0.7430664573521716, + "grad_norm": 9.639237375339137, + "learning_rate": 3.9596534989687416e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.1875, + "logps/chosen": -506.0, + "logps/rejected": -504.0, + "loss": 0.7797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.0, + 
"rewards/margins": 1.8671875, + "rewards/rejected": -11.875, + "step": 1420 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 6.806097018429323, + "learning_rate": 3.9410517605951824e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.9375, + "logps/chosen": -532.0, + "logps/rejected": -488.0, + "loss": 0.6814, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.6875, + "rewards/margins": 0.486328125, + "rewards/rejected": -11.1875, + "step": 1430 + }, + { + "epoch": 0.7535321821036107, + "grad_norm": 9.16956662251956, + "learning_rate": 3.922329764781793e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.125, + "logps/chosen": -588.0, + "logps/rejected": -544.0, + "loss": 0.7421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -11.0, + "rewards/margins": 1.1015625, + "rewards/rejected": -12.125, + "step": 1440 + }, + { + "epoch": 0.7587650444793302, + "grad_norm": 7.958218718062474, + "learning_rate": 3.903489073900828e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.5, + "logps/chosen": -552.0, + "logps/rejected": -532.0, + "loss": 0.7735, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.5625, + "rewards/margins": 0.8203125, + "rewards/rejected": -12.375, + "step": 1450 + }, + { + "epoch": 0.7639979068550498, + "grad_norm": 8.365628327112017, + "learning_rate": 3.884531260229778e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.625, + "logps/chosen": -536.0, + "logps/rejected": -490.0, + "loss": 0.7042, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 0.5390625, + "rewards/rejected": -11.75, + "step": 1460 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 7.681754910048477, + "learning_rate": 3.8654579058201704e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.3125, + "logps/chosen": -516.0, + "logps/rejected": -492.0, + "loss": 0.6215, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -10.3125, + "rewards/margins": 1.109375, + "rewards/rejected": -11.4375, + "step": 1470 + }, + { + "epoch": 0.7744636316064888, + "grad_norm": 8.699313776529005, + "learning_rate": 3.8462706023655404e-05, + "logits/chosen": -10.9375, + "logits/rejected": -11.0, + "logps/chosen": -516.0, + "logps/rejected": -520.0, + "loss": 0.7719, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.5625, + "rewards/margins": 0.89453125, + "rewards/rejected": -12.4375, + "step": 1480 + }, + { + "epoch": 0.7796964939822083, + "grad_norm": 10.20603489800159, + "learning_rate": 3.8269709510686005e-05, + "logits/chosen": -10.9375, + "logits/rejected": -10.8125, + "logps/chosen": -536.0, + "logps/rejected": -520.0, + "loss": 0.7257, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 0.7265625, + "rewards/rejected": -11.9375, + "step": 1490 + }, + { + "epoch": 0.7849293563579278, + "grad_norm": 6.371725891794927, + "learning_rate": 3.807560562507624e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.0, + "logps/chosen": -478.0, + "logps/rejected": -516.0, + "loss": 0.6052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.8125, + "rewards/margins": 1.390625, + "rewards/rejected": -12.1875, + "step": 1500 + }, + { + "epoch": 0.7901622187336473, + "grad_norm": 8.044514431743275, + "learning_rate": 3.7880410565020366e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.625, + "logps/chosen": -532.0, + "logps/rejected": -552.0, + "loss": 0.722, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -11.4375, + 
"rewards/margins": 0.9765625, + "rewards/rejected": -12.4375, + "step": 1510 + }, + { + "epoch": 0.7953950811093669, + "grad_norm": 6.3419982155290855, + "learning_rate": 3.76841406197724e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.125, + "logps/chosen": -468.0, + "logps/rejected": -460.0, + "loss": 0.693, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.6875, + "rewards/margins": 1.0390625, + "rewards/rejected": -10.6875, + "step": 1520 + }, + { + "epoch": 0.8006279434850864, + "grad_norm": 13.822774811604797, + "learning_rate": 3.748681216828678e-05, + "logits/chosen": -12.1875, + "logits/rejected": -11.75, + "logps/chosen": -572.0, + "logps/rejected": -644.0, + "loss": 0.7042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.4375, + "rewards/margins": 0.76953125, + "rewards/rejected": -12.25, + "step": 1530 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 8.295919264278089, + "learning_rate": 3.728844167785151e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.625, + "logps/chosen": -584.0, + "logps/rejected": -572.0, + "loss": 0.7166, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.1875, + "rewards/margins": 0.8359375, + "rewards/rejected": -12.0, + "step": 1540 + }, + { + "epoch": 0.8110936682365254, + "grad_norm": 8.520816598792539, + "learning_rate": 3.7089045702713976e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.25, + "logps/chosen": -584.0, + "logps/rejected": -510.0, + "loss": 0.7012, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.3125, + "rewards/margins": 1.0390625, + "rewards/rejected": -11.375, + "step": 1550 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 8.142573842126938, + "learning_rate": 3.6888640882699425e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.3125, + "logps/chosen": -584.0, + "logps/rejected": -568.0, + "loss": 0.7397, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.625, + "rewards/margins": 0.89453125, + "rewards/rejected": -12.5, + "step": 1560 + }, + { + "epoch": 0.8215593929879644, + "grad_norm": 7.481987186912928, + "learning_rate": 3.668724394182239e-05, + "logits/chosen": -11.375, + "logits/rejected": -10.9375, + "logps/chosen": -528.0, + "logps/rejected": -524.0, + "loss": 0.7039, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.9375, + "rewards/margins": 1.1484375, + "rewards/rejected": -13.0625, + "step": 1570 + }, + { + "epoch": 0.826792255363684, + "grad_norm": 8.042620411635573, + "learning_rate": 3.648487168689104e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.375, + "logps/chosen": -604.0, + "logps/rejected": -536.0, + "loss": 0.7544, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.25, + "rewards/margins": 0.69140625, + "rewards/rejected": -11.9375, + "step": 1580 + }, + { + "epoch": 0.8320251177394035, + "grad_norm": 6.330392212337275, + "learning_rate": 3.628154100610463e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.5, + "logps/chosen": -528.0, + "logps/rejected": -496.0, + "loss": 0.7201, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.9375, + "rewards/margins": 1.0625, + "rewards/rejected": -11.0, + "step": 1590 + }, + { + "epoch": 0.837257980115123, + "grad_norm": 9.495665962948788, + "learning_rate": 3.607726886764415e-05, + "logits/chosen": -12.0625, + "logits/rejected": -11.875, + "logps/chosen": -506.0, + "logps/rejected": -544.0, + "loss": 0.7175, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.3125, + 
"rewards/margins": 0.96484375, + "rewards/rejected": -11.25, + "step": 1600 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 8.88992804946847, + "learning_rate": 3.5872072318256375e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.75, + "logps/chosen": -596.0, + "logps/rejected": -552.0, + "loss": 0.7063, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.75, + "rewards/margins": 1.25, + "rewards/rejected": -12.0, + "step": 1610 + }, + { + "epoch": 0.847723704866562, + "grad_norm": 6.549366426211227, + "learning_rate": 3.566596848183117e-05, + "logits/chosen": -11.375, + "logits/rejected": -11.3125, + "logps/chosen": -528.0, + "logps/rejected": -488.0, + "loss": 0.7635, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.4375, + "rewards/margins": 0.74609375, + "rewards/rejected": -12.1875, + "step": 1620 + }, + { + "epoch": 0.8529565672422815, + "grad_norm": 8.965681557356943, + "learning_rate": 3.54589745579726e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.5625, + "logps/chosen": -560.0, + "logps/rejected": -564.0, + "loss": 0.7312, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -11.125, + "rewards/margins": 0.96875, + "rewards/rejected": -12.0625, + "step": 1630 + }, + { + "epoch": 0.858189429618001, + "grad_norm": 7.597359211830405, + "learning_rate": 3.5251107820563565e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5625, + "logps/chosen": -580.0, + "logps/rejected": -540.0, + "loss": 0.7307, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 0.94140625, + "rewards/rejected": -10.8125, + "step": 1640 + }, + { + "epoch": 0.8634222919937206, + "grad_norm": 16.306018432981723, + "learning_rate": 3.504238561632424e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.6875, + "logps/chosen": -512.0, + "logps/rejected": -516.0, + "loss": 0.788, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.8125, + "rewards/margins": 0.9375, + "rewards/rejected": -10.75, + "step": 1650 + }, + { + "epoch": 0.8686551543694401, + "grad_norm": 6.41824790740516, + "learning_rate": 3.483282536336451e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.6875, + "logps/chosen": -468.0, + "logps/rejected": -464.0, + "loss": 0.7286, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.125, + "rewards/margins": 1.0546875, + "rewards/rejected": -10.125, + "step": 1660 + }, + { + "epoch": 0.8738880167451596, + "grad_norm": 8.997202419771773, + "learning_rate": 3.46224445497304e-05, + "logits/chosen": -11.8125, + "logits/rejected": -11.8125, + "logps/chosen": -552.0, + "logps/rejected": -532.0, + "loss": 0.6613, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -10.1875, + "rewards/margins": 1.53125, + "rewards/rejected": -11.75, + "step": 1670 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 8.646781069565186, + "learning_rate": 3.441126073194468e-05, + "logits/chosen": -12.1875, + "logits/rejected": -11.75, + "logps/chosen": -504.0, + "logps/rejected": -532.0, + "loss": 0.6581, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -9.9375, + "rewards/margins": 1.546875, + "rewards/rejected": -11.5, + "step": 1680 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 7.113786319184486, + "learning_rate": 3.4199291533541735e-05, + "logits/chosen": -11.625, + "logits/rejected": -11.5, + "logps/chosen": -520.0, + "logps/rejected": -494.0, + "loss": 0.6574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.4375, + 
"rewards/margins": 0.8671875, + "rewards/rejected": -11.3125, + "step": 1690 + }, + { + "epoch": 0.8895866038723181, + "grad_norm": 9.146215241563787, + "learning_rate": 3.398655464359687e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.8125, + "logps/chosen": -604.0, + "logps/rejected": -484.0, + "loss": 1.2266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -12.8125, + "rewards/margins": -1.1484375, + "rewards/rejected": -11.6875, + "step": 1700 + }, + { + "epoch": 0.8948194662480377, + "grad_norm": 7.301197069927249, + "learning_rate": 3.377306781525015e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.875, + "logps/chosen": -560.0, + "logps/rejected": -540.0, + "loss": 0.7462, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.9375, + "rewards/margins": 1.0234375, + "rewards/rejected": -11.9375, + "step": 1710 + }, + { + "epoch": 0.9000523286237572, + "grad_norm": 7.026924797144021, + "learning_rate": 3.3558848864224876e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.1875, + "logps/chosen": -540.0, + "logps/rejected": -506.0, + "loss": 0.6618, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.4375, + "rewards/margins": 0.85546875, + "rewards/rejected": -11.3125, + "step": 1720 + }, + { + "epoch": 0.9052851909994767, + "grad_norm": 11.941983652096013, + "learning_rate": 3.334391566734082e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.0, + "logps/chosen": -492.0, + "logps/rejected": -494.0, + "loss": 0.7589, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.375, + "rewards/margins": 1.1484375, + "rewards/rejected": -10.5, + "step": 1730 + }, + { + "epoch": 0.9105180533751962, + "grad_norm": 8.065484818366723, + "learning_rate": 3.3128286161022394e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.75, + "logps/chosen": -552.0, + "logps/rejected": -510.0, + "loss": 0.7712, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.5, + "rewards/margins": 0.984375, + "rewards/rejected": -11.4375, + "step": 1740 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 8.045324511262809, + "learning_rate": 3.2911978339801855e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.0, + "logps/chosen": -584.0, + "logps/rejected": -568.0, + "loss": 0.7152, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.375, + "rewards/margins": 1.2890625, + "rewards/rejected": -11.6875, + "step": 1750 + }, + { + "epoch": 0.9209837781266352, + "grad_norm": 10.25223828631299, + "learning_rate": 3.269501025481763e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -528.0, + "logps/rejected": -560.0, + "loss": 0.6604, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.125, + "rewards/margins": 0.89453125, + "rewards/rejected": -11.0, + "step": 1760 + }, + { + "epoch": 0.9262166405023547, + "grad_norm": 6.658643984408585, + "learning_rate": 3.2477400012307885e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.25, + "logps/chosen": -540.0, + "logps/rejected": -540.0, + "loss": 0.7548, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.75, + "rewards/margins": 1.0, + "rewards/rejected": -10.75, + "step": 1770 + }, + { + "epoch": 0.9314495028780743, + "grad_norm": 7.583843351290834, + "learning_rate": 3.2259165772099644e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.9375, + "logps/chosen": -540.0, + "logps/rejected": -528.0, + "loss": 0.6979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.75, + "rewards/margins": 1.1171875, + 
"rewards/rejected": -10.875, + "step": 1780 + }, + { + "epoch": 0.9366823652537938, + "grad_norm": 6.695330181972035, + "learning_rate": 3.204032574609318e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.375, + "logps/chosen": -576.0, + "logps/rejected": -568.0, + "loss": 0.7396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.5, + "rewards/margins": 1.046875, + "rewards/rejected": -11.5, + "step": 1790 + }, + { + "epoch": 0.9419152276295133, + "grad_norm": 7.41109532240638, + "learning_rate": 3.1820898196742335e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.0, + "logps/chosen": -520.0, + "logps/rejected": -548.0, + "loss": 0.7853, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -11.0625, + "rewards/margins": 0.8203125, + "rewards/rejected": -11.875, + "step": 1800 + }, + { + "epoch": 0.9471480900052328, + "grad_norm": 7.998757902945492, + "learning_rate": 3.160090143553049e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.625, + "logps/chosen": -476.0, + "logps/rejected": -476.0, + "loss": 0.6803, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.375, + "rewards/margins": 1.015625, + "rewards/rejected": -11.4375, + "step": 1810 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 7.080219482241533, + "learning_rate": 3.1380353821442354e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.8125, + "logps/chosen": -448.0, + "logps/rejected": -446.0, + "loss": 0.7265, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.75, + "rewards/margins": 0.76171875, + "rewards/rejected": -10.5, + "step": 1820 + }, + { + "epoch": 0.957613814756672, + "grad_norm": 10.03679929204167, + "learning_rate": 3.1159273759431964e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.5, + "logps/chosen": -544.0, + "logps/rejected": -516.0, + "loss": 0.6867, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.125, + "rewards/margins": 1.0703125, + "rewards/rejected": -12.25, + "step": 1830 + }, + { + "epoch": 0.9628466771323915, + "grad_norm": 6.830881800921508, + "learning_rate": 3.0937679698886786e-05, + "logits/chosen": -12.0625, + "logits/rejected": -11.875, + "logps/chosen": -560.0, + "logps/rejected": -540.0, + "loss": 0.5938, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.125, + "rewards/margins": 1.3125, + "rewards/rejected": -12.4375, + "step": 1840 + }, + { + "epoch": 0.968079539508111, + "grad_norm": 8.529607259118398, + "learning_rate": 3.071559013208801e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.5625, + "logps/chosen": -628.0, + "logps/rejected": -568.0, + "loss": 0.7147, + "rewards/accuracies": 0.6875, + "rewards/chosen": -11.4375, + "rewards/margins": 1.2109375, + "rewards/rejected": -12.625, + "step": 1850 + }, + { + "epoch": 0.9733124018838305, + "grad_norm": 6.915342277139341, + "learning_rate": 3.0493023592667446e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.25, + "logps/chosen": -540.0, + "logps/rejected": -500.0, + "loss": 0.7208, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.625, + "rewards/margins": 0.76953125, + "rewards/rejected": -11.375, + "step": 1860 + }, + { + "epoch": 0.97854526425955, + "grad_norm": 9.778948710217877, + "learning_rate": 3.0269998654060788e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.0, + "logps/chosen": -502.0, + "logps/rejected": -462.0, + "loss": 0.6835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.75, + "rewards/margins": 0.7421875, + "rewards/rejected": -10.5, 
+ "step": 1870 + }, + { + "epoch": 0.9837781266352695, + "grad_norm": 5.655349340296246, + "learning_rate": 3.0046533927957677e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.3125, + "logps/chosen": -520.0, + "logps/rejected": -540.0, + "loss": 0.6201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.25, + "rewards/margins": 1.0, + "rewards/rejected": -11.25, + "step": 1880 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 10.130846611986492, + "learning_rate": 2.9822648062748536e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.375, + "logps/chosen": -612.0, + "logps/rejected": -600.0, + "loss": 0.679, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.5625, + "rewards/margins": 1.125, + "rewards/rejected": -11.6875, + "step": 1890 + }, + { + "epoch": 0.9942438513867086, + "grad_norm": 7.921271665865128, + "learning_rate": 2.959835974196836e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.5625, + "logps/chosen": -516.0, + "logps/rejected": -520.0, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.375, + "rewards/margins": 1.0625, + "rewards/rejected": -11.4375, + "step": 1900 + }, + { + "epoch": 0.9994767137624281, + "grad_norm": 8.904276645319571, + "learning_rate": 2.9373687682737484e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.0, + "logps/chosen": -588.0, + "logps/rejected": -544.0, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.8125, + "rewards/margins": 0.83203125, + "rewards/rejected": -11.625, + "step": 1910 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -12.6875, + "eval_logits/rejected": -12.5625, + "eval_logps/chosen": -536.0, + "eval_logps/rejected": -528.0, + "eval_loss": 0.7097968459129333, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": -10.8125, + "eval_rewards/margins": 1.1328125, + "eval_rewards/rejected": -11.9375, + "eval_runtime": 47.5241, + "eval_samples_per_second": 42.084, + "eval_steps_per_second": 0.673, + "step": 1911 + }, + { + "epoch": 1.0047095761381475, + "grad_norm": 4.728380640009898, + "learning_rate": 2.9148650634199674e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.1875, + "logps/chosen": -472.0, + "logps/rejected": -516.0, + "loss": 0.2767, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.5625, + "rewards/margins": 3.671875, + "rewards/rejected": -13.25, + "step": 1920 + }, + { + "epoch": 1.0099424385138671, + "grad_norm": 1.024609812555977, + "learning_rate": 2.892326737595751e-05, + "logits/chosen": -12.25, + "logits/rejected": -11.8125, + "logps/chosen": -576.0, + "logps/rejected": -612.0, + "loss": 0.1663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.375, + "rewards/margins": 6.4375, + "rewards/rejected": -15.8125, + "step": 1930 + }, + { + "epoch": 1.0151753008895865, + "grad_norm": 3.373687792077228, + "learning_rate": 2.869755671650512e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.25, + "logps/chosen": -540.0, + "logps/rejected": -660.0, + "loss": 0.1126, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.0, + "rewards/margins": 6.8125, + "rewards/rejected": -16.875, + "step": 1940 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 2.368031581220674, + "learning_rate": 2.847153749165869e-05, + "logits/chosen": -9.75, + "logits/rejected": -9.375, + "logps/chosen": -524.0, + "logps/rejected": -596.0, + "loss": 0.1652, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.75, + 
"rewards/margins": 4.5625, + "rewards/rejected": -16.375, + "step": 1950 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 1.63741968428215, + "learning_rate": 2.8245228562984516e-05, + "logits/chosen": -10.8125, + "logits/rejected": -10.125, + "logps/chosen": -536.0, + "logps/rejected": -588.0, + "loss": 0.1023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.75, + "rewards/margins": 6.4375, + "rewards/rejected": -17.25, + "step": 1960 + }, + { + "epoch": 1.0308738880167452, + "grad_norm": 2.551100302367526, + "learning_rate": 2.8018648816225025e-05, + "logits/chosen": -11.5, + "logits/rejected": -11.0, + "logps/chosen": -556.0, + "logps/rejected": -640.0, + "loss": 0.1294, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.96875, + "rewards/margins": 8.4375, + "rewards/rejected": -16.375, + "step": 1970 + }, + { + "epoch": 1.0361067503924646, + "grad_norm": 2.6749422209678073, + "learning_rate": 2.7791817159722726e-05, + "logits/chosen": -10.6875, + "logits/rejected": -10.375, + "logps/chosen": -470.0, + "logps/rejected": -552.0, + "loss": 0.1154, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.0625, + "rewards/margins": 5.46875, + "rewards/rejected": -14.5625, + "step": 1980 + }, + { + "epoch": 1.0413396127681842, + "grad_norm": 2.655504829474548, + "learning_rate": 2.756475252284229e-05, + "logits/chosen": -11.25, + "logits/rejected": -10.9375, + "logps/chosen": -556.0, + "logps/rejected": -656.0, + "loss": 0.1274, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.1875, + "rewards/margins": 6.71875, + "rewards/rejected": -15.875, + "step": 1990 + }, + { + "epoch": 1.0465724751439036, + "grad_norm": 2.5200245472186418, + "learning_rate": 2.7337473854390865e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.375, + "logps/chosen": -516.0, + "logps/rejected": -584.0, + "loss": 0.164, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.0, + "rewards/margins": 6.25, + "rewards/rejected": -15.25, + "step": 2000 + }, + { + "epoch": 1.0518053375196232, + "grad_norm": 4.499576388025018, + "learning_rate": 2.7110000121036793e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.25, + "logps/chosen": -494.0, + "logps/rejected": -600.0, + "loss": 0.0866, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.9375, + "rewards/margins": 7.875, + "rewards/rejected": -16.75, + "step": 2010 + }, + { + "epoch": 1.0570381998953426, + "grad_norm": 3.108328168380246, + "learning_rate": 2.688235030572679e-05, + "logits/chosen": -11.9375, + "logits/rejected": -11.75, + "logps/chosen": -512.0, + "logps/rejected": -584.0, + "loss": 0.1356, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.3125, + "rewards/margins": 7.0625, + "rewards/rejected": -16.375, + "step": 2020 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 2.7373393989725763, + "learning_rate": 2.6654543406101833e-05, + "logits/chosen": -12.125, + "logits/rejected": -11.6875, + "logps/chosen": -520.0, + "logps/rejected": -656.0, + "loss": 0.1524, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.125, + "rewards/margins": 7.28125, + "rewards/rejected": -17.375, + "step": 2030 + }, + { + "epoch": 1.0675039246467817, + "grad_norm": 2.6186672086656926, + "learning_rate": 2.6426598432911763e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -576.0, + "logps/rejected": -668.0, + "loss": 0.1237, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.25, + 
"rewards/margins": 7.53125, + "rewards/rejected": -16.75, + "step": 2040 + }, + { + "epoch": 1.0727367870225013, + "grad_norm": 1.4275150273517287, + "learning_rate": 2.6198534408428804e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -532.0, + "logps/rejected": -628.0, + "loss": 0.0866, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.5, + "rewards/margins": 8.4375, + "rewards/rejected": -17.0, + "step": 2050 + }, + { + "epoch": 1.077969649398221, + "grad_norm": 4.626435577549045, + "learning_rate": 2.5970370364860176e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.25, + "logps/chosen": -478.0, + "logps/rejected": -636.0, + "loss": 0.1145, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.0625, + "rewards/margins": 8.625, + "rewards/rejected": -17.625, + "step": 2060 + }, + { + "epoch": 1.0832025117739403, + "grad_norm": 21.753848528929037, + "learning_rate": 2.574212534275978e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.3125, + "logps/chosen": -464.0, + "logps/rejected": -552.0, + "loss": 0.1502, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.875, + "rewards/margins": 6.15625, + "rewards/rejected": -16.0, + "step": 2070 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 2.4608658219211628, + "learning_rate": 2.5513818389439304e-05, + "logits/chosen": -12.1875, + "logits/rejected": -11.6875, + "logps/chosen": -556.0, + "logps/rejected": -656.0, + "loss": 0.1992, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.375, + "rewards/margins": 7.34375, + "rewards/rejected": -17.75, + "step": 2080 + }, + { + "epoch": 1.0936682365253794, + "grad_norm": 2.691085082609704, + "learning_rate": 2.5285468557378616e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.5, + "logps/chosen": -476.0, + "logps/rejected": -608.0, + "loss": 0.1161, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 7.5625, + "rewards/rejected": -17.25, + "step": 2090 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 2.6635752913927853, + "learning_rate": 2.5057094902635918e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.4375, + "logps/chosen": -528.0, + "logps/rejected": -616.0, + "loss": 0.0969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.8125, + "rewards/margins": 7.4375, + "rewards/rejected": -17.25, + "step": 2100 + }, + { + "epoch": 1.1041339612768184, + "grad_norm": 1.736617765978122, + "learning_rate": 2.4828716483257418e-05, + "logits/chosen": -12.25, + "logits/rejected": -11.625, + "logps/chosen": -548.0, + "logps/rejected": -636.0, + "loss": 0.1133, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 8.125, + "rewards/rejected": -18.0, + "step": 2110 + }, + { + "epoch": 1.109366823652538, + "grad_norm": 1.226880604327556, + "learning_rate": 2.460035235768692e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.0, + "logps/chosen": -500.0, + "logps/rejected": -644.0, + "loss": 0.1001, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.1875, + "rewards/margins": 8.0, + "rewards/rejected": -18.25, + "step": 2120 + }, + { + "epoch": 1.1145996860282574, + "grad_norm": 5.772340271804208, + "learning_rate": 2.4372021583175446e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.125, + "logps/chosen": -536.0, + "logps/rejected": -568.0, + "loss": 0.1159, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 6.0, + 
"rewards/rejected": -15.875, + "step": 2130 + }, + { + "epoch": 1.119832548403977, + "grad_norm": 4.404511520229935, + "learning_rate": 2.4143743214190778e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.4375, + "logps/chosen": -510.0, + "logps/rejected": -640.0, + "loss": 0.1329, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.8125, + "rewards/margins": 7.09375, + "rewards/rejected": -16.875, + "step": 2140 + }, + { + "epoch": 1.1250654107796965, + "grad_norm": 2.5920340144560723, + "learning_rate": 2.3915536300827414e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.3125, + "logps/chosen": -458.0, + "logps/rejected": -632.0, + "loss": 0.1462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.1875, + "rewards/margins": 7.28125, + "rewards/rejected": -16.5, + "step": 2150 + }, + { + "epoch": 1.130298273155416, + "grad_norm": 1.9315423437420907, + "learning_rate": 2.3687419887216825e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.9375, + "logps/chosen": -520.0, + "logps/rejected": -592.0, + "loss": 0.1172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.25, + "rewards/margins": 6.5625, + "rewards/rejected": -15.8125, + "step": 2160 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 4.9387988320363085, + "learning_rate": 2.345941300993812e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.25, + "logps/chosen": -524.0, + "logps/rejected": -628.0, + "loss": 0.1262, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.125, + "rewards/margins": 7.5, + "rewards/rejected": -16.625, + "step": 2170 + }, + { + "epoch": 1.1407639979068551, + "grad_norm": 2.6490159969730476, + "learning_rate": 2.3231534696429533e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.625, + "logps/chosen": -500.0, + "logps/rejected": -640.0, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.8125, + "rewards/margins": 7.40625, + "rewards/rejected": -17.25, + "step": 2180 + }, + { + "epoch": 1.1459968602825745, + "grad_norm": 1.8591406460035929, + "learning_rate": 2.3003803963400468e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.25, + "logps/chosen": -504.0, + "logps/rejected": -632.0, + "loss": 0.1339, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.625, + "rewards/margins": 7.1875, + "rewards/rejected": -16.75, + "step": 2190 + }, + { + "epoch": 1.1512297226582942, + "grad_norm": 3.440279490788567, + "learning_rate": 2.2776239815244543e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5625, + "logps/chosen": -536.0, + "logps/rejected": -648.0, + "loss": 0.1023, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.8125, + "rewards/margins": 8.8125, + "rewards/rejected": -17.625, + "step": 2200 + }, + { + "epoch": 1.1564625850340136, + "grad_norm": 2.3561174482239413, + "learning_rate": 2.2548861242453742e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -528.0, + "logps/rejected": -608.0, + "loss": 0.0903, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.8125, + "rewards/margins": 7.125, + "rewards/rejected": -16.875, + "step": 2210 + }, + { + "epoch": 1.1616954474097332, + "grad_norm": 3.129509901044873, + "learning_rate": 2.2321687220033523e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -498.0, + "logps/rejected": -652.0, + "loss": 0.1041, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.5, + "rewards/margins": 8.8125, + "rewards/rejected": -18.375, + "step": 2220 + 
}, + { + "epoch": 1.1669283097854526, + "grad_norm": 6.073604137970551, + "learning_rate": 2.2094736705919368e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.8125, + "logps/chosen": -568.0, + "logps/rejected": -672.0, + "loss": 0.1124, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.75, + "rewards/margins": 8.25, + "rewards/rejected": -18.0, + "step": 2230 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 4.837786173506918, + "learning_rate": 2.186802863939477e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.5, + "logps/chosen": -502.0, + "logps/rejected": -664.0, + "loss": 0.1104, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.625, + "rewards/margins": 8.625, + "rewards/rejected": -19.25, + "step": 2240 + }, + { + "epoch": 1.1773940345368916, + "grad_norm": 8.171951988640048, + "learning_rate": 2.1641581939510667e-05, + "logits/chosen": -13.4375, + "logits/rejected": -13.1875, + "logps/chosen": -552.0, + "logps/rejected": -672.0, + "loss": 0.0996, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.5, + "rewards/margins": 7.1875, + "rewards/rejected": -17.75, + "step": 2250 + }, + { + "epoch": 1.1826268969126112, + "grad_norm": 1.7116523821577394, + "learning_rate": 2.1415415503506653e-05, + "logits/chosen": -13.4375, + "logits/rejected": -13.1875, + "logps/chosen": -548.0, + "logps/rejected": -676.0, + "loss": 0.09, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.5625, + "rewards/margins": 7.9375, + "rewards/rejected": -18.5, + "step": 2260 + }, + { + "epoch": 1.1878597592883307, + "grad_norm": 2.8123145023686416, + "learning_rate": 2.1189548205233975e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.25, + "logps/chosen": -584.0, + "logps/rejected": -728.0, + "loss": 0.0911, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.375, + "rewards/margins": 9.3125, + "rewards/rejected": -19.75, + "step": 2270 + }, + { + "epoch": 1.1930926216640503, + "grad_norm": 1.3758247246151873, + "learning_rate": 2.0963998893580487e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.125, + "logps/chosen": -544.0, + "logps/rejected": -672.0, + "loss": 0.1232, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 9.0625, + "rewards/rejected": -19.0, + "step": 2280 + }, + { + "epoch": 1.1983254840397697, + "grad_norm": 2.8815572728674486, + "learning_rate": 2.0738786390897696e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.4375, + "logps/chosen": -504.0, + "logps/rejected": -680.0, + "loss": 0.0962, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 8.875, + "rewards/rejected": -18.625, + "step": 2290 + }, + { + "epoch": 1.2035583464154893, + "grad_norm": 2.0943865882237183, + "learning_rate": 2.0513929491430006e-05, + "logits/chosen": -14.125, + "logits/rejected": -13.8125, + "logps/chosen": -516.0, + "logps/rejected": -660.0, + "loss": 0.0908, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.125, + "rewards/margins": 8.375, + "rewards/rejected": -17.5, + "step": 2300 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 4.687232667728419, + "learning_rate": 2.028944695974633e-05, + "logits/chosen": -14.375, + "logits/rejected": -13.8125, + "logps/chosen": -488.0, + "logps/rejected": -596.0, + "loss": 0.1219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.5625, + "rewards/margins": 7.28125, + "rewards/rejected": -16.875, + "step": 2310 + }, + { + "epoch": 
1.2140240711669283, + "grad_norm": 6.733912452591703, + "learning_rate": 2.006535752917414e-05, + "logits/chosen": -14.375, + "logits/rejected": -14.0, + "logps/chosen": -536.0, + "logps/rejected": -628.0, + "loss": 0.1353, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.75, + "rewards/margins": 7.6875, + "rewards/rejected": -17.375, + "step": 2320 + }, + { + "epoch": 1.2192569335426477, + "grad_norm": 2.3990026267001263, + "learning_rate": 1.9841679900236167e-05, + "logits/chosen": -13.6875, + "logits/rejected": -13.5625, + "logps/chosen": -528.0, + "logps/rejected": -624.0, + "loss": 0.1295, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.25, + "rewards/margins": 7.59375, + "rewards/rejected": -16.875, + "step": 2330 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 1.5565379549024234, + "learning_rate": 1.9618432739089843e-05, + "logits/chosen": -13.8125, + "logits/rejected": -13.5625, + "logps/chosen": -456.0, + "logps/rejected": -568.0, + "loss": 0.1056, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.5, + "rewards/margins": 7.28125, + "rewards/rejected": -15.8125, + "step": 2340 + }, + { + "epoch": 1.2297226582940868, + "grad_norm": 2.385154086116954, + "learning_rate": 1.9395634675969525e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.3125, + "logps/chosen": -504.0, + "logps/rejected": -612.0, + "loss": 0.1503, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.125, + "rewards/margins": 7.46875, + "rewards/rejected": -16.5, + "step": 2350 + }, + { + "epoch": 1.2349555206698064, + "grad_norm": 0.7231966984051394, + "learning_rate": 1.9173304303631848e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.875, + "logps/chosen": -504.0, + "logps/rejected": -608.0, + "loss": 0.0951, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.8125, + "rewards/margins": 7.46875, + "rewards/rejected": -16.25, + "step": 2360 + }, + { + "epoch": 1.2401883830455258, + "grad_norm": 3.595599348043387, + "learning_rate": 1.8951460175804104e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.3125, + "logps/chosen": -548.0, + "logps/rejected": -648.0, + "loss": 0.1048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.8125, + "rewards/margins": 7.875, + "rewards/rejected": -17.625, + "step": 2370 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 2.0288579413034777, + "learning_rate": 1.87301208056359e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.375, + "logps/chosen": -512.0, + "logps/rejected": -616.0, + "loss": 0.104, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.375, + "rewards/margins": 7.90625, + "rewards/rejected": -17.25, + "step": 2380 + }, + { + "epoch": 1.250654107796965, + "grad_norm": 2.3526669326687464, + "learning_rate": 1.8509304664154255e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.625, + "logps/chosen": -604.0, + "logps/rejected": -716.0, + "loss": 0.1063, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.375, + "rewards/margins": 7.84375, + "rewards/rejected": -18.25, + "step": 2390 + }, + { + "epoch": 1.2558869701726845, + "grad_norm": 3.112985785125725, + "learning_rate": 1.8289030178722132e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.5, + "logps/chosen": -540.0, + "logps/rejected": -676.0, + "loss": 0.1041, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -8.8125, + "rewards/margins": 9.625, + "rewards/rejected": -18.5, + "step": 2400 + }, + { + "epoch": 1.2611198325484039, + "grad_norm": 1.2581184168231971, + 
"learning_rate": 1.8069315731500666e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.3125, + "logps/chosen": -576.0, + "logps/rejected": -692.0, + "loss": 0.1438, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.3125, + "rewards/margins": 8.5, + "rewards/rejected": -17.75, + "step": 2410 + }, + { + "epoch": 1.2663526949241235, + "grad_norm": 4.762221728293621, + "learning_rate": 1.7850179657915183e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.875, + "logps/chosen": -600.0, + "logps/rejected": -616.0, + "loss": 0.1127, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.0, + "rewards/margins": 7.53125, + "rewards/rejected": -17.5, + "step": 2420 + }, + { + "epoch": 1.2715855572998431, + "grad_norm": 3.447175858247355, + "learning_rate": 1.7631640245125015e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.125, + "logps/chosen": -548.0, + "logps/rejected": -624.0, + "loss": 0.1399, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.8125, + "rewards/margins": 7.4375, + "rewards/rejected": -18.25, + "step": 2430 + }, + { + "epoch": 1.2768184196755625, + "grad_norm": 4.061321817953982, + "learning_rate": 1.7413715730497494e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.875, + "logps/chosen": -516.0, + "logps/rejected": -668.0, + "loss": 0.0983, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.5, + "rewards/margins": 9.0625, + "rewards/rejected": -18.625, + "step": 2440 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 4.489903011834727, + "learning_rate": 1.7196424300085978e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -520.0, + "logps/rejected": -604.0, + "loss": 0.0957, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.5625, + "rewards/margins": 6.75, + "rewards/rejected": -16.375, + "step": 2450 + }, + { + "epoch": 1.2872841444270016, + "grad_norm": 3.4346300831848433, + "learning_rate": 1.6979784087112188e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -468.0, + "logps/rejected": -572.0, + "loss": 0.1555, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.4375, + "rewards/margins": 7.03125, + "rewards/rejected": -16.5, + "step": 2460 + }, + { + "epoch": 1.2925170068027212, + "grad_norm": 2.4661294584896933, + "learning_rate": 1.6763813170453044e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.5625, + "logps/chosen": -474.0, + "logps/rejected": -608.0, + "loss": 0.0976, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.0, + "rewards/margins": 7.09375, + "rewards/rejected": -16.0, + "step": 2470 + }, + { + "epoch": 1.2977498691784406, + "grad_norm": 3.149223335476965, + "learning_rate": 1.6548529573131876e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.625, + "logps/chosen": -510.0, + "logps/rejected": -608.0, + "loss": 0.1061, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.625, + "rewards/margins": 7.21875, + "rewards/rejected": -16.875, + "step": 2480 + }, + { + "epoch": 1.30298273155416, + "grad_norm": 2.862506339722175, + "learning_rate": 1.6333951260814413e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.875, + "logps/chosen": -592.0, + "logps/rejected": -640.0, + "loss": 0.0693, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.3125, + "rewards/margins": 8.0625, + "rewards/rejected": -17.375, + "step": 2490 + }, + { + "epoch": 1.3082155939298796, + "grad_norm": 2.7248445532753016, + "learning_rate": 
1.6120096140309572e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.8125, + "logps/chosen": -460.0, + "logps/rejected": -592.0, + "loss": 0.1068, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.875, + "rewards/margins": 7.25, + "rewards/rejected": -16.125, + "step": 2500 + }, + { + "epoch": 1.3134484563055993, + "grad_norm": 1.4186556225169547, + "learning_rate": 1.5906982058075038e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.3125, + "logps/chosen": -478.0, + "logps/rejected": -656.0, + "loss": 0.0886, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 7.5, + "rewards/rejected": -17.125, + "step": 2510 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 2.9852405667865174, + "learning_rate": 1.569462679872801e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -528.0, + "logps/rejected": -600.0, + "loss": 0.1124, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.125, + "rewards/margins": 6.9375, + "rewards/rejected": -17.0, + "step": 2520 + }, + { + "epoch": 1.323914181057038, + "grad_norm": 3.666318609283611, + "learning_rate": 1.5483048083561036e-05, + "logits/chosen": -13.5625, + "logits/rejected": -13.0, + "logps/chosen": -540.0, + "logps/rejected": -676.0, + "loss": 0.1064, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.1875, + "rewards/margins": 9.3125, + "rewards/rejected": -18.5, + "step": 2530 + }, + { + "epoch": 1.3291470434327577, + "grad_norm": 2.9887842389088215, + "learning_rate": 1.527226356906314e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.625, + "logps/chosen": -458.0, + "logps/rejected": -588.0, + "loss": 0.1584, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.0625, + "rewards/margins": 7.3125, + "rewards/rejected": -16.375, + "step": 2540 + }, + { + "epoch": 1.3343799058084773, + "grad_norm": 4.22908382077193, + "learning_rate": 1.5062290845446403e-05, + "logits/chosen": -12.625, + "logits/rejected": -11.9375, + "logps/chosen": -540.0, + "logps/rejected": -636.0, + "loss": 0.1073, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.75, + "rewards/margins": 7.25, + "rewards/rejected": -18.0, + "step": 2550 + }, + { + "epoch": 1.3396127681841967, + "grad_norm": 2.546115179849579, + "learning_rate": 1.4853147435177992e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.0625, + "logps/chosen": -524.0, + "logps/rejected": -632.0, + "loss": 0.1146, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.375, + "rewards/margins": 8.75, + "rewards/rejected": -18.125, + "step": 2560 + }, + { + "epoch": 1.3448456305599163, + "grad_norm": 2.964138623891713, + "learning_rate": 1.4644850791517933e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -528.0, + "logps/rejected": -640.0, + "loss": 0.0923, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.125, + "rewards/margins": 8.1875, + "rewards/rejected": -17.25, + "step": 2570 + }, + { + "epoch": 1.3500784929356358, + "grad_norm": 2.833859042991735, + "learning_rate": 1.4437418297062589e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.0625, + "logps/chosen": -504.0, + "logps/rejected": -624.0, + "loss": 0.1154, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.0, + "rewards/margins": 7.875, + "rewards/rejected": -16.875, + "step": 2580 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 4.611332803169445, + "learning_rate": 1.4230867262294045e-05, + 
"logits/chosen": -13.0, + "logits/rejected": -12.5625, + "logps/chosen": -532.0, + "logps/rejected": -700.0, + "loss": 0.1234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.75, + "rewards/margins": 8.75, + "rewards/rejected": -17.5, + "step": 2590 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 2.585933332197896, + "learning_rate": 1.4025214924135616e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.0, + "logps/chosen": -464.0, + "logps/rejected": -584.0, + "loss": 0.1002, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.0625, + "rewards/margins": 6.875, + "rewards/rejected": -15.9375, + "step": 2600 + }, + { + "epoch": 1.3657770800627944, + "grad_norm": 5.004771837552434, + "learning_rate": 1.3820478444513288e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.25, + "logps/chosen": -540.0, + "logps/rejected": -636.0, + "loss": 0.1361, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.3125, + "rewards/margins": 7.71875, + "rewards/rejected": -17.0, + "step": 2610 + }, + { + "epoch": 1.3710099424385138, + "grad_norm": 3.3721942115741483, + "learning_rate": 1.3616674908923585e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -502.0, + "logps/rejected": -604.0, + "loss": 0.1074, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.0625, + "rewards/margins": 7.75, + "rewards/rejected": -16.875, + "step": 2620 + }, + { + "epoch": 1.3762428048142334, + "grad_norm": 1.926096891230754, + "learning_rate": 1.3413821325007834e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.125, + "logps/chosen": -460.0, + "logps/rejected": -608.0, + "loss": 0.0734, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.4375, + "rewards/margins": 7.25, + "rewards/rejected": -16.75, + "step": 2630 + }, + { + "epoch": 1.3814756671899528, + "grad_norm": 2.6811569703958438, + "learning_rate": 1.321193462113272e-05, + "logits/chosen": -12.125, + "logits/rejected": -11.5, + "logps/chosen": -500.0, + "logps/rejected": -584.0, + "loss": 0.1341, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.3125, + "rewards/margins": 6.65625, + "rewards/rejected": -16.0, + "step": 2640 + }, + { + "epoch": 1.3867085295656725, + "grad_norm": 1.5488117167812219, + "learning_rate": 1.3011031644977716e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.0, + "logps/chosen": -470.0, + "logps/rejected": -636.0, + "loss": 0.1135, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.5, + "rewards/margins": 7.875, + "rewards/rejected": -17.375, + "step": 2650 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 5.7062514391226555, + "learning_rate": 1.2811129162129065e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.125, + "logps/chosen": -548.0, + "logps/rejected": -740.0, + "loss": 0.0949, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.375, + "rewards/margins": 9.0625, + "rewards/rejected": -18.5, + "step": 2660 + }, + { + "epoch": 1.3971742543171115, + "grad_norm": 1.7643119117081274, + "learning_rate": 1.261224385468066e-05, + "logits/chosen": -12.3125, + "logits/rejected": -11.3125, + "logps/chosen": -528.0, + "logps/rejected": -604.0, + "loss": 0.0787, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.25, + "rewards/margins": 7.0, + "rewards/rejected": -17.25, + "step": 2670 + }, + { + "epoch": 1.402407116692831, + "grad_norm": 4.394449609840489, + "learning_rate": 1.2414392319841957e-05, + "logits/chosen": -12.3125, + "logits/rejected": -11.875, + 
"logps/chosen": -548.0, + "logps/rejected": -724.0, + "loss": 0.1152, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.1875, + "rewards/margins": 11.1875, + "rewards/rejected": -20.375, + "step": 2680 + }, + { + "epoch": 1.4076399790685505, + "grad_norm": 5.0757669192975685, + "learning_rate": 1.2217591068552894e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.1875, + "logps/chosen": -456.0, + "logps/rejected": -632.0, + "loss": 0.0827, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.25, + "rewards/margins": 7.90625, + "rewards/rejected": -17.125, + "step": 2690 + }, + { + "epoch": 1.41287284144427, + "grad_norm": 1.2128787751321914, + "learning_rate": 1.2021856524105992e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.375, + "logps/chosen": -556.0, + "logps/rejected": -652.0, + "loss": 0.0657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.5, + "rewards/margins": 8.75, + "rewards/rejected": -18.25, + "step": 2700 + }, + { + "epoch": 1.4181057038199896, + "grad_norm": 4.371148246096117, + "learning_rate": 1.1827205020775881e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.125, + "logps/chosen": -528.0, + "logps/rejected": -624.0, + "loss": 0.1347, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.25, + "rewards/margins": 8.0, + "rewards/rejected": -17.25, + "step": 2710 + }, + { + "epoch": 1.423338566195709, + "grad_norm": 1.6427572152583667, + "learning_rate": 1.163365280245615e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -480.0, + "logps/rejected": -592.0, + "loss": 0.1309, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -8.5, + "rewards/margins": 7.34375, + "rewards/rejected": -15.875, + "step": 2720 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 6.792621163838377, + "learning_rate": 1.1441216021303777e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -494.0, + "logps/rejected": -608.0, + "loss": 0.1526, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.1875, + "rewards/margins": 7.3125, + "rewards/rejected": -16.5, + "step": 2730 + }, + { + "epoch": 1.433804290947148, + "grad_norm": 7.768597274251902, + "learning_rate": 1.1249910736391203e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -496.0, + "logps/rejected": -616.0, + "loss": 0.1477, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.0625, + "rewards/margins": 8.0, + "rewards/rejected": -17.125, + "step": 2740 + }, + { + "epoch": 1.4390371533228676, + "grad_norm": 1.5634511255756072, + "learning_rate": 1.1059752912366217e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.75, + "logps/chosen": -462.0, + "logps/rejected": -660.0, + "loss": 0.1114, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.375, + "rewards/margins": 8.75, + "rewards/rejected": -17.125, + "step": 2750 + }, + { + "epoch": 1.4442700156985873, + "grad_norm": 5.473079860013893, + "learning_rate": 1.0870758418119659e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.6875, + "logps/chosen": -540.0, + "logps/rejected": -584.0, + "loss": 0.0834, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.3125, + "rewards/margins": 7.875, + "rewards/rejected": -17.25, + "step": 2760 + }, + { + "epoch": 1.4495028780743067, + "grad_norm": 3.4164753814709665, + "learning_rate": 1.0682943025461136e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.0, + "logps/chosen": -544.0, + 
"logps/rejected": -628.0, + "loss": 0.1494, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.25, + "rewards/margins": 7.4375, + "rewards/rejected": -16.625, + "step": 2770 + }, + { + "epoch": 1.454735740450026, + "grad_norm": 4.0004197012792835, + "learning_rate": 1.049632240780288e-05, + "logits/chosen": -12.625, + "logits/rejected": -11.875, + "logps/chosen": -486.0, + "logps/rejected": -556.0, + "loss": 0.1041, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.1875, + "rewards/margins": 6.46875, + "rewards/rejected": -15.6875, + "step": 2780 + }, + { + "epoch": 1.4599686028257457, + "grad_norm": 2.937844502391212, + "learning_rate": 1.0310912138851769e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.4375, + "logps/chosen": -520.0, + "logps/rejected": -688.0, + "loss": 0.0971, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.5625, + "rewards/margins": 8.0625, + "rewards/rejected": -17.625, + "step": 2790 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 4.339889798014685, + "learning_rate": 1.0126727691309638e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.9375, + "logps/chosen": -552.0, + "logps/rejected": -704.0, + "loss": 0.1164, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.375, + "rewards/margins": 8.75, + "rewards/rejected": -18.125, + "step": 2800 + }, + { + "epoch": 1.4704343275771847, + "grad_norm": 1.8102160895635062, + "learning_rate": 9.943784435582166e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.25, + "logps/chosen": -516.0, + "logps/rejected": -660.0, + "loss": 0.0842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.3125, + "rewards/margins": 8.1875, + "rewards/rejected": -17.5, + "step": 2810 + }, + { + "epoch": 1.4756671899529041, + "grad_norm": 1.7389222823704746, + "learning_rate": 9.76209763849609e-06, + "logits/chosen": -12.5625, + "logits/rejected": -11.9375, + "logps/chosen": -478.0, + "logps/rejected": -620.0, + "loss": 0.0995, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.0, + "rewards/margins": 8.25, + "rewards/rejected": -17.25, + "step": 2820 + }, + { + "epoch": 1.4809000523286238, + "grad_norm": 1.3162427246282047, + "learning_rate": 9.581682462025215e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.875, + "logps/chosen": -494.0, + "logps/rejected": -624.0, + "loss": 0.0881, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.0625, + "rewards/margins": 7.9375, + "rewards/rejected": -17.0, + "step": 2830 + }, + { + "epoch": 1.4861329147043434, + "grad_norm": 2.015607418663913, + "learning_rate": 9.40255396202518e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -568.0, + "logps/rejected": -660.0, + "loss": 0.1589, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.4375, + "rewards/margins": 7.84375, + "rewards/rejected": -17.25, + "step": 2840 + }, + { + "epoch": 1.4913657770800628, + "grad_norm": 1.0089278154821486, + "learning_rate": 9.22472708697692e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.3125, + "logps/chosen": -516.0, + "logps/rejected": -624.0, + "loss": 0.1199, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.9375, + "rewards/margins": 8.3125, + "rewards/rejected": -17.25, + "step": 2850 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 7.8442767175126145, + "learning_rate": 9.048216676739295e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.875, + "logps/chosen": -528.0, + "logps/rejected": -716.0, + "loss": 
0.1101, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.4375, + "rewards/margins": 10.375, + "rewards/rejected": -18.875, + "step": 2860 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 0.8512762475654531, + "learning_rate": 8.87303746131066e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.125, + "logps/chosen": -536.0, + "logps/rejected": -672.0, + "loss": 0.1099, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.375, + "rewards/margins": 8.5625, + "rewards/rejected": -18.0, + "step": 2870 + }, + { + "epoch": 1.5070643642072215, + "grad_norm": 1.5593140512431443, + "learning_rate": 8.699204059599578e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -536.0, + "logps/rejected": -660.0, + "loss": 0.1107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.5, + "rewards/margins": 8.625, + "rewards/rejected": -18.125, + "step": 2880 + }, + { + "epoch": 1.5122972265829409, + "grad_norm": 3.0135057825122633, + "learning_rate": 8.526730978204933e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.3125, + "logps/chosen": -572.0, + "logps/rejected": -680.0, + "loss": 0.097, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.625, + "rewards/margins": 9.3125, + "rewards/rejected": -18.875, + "step": 2890 + }, + { + "epoch": 1.5175300889586603, + "grad_norm": 4.884151124038764, + "learning_rate": 8.35563261020529e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -548.0, + "logps/rejected": -728.0, + "loss": 0.0774, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.9375, + "rewards/margins": 10.75, + "rewards/rejected": -19.75, + "step": 2900 + }, + { + "epoch": 1.5227629513343799, + "grad_norm": 5.5540339317729295, + "learning_rate": 8.185923233957802e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.125, + "logps/chosen": -486.0, + "logps/rejected": -652.0, + "loss": 0.0961, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.1875, + "rewards/margins": 9.0, + "rewards/rejected": -18.125, + "step": 2910 + }, + { + "epoch": 1.5279958137100995, + "grad_norm": 3.6558685861938636, + "learning_rate": 8.017617011906618e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.3125, + "logps/chosen": -548.0, + "logps/rejected": -704.0, + "loss": 0.08, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.75, + "rewards/margins": 7.8125, + "rewards/rejected": -17.5, + "step": 2920 + }, + { + "epoch": 1.533228676085819, + "grad_norm": 3.6345000092419517, + "learning_rate": 7.850727989401064e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.75, + "logps/chosen": -568.0, + "logps/rejected": -740.0, + "loss": 0.0925, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.625, + "rewards/margins": 8.8125, + "rewards/rejected": -18.375, + "step": 2930 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.68599240926016, + "learning_rate": 7.685270093523534e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.625, + "logps/chosen": -532.0, + "logps/rejected": -704.0, + "loss": 0.0917, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.0, + "rewards/margins": 10.125, + "rewards/rejected": -19.125, + "step": 2940 + }, + { + "epoch": 1.543694400837258, + "grad_norm": 4.5784671678935895, + "learning_rate": 7.521257131927212e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -508.0, + "logps/rejected": -572.0, + "loss": 0.1385, + "rewards/accuracies": 
0.9125000238418579, + "rewards/chosen": -10.0, + "rewards/margins": 7.28125, + "rewards/rejected": -17.25, + "step": 2950 + }, + { + "epoch": 1.5489272632129776, + "grad_norm": 3.1832934865765763, + "learning_rate": 7.358702791683869e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.875, + "logps/chosen": -478.0, + "logps/rejected": -600.0, + "loss": 0.0871, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.0625, + "rewards/margins": 8.0625, + "rewards/rejected": -17.125, + "step": 2960 + }, + { + "epoch": 1.554160125588697, + "grad_norm": 5.1042748038245565, + "learning_rate": 7.197620638141633e-06, + "logits/chosen": -13.3125, + "logits/rejected": -13.1875, + "logps/chosen": -494.0, + "logps/rejected": -616.0, + "loss": 0.0872, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.625, + "rewards/margins": 8.4375, + "rewards/rejected": -17.125, + "step": 2970 + }, + { + "epoch": 1.5593929879644164, + "grad_norm": 4.261194324303468, + "learning_rate": 7.038024113792921e-06, + "logits/chosen": -13.5, + "logits/rejected": -12.875, + "logps/chosen": -544.0, + "logps/rejected": -660.0, + "loss": 0.0934, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.75, + "rewards/margins": 8.75, + "rewards/rejected": -17.5, + "step": 2980 + }, + { + "epoch": 1.564625850340136, + "grad_norm": 5.838668015412529, + "learning_rate": 6.879926537152695e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.375, + "logps/chosen": -478.0, + "logps/rejected": -684.0, + "loss": 0.1091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.625, + "rewards/margins": 9.75, + "rewards/rejected": -18.375, + "step": 2990 + }, + { + "epoch": 1.5698587127158556, + "grad_norm": 6.215751805578572, + "learning_rate": 6.723341101646993e-06, + "logits/chosen": -13.5, + "logits/rejected": -13.1875, + "logps/chosen": -544.0, + "logps/rejected": -680.0, + "loss": 0.0991, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.375, + "rewards/margins": 9.3125, + "rewards/rejected": -17.75, + "step": 3000 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 6.288421625487859, + "learning_rate": 6.568280874511904e-06, + "logits/chosen": -13.375, + "logits/rejected": -13.0, + "logps/chosen": -544.0, + "logps/rejected": -704.0, + "loss": 0.0995, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.75, + "rewards/margins": 9.1875, + "rewards/rejected": -17.875, + "step": 3010 + }, + { + "epoch": 1.5803244374672945, + "grad_norm": 7.0546142777757845, + "learning_rate": 6.414758795703122e-06, + "logits/chosen": -13.375, + "logits/rejected": -12.9375, + "logps/chosen": -520.0, + "logps/rejected": -624.0, + "loss": 0.1143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.125, + "rewards/margins": 7.59375, + "rewards/rejected": -16.75, + "step": 3020 + }, + { + "epoch": 1.585557299843014, + "grad_norm": 4.589578434412954, + "learning_rate": 6.262787676816093e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.625, + "logps/chosen": -552.0, + "logps/rejected": -648.0, + "loss": 0.1142, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.5, + "rewards/margins": 7.9375, + "rewards/rejected": -17.375, + "step": 3030 + }, + { + "epoch": 1.5907901622187337, + "grad_norm": 4.762993940765846, + "learning_rate": 6.112380200016832e-06, + "logits/chosen": -13.6875, + "logits/rejected": -13.1875, + "logps/chosen": -496.0, + "logps/rejected": -640.0, + "loss": 0.1374, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 
-8.625, + "rewards/margins": 7.875, + "rewards/rejected": -16.5, + "step": 3040 + }, + { + "epoch": 1.5960230245944533, + "grad_norm": 5.107564915010418, + "learning_rate": 5.963548916983627e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.375, + "logps/chosen": -452.0, + "logps/rejected": -552.0, + "loss": 0.0917, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.5, + "rewards/margins": 7.71875, + "rewards/rejected": -16.25, + "step": 3050 + }, + { + "epoch": 1.6012558869701727, + "grad_norm": 2.54635356048562, + "learning_rate": 5.816306247859571e-06, + "logits/chosen": -13.3125, + "logits/rejected": -12.875, + "logps/chosen": -552.0, + "logps/rejected": -640.0, + "loss": 0.0855, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.1875, + "rewards/margins": 9.375, + "rewards/rejected": -17.5, + "step": 3060 + }, + { + "epoch": 1.6064887493458921, + "grad_norm": 3.298880234845822, + "learning_rate": 5.670664480216087e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.75, + "logps/chosen": -516.0, + "logps/rejected": -676.0, + "loss": 0.0884, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.0, + "rewards/margins": 9.8125, + "rewards/rejected": -17.75, + "step": 3070 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 4.353379277791028, + "learning_rate": 5.526635768027489e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -528.0, + "logps/rejected": -620.0, + "loss": 0.1106, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.5, + "rewards/margins": 8.875, + "rewards/rejected": -17.375, + "step": 3080 + }, + { + "epoch": 1.6169544740973314, + "grad_norm": 1.7350907393275037, + "learning_rate": 5.384232130656772e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -536.0, + "logps/rejected": -704.0, + "loss": 0.0954, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.25, + "rewards/margins": 9.875, + "rewards/rejected": -18.125, + "step": 3090 + }, + { + "epoch": 1.6221873364730508, + "grad_norm": 1.9702905557486132, + "learning_rate": 5.243465451852547e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.25, + "logps/chosen": -512.0, + "logps/rejected": -660.0, + "loss": 0.1501, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -8.5625, + "rewards/margins": 8.625, + "rewards/rejected": -17.25, + "step": 3100 + }, + { + "epoch": 1.6274201988487702, + "grad_norm": 2.119534360879486, + "learning_rate": 5.104347478757313e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.25, + "logps/chosen": -480.0, + "logps/rejected": -636.0, + "loss": 0.1065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.75, + "rewards/margins": 8.3125, + "rewards/rejected": -17.125, + "step": 3110 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 5.305348928368352, + "learning_rate": 4.9668898209272094e-06, + "logits/chosen": -13.625, + "logits/rejected": -13.0625, + "logps/chosen": -544.0, + "logps/rejected": -648.0, + "loss": 0.1039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.0, + "rewards/margins": 9.6875, + "rewards/rejected": -17.75, + "step": 3120 + }, + { + "epoch": 1.6378859236002095, + "grad_norm": 3.865588511769586, + "learning_rate": 4.831103949363103e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.4375, + "logps/chosen": -486.0, + "logps/rejected": -576.0, + "loss": 0.1176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.1875, + "rewards/margins": 7.9375, + 
"rewards/rejected": -16.125, + "step": 3130 + }, + { + "epoch": 1.6431187859759289, + "grad_norm": 1.1087995031281264, + "learning_rate": 4.697001195553366e-06, + "logits/chosen": -13.3125, + "logits/rejected": -12.875, + "logps/chosen": -476.0, + "logps/rejected": -628.0, + "loss": 0.0754, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.0625, + "rewards/margins": 8.5, + "rewards/rejected": -17.5, + "step": 3140 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 9.607813501681845, + "learning_rate": 4.564592750528271e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.75, + "logps/chosen": -506.0, + "logps/rejected": -584.0, + "loss": 0.1084, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.9375, + "rewards/margins": 7.5625, + "rewards/rejected": -16.5, + "step": 3150 + }, + { + "epoch": 1.653584510727368, + "grad_norm": 2.166356498800773, + "learning_rate": 4.4338896639260276e-06, + "logits/chosen": -13.4375, + "logits/rejected": -13.0, + "logps/chosen": -508.0, + "logps/rejected": -632.0, + "loss": 0.0882, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.375, + "rewards/margins": 8.0, + "rewards/rejected": -16.375, + "step": 3160 + }, + { + "epoch": 1.6588173731030875, + "grad_norm": 1.3202863612431788, + "learning_rate": 4.304902843070701e-06, + "logits/chosen": -13.375, + "logits/rejected": -12.75, + "logps/chosen": -494.0, + "logps/rejected": -580.0, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.5, + "rewards/margins": 7.5625, + "rewards/rejected": -16.0, + "step": 3170 + }, + { + "epoch": 1.664050235478807, + "grad_norm": 6.022229933522954, + "learning_rate": 4.177643052062039e-06, + "logits/chosen": -13.375, + "logits/rejected": -12.75, + "logps/chosen": -510.0, + "logps/rejected": -656.0, + "loss": 0.0964, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.9375, + "rewards/margins": 8.0, + "rewards/rejected": -16.875, + "step": 3180 + }, + { + "epoch": 1.6692830978545263, + "grad_norm": 4.697489562167333, + "learning_rate": 4.0521209108770945e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -506.0, + "logps/rejected": -640.0, + "loss": 0.1347, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.0, + "rewards/margins": 8.5, + "rewards/rejected": -17.5, + "step": 3190 + }, + { + "epoch": 1.674515960230246, + "grad_norm": 7.606755469102715, + "learning_rate": 3.928346894484056e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.9375, + "logps/chosen": -552.0, + "logps/rejected": -600.0, + "loss": 0.1338, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.375, + "rewards/margins": 6.28125, + "rewards/rejected": -15.625, + "step": 3200 + }, + { + "epoch": 1.6797488226059656, + "grad_norm": 6.996228868642036, + "learning_rate": 3.8063313319680686e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.25, + "logps/chosen": -544.0, + "logps/rejected": -680.0, + "loss": 0.1061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.0625, + "rewards/margins": 8.6875, + "rewards/rejected": -17.75, + "step": 3210 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 2.1894497750464494, + "learning_rate": 3.686084405669249e-06, + "logits/chosen": -13.4375, + "logits/rejected": -13.0625, + "logps/chosen": -540.0, + "logps/rejected": -712.0, + "loss": 0.1001, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.4375, + "rewards/margins": 10.25, + "rewards/rejected": -18.75, + "step": 3220 + }, + { + "epoch": 1.6902145473574044, + 
"grad_norm": 3.29711389402712, + "learning_rate": 3.567616150332992e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.6875, + "logps/chosen": -504.0, + "logps/rejected": -588.0, + "loss": 0.1136, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.625, + "rewards/margins": 7.40625, + "rewards/rejected": -16.0, + "step": 3230 + }, + { + "epoch": 1.695447409733124, + "grad_norm": 3.5590799179096213, + "learning_rate": 3.450936452272524e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.8125, + "logps/chosen": -490.0, + "logps/rejected": -612.0, + "loss": 0.1291, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.4375, + "rewards/margins": 7.8125, + "rewards/rejected": -16.25, + "step": 3240 + }, + { + "epoch": 1.7006802721088436, + "grad_norm": 2.1632084987538476, + "learning_rate": 3.3360550485439067e-06, + "logits/chosen": -13.5625, + "logits/rejected": -13.0625, + "logps/chosen": -454.0, + "logps/rejected": -636.0, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.71875, + "rewards/margins": 9.375, + "rewards/rejected": -17.125, + "step": 3250 + }, + { + "epoch": 1.705913134484563, + "grad_norm": 6.079477309011048, + "learning_rate": 3.222981526133434e-06, + "logits/chosen": -13.1875, + "logits/rejected": -12.9375, + "logps/chosen": -486.0, + "logps/rejected": -600.0, + "loss": 0.1236, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.0, + "rewards/margins": 7.5, + "rewards/rejected": -16.5, + "step": 3260 + }, + { + "epoch": 1.7111459968602825, + "grad_norm": 1.5499240661013363, + "learning_rate": 3.111725321157627e-06, + "logits/chosen": -13.4375, + "logits/rejected": -13.1875, + "logps/chosen": -516.0, + "logps/rejected": -764.0, + "loss": 0.0815, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.1875, + "rewards/margins": 12.0625, + "rewards/rejected": -19.25, + "step": 3270 + }, + { + "epoch": 1.716378859236002, + "grad_norm": 3.7623282869444963, + "learning_rate": 3.002295718075762e-06, + "logits/chosen": -13.1875, + "logits/rejected": -12.8125, + "logps/chosen": -520.0, + "logps/rejected": -680.0, + "loss": 0.0881, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.25, + "rewards/margins": 10.0625, + "rewards/rejected": -18.375, + "step": 3280 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 7.519232363887075, + "learning_rate": 2.8947018489150517e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -502.0, + "logps/rejected": -652.0, + "loss": 0.0775, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.6875, + "rewards/margins": 8.5, + "rewards/rejected": -17.125, + "step": 3290 + }, + { + "epoch": 1.7268445839874411, + "grad_norm": 5.31099596157704, + "learning_rate": 2.7889526925085978e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.8125, + "logps/chosen": -516.0, + "logps/rejected": -644.0, + "loss": 0.1053, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.0625, + "rewards/margins": 9.4375, + "rewards/rejected": -17.5, + "step": 3300 + }, + { + "epoch": 1.7320774463631605, + "grad_norm": 7.1196364277612165, + "learning_rate": 2.6850570737460916e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.8125, + "logps/chosen": -482.0, + "logps/rejected": -600.0, + "loss": 0.1152, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -8.125, + "rewards/margins": 8.75, + "rewards/rejected": -16.875, + "step": 3310 + }, + { + "epoch": 1.7373103087388801, + "grad_norm": 2.680324074975716, + 
"learning_rate": 2.5830236628373363e-06, + "logits/chosen": -13.1875, + "logits/rejected": -13.125, + "logps/chosen": -502.0, + "logps/rejected": -676.0, + "loss": 0.0778, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.0, + "rewards/margins": 9.0, + "rewards/rejected": -18.0, + "step": 3320 + }, + { + "epoch": 1.7425431711145998, + "grad_norm": 2.2573623813902244, + "learning_rate": 2.482860974588755e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.5, + "logps/chosen": -532.0, + "logps/rejected": -652.0, + "loss": 0.0792, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.125, + "rewards/margins": 8.9375, + "rewards/rejected": -18.125, + "step": 3330 + }, + { + "epoch": 1.7477760334903192, + "grad_norm": 2.3988175015656794, + "learning_rate": 2.3845773676927863e-06, + "logits/chosen": -13.375, + "logits/rejected": -12.9375, + "logps/chosen": -500.0, + "logps/rejected": -644.0, + "loss": 0.0737, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.0625, + "rewards/margins": 8.625, + "rewards/rejected": -17.75, + "step": 3340 + }, + { + "epoch": 1.7530088958660386, + "grad_norm": 2.499191138157377, + "learning_rate": 2.288181044030341e-06, + "logits/chosen": -13.3125, + "logits/rejected": -12.8125, + "logps/chosen": -454.0, + "logps/rejected": -592.0, + "loss": 0.1102, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.1875, + "rewards/margins": 9.0, + "rewards/rejected": -17.125, + "step": 3350 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 1.7297577696934656, + "learning_rate": 2.193680047986385e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.75, + "logps/chosen": -478.0, + "logps/rejected": -608.0, + "loss": 0.1039, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.3125, + "rewards/margins": 7.375, + "rewards/rejected": -16.625, + "step": 3360 + }, + { + "epoch": 1.7634746206174778, + "grad_norm": 1.3844986076148504, + "learning_rate": 2.1010822657785673e-06, + "logits/chosen": -13.25, + "logits/rejected": -13.0625, + "logps/chosen": -552.0, + "logps/rejected": -624.0, + "loss": 0.0855, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.1875, + "rewards/margins": 8.625, + "rewards/rejected": -16.75, + "step": 3370 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 2.5009715621795063, + "learning_rate": 2.0103954247991525e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.5625, + "logps/chosen": -488.0, + "logps/rejected": -612.0, + "loss": 0.09, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.625, + "rewards/margins": 8.6875, + "rewards/rejected": -17.25, + "step": 3380 + }, + { + "epoch": 1.7739403453689166, + "grad_norm": 1.8945829950156903, + "learning_rate": 1.9216270929701407e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.625, + "logps/chosen": -474.0, + "logps/rejected": -652.0, + "loss": 0.0723, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.75, + "rewards/margins": 8.0625, + "rewards/rejected": -16.75, + "step": 3390 + }, + { + "epoch": 1.7791732077446363, + "grad_norm": 5.069862077253051, + "learning_rate": 1.8347846781117201e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.625, + "logps/chosen": -512.0, + "logps/rejected": -616.0, + "loss": 0.1063, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.8125, + "rewards/margins": 9.0625, + "rewards/rejected": -17.875, + "step": 3400 + }, + { + "epoch": 1.784406070120356, + "grad_norm": 1.3102672583456336, + "learning_rate": 1.7498754273240713e-06, + 
"logits/chosen": -13.0625, + "logits/rejected": -12.5, + "logps/chosen": -516.0, + "logps/rejected": -656.0, + "loss": 0.102, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.375, + "rewards/margins": 9.5, + "rewards/rejected": -17.875, + "step": 3410 + }, + { + "epoch": 1.7896389324960753, + "grad_norm": 5.748063923557189, + "learning_rate": 1.6669064263826028e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -572.0, + "logps/rejected": -608.0, + "loss": 0.1097, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.8125, + "rewards/margins": 8.125, + "rewards/rejected": -17.0, + "step": 3420 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 4.829587618568215, + "learning_rate": 1.5858845991466088e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.875, + "logps/chosen": -472.0, + "logps/rejected": -576.0, + "loss": 0.1007, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.25, + "rewards/margins": 6.65625, + "rewards/rejected": -15.875, + "step": 3430 + }, + { + "epoch": 1.8001046572475143, + "grad_norm": 0.92994359502391, + "learning_rate": 1.5068167069814926e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -560.0, + "logps/rejected": -692.0, + "loss": 0.0844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.3125, + "rewards/margins": 9.9375, + "rewards/rejected": -18.25, + "step": 3440 + }, + { + "epoch": 1.805337519623234, + "grad_norm": 2.1999437018442163, + "learning_rate": 1.4297093481945106e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.75, + "logps/chosen": -506.0, + "logps/rejected": -608.0, + "loss": 0.0948, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.125, + "rewards/margins": 7.59375, + "rewards/rejected": -16.75, + "step": 3450 + }, + { + "epoch": 1.8105703819989536, + "grad_norm": 4.960781699983769, + "learning_rate": 1.3545689574841342e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.5625, + "logps/chosen": -544.0, + "logps/rejected": -652.0, + "loss": 0.0968, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.0625, + "rewards/margins": 8.3125, + "rewards/rejected": -17.375, + "step": 3460 + }, + { + "epoch": 1.815803244374673, + "grad_norm": 3.66785793431449, + "learning_rate": 1.2814018054030623e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.75, + "logps/chosen": -520.0, + "logps/rejected": -624.0, + "loss": 0.0943, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.875, + "rewards/margins": 8.8125, + "rewards/rejected": -17.75, + "step": 3470 + }, + { + "epoch": 1.8210361067503924, + "grad_norm": 4.803515943576416, + "learning_rate": 1.2102139978349497e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.25, + "logps/chosen": -560.0, + "logps/rejected": -692.0, + "loss": 0.1218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.625, + "rewards/margins": 9.0, + "rewards/rejected": -18.625, + "step": 3480 + }, + { + "epoch": 1.826268969126112, + "grad_norm": 1.3014751486404437, + "learning_rate": 1.14101147548486e-06, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -528.0, + "logps/rejected": -640.0, + "loss": 0.0868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.5625, + "rewards/margins": 9.3125, + "rewards/rejected": -17.875, + "step": 3490 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 7.2868142451084, + "learning_rate": 1.0738000133834969e-06, + "logits/chosen": -13.4375, + 
"logits/rejected": -12.9375, + "logps/chosen": -532.0, + "logps/rejected": -616.0, + "loss": 0.1025, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.75, + "rewards/margins": 7.0625, + "rewards/rejected": -16.75, + "step": 3500 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 2.182109722434071, + "learning_rate": 1.008585220405278e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -462.0, + "logps/rejected": -612.0, + "loss": 0.0964, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.5, + "rewards/margins": 6.96875, + "rewards/rejected": -16.5, + "step": 3510 + }, + { + "epoch": 1.8419675562532705, + "grad_norm": 3.0642067374107125, + "learning_rate": 9.453725388002821e-07, + "logits/chosen": -13.25, + "logits/rejected": -13.0, + "logps/chosen": -540.0, + "logps/rejected": -680.0, + "loss": 0.1071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.0, + "rewards/margins": 10.0, + "rewards/rejected": -19.0, + "step": 3520 + }, + { + "epoch": 1.84720041862899, + "grad_norm": 2.211282968905268, + "learning_rate": 8.841672437400528e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.4375, + "logps/chosen": -496.0, + "logps/rejected": -644.0, + "loss": 0.0976, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.875, + "rewards/margins": 8.25, + "rewards/rejected": -17.125, + "step": 3530 + }, + { + "epoch": 1.8524332810047097, + "grad_norm": 2.306667763441843, + "learning_rate": 8.249744428774103e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -576.0, + "logps/rejected": -652.0, + "loss": 0.0807, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.75, + "rewards/margins": 9.125, + "rewards/rejected": -17.875, + "step": 3540 + }, + { + "epoch": 1.8576661433804291, + "grad_norm": 5.435125332604937, + "learning_rate": 7.677990759202086e-07, + "logits/chosen": -12.6875, + "logits/rejected": -11.9375, + "logps/chosen": -572.0, + "logps/rejected": -708.0, + "loss": 0.0971, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.875, + "rewards/margins": 9.375, + "rewards/rejected": -18.25, + "step": 3550 + }, + { + "epoch": 1.8628990057561485, + "grad_norm": 3.3348576248427326, + "learning_rate": 7.126459142190844e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.25, + "logps/chosen": -532.0, + "logps/rejected": -688.0, + "loss": 0.1057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.125, + "rewards/margins": 9.0, + "rewards/rejected": -18.125, + "step": 3560 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 2.5808442085804644, + "learning_rate": 6.595195603693205e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -488.0, + "logps/rejected": -604.0, + "loss": 0.066, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.75, + "rewards/margins": 7.1875, + "rewards/rejected": -15.9375, + "step": 3570 + }, + { + "epoch": 1.8733647305075878, + "grad_norm": 2.7672202913255366, + "learning_rate": 6.084244478267248e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.6875, + "logps/chosen": -520.0, + "logps/rejected": -628.0, + "loss": 0.1047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -8.75, + "rewards/margins": 8.5, + "rewards/rejected": -17.25, + "step": 3580 + }, + { + "epoch": 1.8785975928833072, + "grad_norm": 2.02922718371571, + "learning_rate": 5.593648405376711e-07, + "logits/chosen": -12.9375, + "logits/rejected": -12.6875, + "logps/chosen": -524.0, + 
"logps/rejected": -660.0, + "loss": 0.0819, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5, + "rewards/margins": 9.0, + "rewards/rejected": -18.5, + "step": 3590 + }, + { + "epoch": 1.8838304552590266, + "grad_norm": 2.046652559740739, + "learning_rate": 5.123448325832475e-07, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -472.0, + "logps/rejected": -580.0, + "loss": 0.1085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.1875, + "rewards/margins": 7.53125, + "rewards/rejected": -16.75, + "step": 3600 + }, + { + "epoch": 1.8890633176347462, + "grad_norm": 5.589958818337675, + "learning_rate": 4.6736834783762397e-07, + "logits/chosen": -12.5, + "logits/rejected": -12.0625, + "logps/chosen": -484.0, + "logps/rejected": -612.0, + "loss": 0.1086, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.25, + "rewards/margins": 8.0625, + "rewards/rejected": -17.25, + "step": 3610 + }, + { + "epoch": 1.8942961800104658, + "grad_norm": 2.1397763501401443, + "learning_rate": 4.24439139640595e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -476.0, + "logps/rejected": -612.0, + "loss": 0.1402, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -8.8125, + "rewards/margins": 7.75, + "rewards/rejected": -16.5, + "step": 3620 + }, + { + "epoch": 1.8995290423861853, + "grad_norm": 4.234807002383162, + "learning_rate": 3.835607904843358e-07, + "logits/chosen": -13.25, + "logits/rejected": -12.875, + "logps/chosen": -496.0, + "logps/rejected": -640.0, + "loss": 0.099, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.625, + "rewards/margins": 7.4375, + "rewards/rejected": -17.125, + "step": 3630 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 5.447862432508138, + "learning_rate": 3.4473671171447174e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.4375, + "logps/chosen": -488.0, + "logps/rejected": -596.0, + "loss": 0.0801, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.5625, + "rewards/margins": 8.875, + "rewards/rejected": -17.375, + "step": 3640 + }, + { + "epoch": 1.9099947671376243, + "grad_norm": 3.7446251130949766, + "learning_rate": 3.079701432453841e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -460.0, + "logps/rejected": -676.0, + "loss": 0.0963, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.9375, + "rewards/margins": 8.5625, + "rewards/rejected": -17.5, + "step": 3650 + }, + { + "epoch": 1.915227629513344, + "grad_norm": 3.968745889768715, + "learning_rate": 2.7326415328982056e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.25, + "logps/chosen": -502.0, + "logps/rejected": -616.0, + "loss": 0.1276, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.125, + "rewards/margins": 6.875, + "rewards/rejected": -17.0, + "step": 3660 + }, + { + "epoch": 1.9204604918890633, + "grad_norm": 4.006875799984734, + "learning_rate": 2.4062163810288365e-07, + "logits/chosen": -13.0625, + "logits/rejected": -12.6875, + "logps/chosen": -536.0, + "logps/rejected": -632.0, + "loss": 0.0867, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.875, + "rewards/margins": 8.4375, + "rewards/rejected": -17.25, + "step": 3670 + }, + { + "epoch": 1.9256933542647827, + "grad_norm": 5.953542104090957, + "learning_rate": 2.100453217402959e-07, + "logits/chosen": -13.0625, + "logits/rejected": -12.5625, + "logps/chosen": -560.0, + "logps/rejected": -660.0, + "loss": 0.1176, + "rewards/accuracies": 
0.925000011920929, + "rewards/chosen": -9.625, + "rewards/margins": 8.125, + "rewards/rejected": -17.75, + "step": 3680 + }, + { + "epoch": 1.9309262166405023, + "grad_norm": 5.239490240155921, + "learning_rate": 1.8153775583110156e-07, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -458.0, + "logps/rejected": -604.0, + "loss": 0.0844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.625, + "rewards/margins": 8.0, + "rewards/rejected": -16.625, + "step": 3690 + }, + { + "epoch": 1.936159079016222, + "grad_norm": 4.377024347096592, + "learning_rate": 1.5510131936472273e-07, + "logits/chosen": -13.3125, + "logits/rejected": -12.625, + "logps/chosen": -544.0, + "logps/rejected": -672.0, + "loss": 0.0835, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -8.875, + "rewards/margins": 9.3125, + "rewards/rejected": -18.25, + "step": 3700 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 1.5159821172630095, + "learning_rate": 1.307382184924266e-07, + "logits/chosen": -13.375, + "logits/rejected": -12.8125, + "logps/chosen": -478.0, + "logps/rejected": -600.0, + "loss": 0.0899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.25, + "rewards/margins": 7.40625, + "rewards/rejected": -16.625, + "step": 3710 + }, + { + "epoch": 1.9466248037676608, + "grad_norm": 4.469755228752353, + "learning_rate": 1.0845048634321731e-07, + "logits/chosen": -12.5625, + "logits/rejected": -12.25, + "logps/chosen": -516.0, + "logps/rejected": -588.0, + "loss": 0.1217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.8125, + "rewards/margins": 7.78125, + "rewards/rejected": -16.625, + "step": 3720 + }, + { + "epoch": 1.9518576661433804, + "grad_norm": 3.3123314260717414, + "learning_rate": 8.823998285418522e-08, + "logits/chosen": -12.9375, + "logits/rejected": -12.625, + "logps/chosen": -544.0, + "logps/rejected": -648.0, + "loss": 0.086, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.5625, + "rewards/margins": 9.125, + "rewards/rejected": -17.75, + "step": 3730 + }, + { + "epoch": 1.9570905285191, + "grad_norm": 2.731361823113672, + "learning_rate": 7.010839461526752e-08, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -544.0, + "logps/rejected": -680.0, + "loss": 0.0895, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.0, + "rewards/margins": 10.4375, + "rewards/rejected": -18.375, + "step": 3740 + }, + { + "epoch": 1.9623233908948194, + "grad_norm": 3.243299191382827, + "learning_rate": 5.4057234728521756e-08, + "logits/chosen": -12.9375, + "logits/rejected": -12.75, + "logps/chosen": -498.0, + "logps/rejected": -620.0, + "loss": 0.1018, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.625, + "rewards/margins": 8.1875, + "rewards/rejected": -16.75, + "step": 3750 + }, + { + "epoch": 1.9675562532705388, + "grad_norm": 5.643560413836883, + "learning_rate": 4.0087842681846286e-08, + "logits/chosen": -12.25, + "logits/rejected": -11.9375, + "logps/chosen": -528.0, + "logps/rejected": -604.0, + "loss": 0.1314, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.4375, + "rewards/margins": 7.6875, + "rewards/rejected": -17.125, + "step": 3760 + }, + { + "epoch": 1.9727891156462585, + "grad_norm": 1.3002574150876818, + "learning_rate": 2.820138423720309e-08, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -476.0, + "logps/rejected": -596.0, + "loss": 0.0943, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 
-9.125, + "rewards/margins": 7.78125, + "rewards/rejected": -16.875, + "step": 3770 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 5.260548583290011, + "learning_rate": 1.839885133332053e-08, + "logits/chosen": -12.8125, + "logits/rejected": -12.25, + "logps/chosen": -544.0, + "logps/rejected": -696.0, + "loss": 0.1208, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.125, + "rewards/margins": 9.875, + "rewards/rejected": -19.0, + "step": 3780 + }, + { + "epoch": 1.9832548403976975, + "grad_norm": 5.112452952032154, + "learning_rate": 1.0681062002940167e-08, + "logits/chosen": -13.0625, + "logits/rejected": -12.75, + "logps/chosen": -548.0, + "logps/rejected": -684.0, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.4375, + "rewards/margins": 9.5, + "rewards/rejected": -18.0, + "step": 3790 + }, + { + "epoch": 1.988487702773417, + "grad_norm": 0.9856562812739469, + "learning_rate": 5.048660304524111e-09, + "logits/chosen": -12.875, + "logits/rejected": -12.4375, + "logps/chosen": -512.0, + "logps/rejected": -660.0, + "loss": 0.1136, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.5, + "rewards/margins": 8.8125, + "rewards/rejected": -18.25, + "step": 3800 + }, + { + "epoch": 1.9937205651491365, + "grad_norm": 1.084985961639873, + "learning_rate": 1.502116268523035e-09, + "logits/chosen": -13.3125, + "logits/rejected": -13.0625, + "logps/chosen": -536.0, + "logps/rejected": -648.0, + "loss": 0.1045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.75, + "rewards/margins": 9.0, + "rewards/rejected": -17.75, + "step": 3810 + }, + { + "epoch": 1.9989534275248562, + "grad_norm": 1.7332241383915472, + "learning_rate": 4.172585814643526e-11, + "logits/chosen": -13.4375, + "logits/rejected": -12.9375, + "logps/chosen": -508.0, + "logps/rejected": -628.0, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.5625, + "rewards/margins": 9.0625, + "rewards/rejected": -17.625, + "step": 3820 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -13.3125, + "eval_logits/rejected": -13.0625, + "eval_logps/chosen": -556.0, + "eval_logps/rejected": -564.0, + "eval_loss": 0.7793359160423279, + "eval_rewards/accuracies": 0.72265625, + "eval_rewards/chosen": -11.75, + "eval_rewards/margins": 1.9140625, + "eval_rewards/rejected": -13.6875, + "eval_runtime": 46.77, + "eval_samples_per_second": 42.762, + "eval_steps_per_second": 0.684, + "step": 3822 + }, + { + "epoch": 2.0, + "step": 3822, + "total_flos": 0.0, + "train_loss": 0.41369995401518184, + "train_runtime": 7266.4225, + "train_samples_per_second": 16.826, + "train_steps_per_second": 0.526 + } + ], + "logging_steps": 10, + "max_steps": 3822, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}