{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 10.351622948893763, "learning_rate": 8.710801393728224e-08, "logits/chosen": -12.5625, "logits/rejected": -11.6875, "logps/chosen": -430.0, "logps/rejected": -460.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 9.861680749083718, "learning_rate": 8.710801393728223e-07, "logits/chosen": -11.25, "logits/rejected": -11.3125, "logps/chosen": -364.0, "logps/rejected": -290.0, "loss": 0.6937, "rewards/accuracies": 0.125, "rewards/chosen": 1.609325408935547e-05, "rewards/margins": -0.00677490234375, "rewards/rejected": 0.00677490234375, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 8.879894065080062, "learning_rate": 1.7421602787456445e-06, "logits/chosen": -11.0, "logits/rejected": -11.0625, "logps/chosen": -264.0, "logps/rejected": -256.0, "loss": 0.691, "rewards/accuracies": 0.25, "rewards/chosen": -0.00103759765625, "rewards/margins": 0.00665283203125, "rewards/rejected": -0.0076904296875, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 10.24487310428354, "learning_rate": 2.613240418118467e-06, "logits/chosen": -10.375, "logits/rejected": -10.3125, "logps/chosen": -326.0, "logps/rejected": -318.0, "loss": 0.6854, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.01275634765625, "rewards/margins": 0.02587890625, "rewards/rejected": -0.0130615234375, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 9.307003472676271, "learning_rate": 3.484320557491289e-06, "logits/chosen": -11.3125, "logits/rejected": -10.9375, "logps/chosen": -336.0, "logps/rejected": -310.0, "loss": 0.6801, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.01080322265625, "rewards/margins": 0.0126953125, "rewards/rejected": -0.0235595703125, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 9.716648284939643, "learning_rate": 4.355400696864112e-06, "logits/chosen": -11.75, "logits/rejected": -11.5, "logps/chosen": -314.0, "logps/rejected": -312.0, "loss": 0.6714, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03125, "rewards/margins": 0.05859375, "rewards/rejected": -0.08984375, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 9.029736646123915, "learning_rate": 5.226480836236934e-06, "logits/chosen": -11.6875, "logits/rejected": -11.3125, "logps/chosen": -330.0, "logps/rejected": -320.0, "loss": 0.6424, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.103515625, "rewards/margins": 0.06884765625, "rewards/rejected": -0.171875, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 10.22440855937027, "learning_rate": 6.0975609756097564e-06, "logits/chosen": -12.875, "logits/rejected": -12.75, "logps/chosen": -374.0, "logps/rejected": -332.0, "loss": 0.629, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.369140625, "rewards/margins": 0.189453125, "rewards/rejected": -0.55859375, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 9.387356779175425, "learning_rate": 6.968641114982578e-06, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -326.0, "logps/rejected": -318.0, "loss": 0.6411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5859375, "rewards/margins": 0.201171875, "rewards/rejected": -0.78515625, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 8.674280226009019, "learning_rate": 7.8397212543554e-06, "logits/chosen": -12.1875, "logits/rejected": -11.75, "logps/chosen": -296.0, "logps/rejected": -274.0, "loss": 0.655, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3125, "rewards/margins": 0.17578125, "rewards/rejected": -0.48828125, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 11.202378620141069, "learning_rate": 8.710801393728225e-06, "logits/chosen": -12.1875, "logits/rejected": -11.25, "logps/chosen": -352.0, "logps/rejected": -282.0, "loss": 0.616, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.181640625, "rewards/margins": 0.322265625, "rewards/rejected": -0.50390625, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 8.409262794688129, "learning_rate": 9.581881533101046e-06, "logits/chosen": -11.4375, "logits/rejected": -10.9375, "logps/chosen": -314.0, "logps/rejected": -284.0, "loss": 0.6012, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.30078125, "rewards/margins": 0.41796875, "rewards/rejected": -0.71875, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 10.27018949715906, "learning_rate": 1.0452961672473868e-05, "logits/chosen": -12.0, "logits/rejected": -11.75, "logps/chosen": -420.0, "logps/rejected": -374.0, "loss": 0.6532, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.494140625, "rewards/margins": 0.2578125, "rewards/rejected": -0.75, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 8.585428960670738, "learning_rate": 1.132404181184669e-05, "logits/chosen": -12.125, "logits/rejected": -11.25, "logps/chosen": -280.0, "logps/rejected": -286.0, "loss": 0.6281, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5390625, "rewards/margins": 0.29296875, "rewards/rejected": -0.83203125, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 11.197641077169887, "learning_rate": 1.2195121951219513e-05, "logits/chosen": -12.8125, "logits/rejected": -12.625, "logps/chosen": -356.0, "logps/rejected": -334.0, "loss": 0.5618, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8515625, "rewards/margins": 0.5546875, "rewards/rejected": -1.40625, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 9.57565914517034, "learning_rate": 1.3066202090592336e-05, "logits/chosen": -13.4375, "logits/rejected": -13.3125, "logps/chosen": -352.0, "logps/rejected": -320.0, "loss": 0.6211, "rewards/accuracies": 0.625, "rewards/chosen": -1.2421875, "rewards/margins": 0.48046875, "rewards/rejected": -1.71875, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 10.73566421003925, "learning_rate": 1.3937282229965156e-05, "logits/chosen": -13.375, "logits/rejected": -13.625, "logps/chosen": -368.0, "logps/rejected": -322.0, "loss": 0.626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.359375, "rewards/margins": 0.63671875, "rewards/rejected": -2.0, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 8.935497274328549, "learning_rate": 1.4808362369337981e-05, "logits/chosen": -13.5625, "logits/rejected": -13.375, "logps/chosen": -348.0, "logps/rejected": -354.0, "loss": 0.6129, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8046875, "rewards/margins": 0.09375, "rewards/rejected": -1.8984375, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 6.705738405802169, "learning_rate": 1.56794425087108e-05, "logits/chosen": -12.1875, "logits/rejected": -12.1875, "logps/chosen": -306.0, "logps/rejected": -320.0, "loss": 0.6128, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6328125, "rewards/margins": 0.3046875, "rewards/rejected": -1.9375, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 9.341880358696546, "learning_rate": 1.6550522648083624e-05, "logits/chosen": -12.25, "logits/rejected": -12.25, "logps/chosen": -416.0, "logps/rejected": -352.0, "loss": 0.5737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6953125, "rewards/margins": 0.53125, "rewards/rejected": -2.21875, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 8.99823436974031, "learning_rate": 1.742160278745645e-05, "logits/chosen": -13.0, "logits/rejected": -12.8125, "logps/chosen": -362.0, "logps/rejected": -358.0, "loss": 0.6059, "rewards/accuracies": 0.5625, "rewards/chosen": -1.765625, "rewards/margins": 0.361328125, "rewards/rejected": -2.125, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 8.121852231192932, "learning_rate": 1.8292682926829268e-05, "logits/chosen": -12.6875, "logits/rejected": -12.75, "logps/chosen": -310.0, "logps/rejected": -288.0, "loss": 0.6293, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5078125, "rewards/margins": 0.296875, "rewards/rejected": -1.796875, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 8.331199806773945, "learning_rate": 1.9163763066202093e-05, "logits/chosen": -13.25, "logits/rejected": -12.6875, "logps/chosen": -330.0, "logps/rejected": -336.0, "loss": 0.5854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7109375, "rewards/margins": 0.49609375, "rewards/rejected": -2.203125, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 9.385440280022902, "learning_rate": 2.0034843205574914e-05, "logits/chosen": -12.9375, "logits/rejected": -12.625, "logps/chosen": -378.0, "logps/rejected": -412.0, "loss": 0.5854, "rewards/accuracies": 0.75, "rewards/chosen": -1.9140625, "rewards/margins": 0.53515625, "rewards/rejected": -2.453125, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 7.856024282920787, "learning_rate": 2.0905923344947736e-05, "logits/chosen": -11.75, "logits/rejected": -11.0625, "logps/chosen": -324.0, "logps/rejected": -332.0, "loss": 0.6277, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7890625, "rewards/margins": 0.6328125, "rewards/rejected": -2.421875, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 10.33844224138613, "learning_rate": 2.1777003484320557e-05, "logits/chosen": -11.5, "logits/rejected": -11.0625, "logps/chosen": -376.0, "logps/rejected": -340.0, "loss": 0.6369, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5234375, "rewards/margins": 0.58203125, "rewards/rejected": -2.109375, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 10.80456220320549, "learning_rate": 2.264808362369338e-05, "logits/chosen": -11.4375, "logits/rejected": -10.9375, "logps/chosen": -356.0, "logps/rejected": -328.0, "loss": 0.6055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.53125, "rewards/margins": 0.68359375, "rewards/rejected": -2.21875, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 7.153194193188069, "learning_rate": 2.3519163763066204e-05, "logits/chosen": -11.0625, "logits/rejected": -10.875, "logps/chosen": -350.0, "logps/rejected": -346.0, "loss": 0.5423, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1015625, "rewards/margins": 0.60546875, "rewards/rejected": -1.703125, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 10.212050819236445, "learning_rate": 2.4390243902439026e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5, "logps/chosen": -384.0, "logps/rejected": -360.0, "loss": 0.6549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.484375, "rewards/margins": 0.609375, "rewards/rejected": -2.09375, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 8.60794177402858, "learning_rate": 2.5261324041811847e-05, "logits/chosen": -13.5, "logits/rejected": -13.0625, "logps/chosen": -368.0, "logps/rejected": -322.0, "loss": 0.6284, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2734375, "rewards/margins": 0.65625, "rewards/rejected": -1.9296875, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 8.37696998776307, "learning_rate": 2.6132404181184672e-05, "logits/chosen": -14.25, "logits/rejected": -13.875, "logps/chosen": -364.0, "logps/rejected": -330.0, "loss": 0.618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4921875, "rewards/margins": 0.59765625, "rewards/rejected": -2.09375, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 8.375659905100978, "learning_rate": 2.7003484320557494e-05, "logits/chosen": -13.1875, "logits/rejected": -12.875, "logps/chosen": -374.0, "logps/rejected": -334.0, "loss": 0.6298, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8828125, "rewards/margins": 0.5625, "rewards/rejected": -2.4375, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 10.402832073203621, "learning_rate": 2.7874564459930312e-05, "logits/chosen": -14.1875, "logits/rejected": -13.75, "logps/chosen": -416.0, "logps/rejected": -342.0, "loss": 0.5864, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.21875, "rewards/margins": 0.578125, "rewards/rejected": -2.796875, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 12.297870357791856, "learning_rate": 2.874564459930314e-05, "logits/chosen": -13.6875, "logits/rejected": -13.4375, "logps/chosen": -368.0, "logps/rejected": -332.0, "loss": 0.7922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.421875, "rewards/margins": 0.859375, "rewards/rejected": -3.28125, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 9.64969634838185, "learning_rate": 2.9616724738675962e-05, "logits/chosen": -13.5, "logits/rejected": -13.125, "logps/chosen": -374.0, "logps/rejected": -350.0, "loss": 0.6487, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.59375, "rewards/margins": 0.7265625, "rewards/rejected": -3.328125, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 19.1161905223819, "learning_rate": 3.048780487804878e-05, "logits/chosen": -13.3125, "logits/rejected": -13.125, "logps/chosen": -292.0, "logps/rejected": -316.0, "loss": 0.6017, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.515625, "rewards/margins": 0.78125, "rewards/rejected": -3.296875, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 12.072342717412461, "learning_rate": 3.13588850174216e-05, "logits/chosen": -13.75, "logits/rejected": -13.4375, "logps/chosen": -390.0, "logps/rejected": -370.0, "loss": 0.643, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.765625, "rewards/margins": 0.640625, "rewards/rejected": -3.40625, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 8.321040961655044, "learning_rate": 3.222996515679443e-05, "logits/chosen": -12.75, "logits/rejected": -12.5625, "logps/chosen": -344.0, "logps/rejected": -326.0, "loss": 0.7638, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6875, "rewards/margins": 0.474609375, "rewards/rejected": -3.15625, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 7.6849275188322395, "learning_rate": 3.310104529616725e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -312.0, "logps/rejected": -348.0, "loss": 0.8002, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.015625, "rewards/margins": 0.6015625, "rewards/rejected": -2.609375, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 9.09533864031769, "learning_rate": 3.397212543554007e-05, "logits/chosen": -12.625, "logits/rejected": -12.25, "logps/chosen": -410.0, "logps/rejected": -360.0, "loss": 0.6596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.34375, "rewards/margins": 0.96875, "rewards/rejected": -3.3125, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 10.474251698898403, "learning_rate": 3.48432055749129e-05, "logits/chosen": -13.6875, "logits/rejected": -13.375, "logps/chosen": -434.0, "logps/rejected": -364.0, "loss": 0.6416, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.375, "rewards/margins": 0.6875, "rewards/rejected": -4.0625, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 9.570607335225267, "learning_rate": 3.571428571428572e-05, "logits/chosen": -11.5, "logits/rejected": -11.3125, "logps/chosen": -414.0, "logps/rejected": -408.0, "loss": 0.7146, "rewards/accuracies": 0.625, "rewards/chosen": -3.15625, "rewards/margins": 0.416015625, "rewards/rejected": -3.578125, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 10.413809913164531, "learning_rate": 3.6585365853658535e-05, "logits/chosen": -11.3125, "logits/rejected": -10.9375, "logps/chosen": -380.0, "logps/rejected": -358.0, "loss": 0.6777, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.0625, "rewards/margins": 0.25390625, "rewards/rejected": -3.3125, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 9.443240821693522, "learning_rate": 3.745644599303136e-05, "logits/chosen": -11.6875, "logits/rejected": -11.25, "logps/chosen": -388.0, "logps/rejected": -318.0, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": -3.0, "rewards/margins": 0.6015625, "rewards/rejected": -3.59375, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 9.073494348342924, "learning_rate": 3.8327526132404185e-05, "logits/chosen": -11.5, "logits/rejected": -11.125, "logps/chosen": -416.0, "logps/rejected": -348.0, "loss": 0.6659, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.390625, "rewards/margins": 0.392578125, "rewards/rejected": -3.78125, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 16.732406709817493, "learning_rate": 3.9198606271777003e-05, "logits/chosen": -10.4375, "logits/rejected": -10.3125, "logps/chosen": -352.0, "logps/rejected": -360.0, "loss": 0.7231, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.125, "rewards/margins": 0.40234375, "rewards/rejected": -3.53125, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 9.62863918950795, "learning_rate": 4.006968641114983e-05, "logits/chosen": -10.625, "logits/rejected": -9.9375, "logps/chosen": -472.0, "logps/rejected": -424.0, "loss": 0.6618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.28125, "rewards/margins": 0.73828125, "rewards/rejected": -5.03125, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 13.659583611405836, "learning_rate": 4.0940766550522653e-05, "logits/chosen": -8.1875, "logits/rejected": -7.875, "logps/chosen": -444.0, "logps/rejected": -428.0, "loss": 0.7856, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.5625, "rewards/margins": 0.609375, "rewards/rejected": -6.15625, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 11.696489303834198, "learning_rate": 4.181184668989547e-05, "logits/chosen": -7.65625, "logits/rejected": -7.3125, "logps/chosen": -380.0, "logps/rejected": -368.0, "loss": 0.6726, "rewards/accuracies": 0.6875, "rewards/chosen": -4.03125, "rewards/margins": 0.609375, "rewards/rejected": -4.65625, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 9.07444220090164, "learning_rate": 4.26829268292683e-05, "logits/chosen": -8.9375, "logits/rejected": -8.375, "logps/chosen": -424.0, "logps/rejected": -392.0, "loss": 0.6257, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.125, "rewards/margins": 0.58203125, "rewards/rejected": -4.6875, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 6.290512673617353, "learning_rate": 4.3554006968641115e-05, "logits/chosen": -8.0625, "logits/rejected": -7.84375, "logps/chosen": -402.0, "logps/rejected": -386.0, "loss": 0.6275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.5, "rewards/margins": 0.59765625, "rewards/rejected": -5.09375, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 8.840142113315412, "learning_rate": 4.442508710801394e-05, "logits/chosen": -9.0, "logits/rejected": -7.96875, "logps/chosen": -408.0, "logps/rejected": -378.0, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -4.5, "rewards/margins": 0.447265625, "rewards/rejected": -4.9375, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 7.427335145965296, "learning_rate": 4.529616724738676e-05, "logits/chosen": -9.625, "logits/rejected": -9.4375, "logps/chosen": -442.0, "logps/rejected": -448.0, "loss": 0.7685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.21875, "rewards/margins": 0.435546875, "rewards/rejected": -4.65625, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 6.490058212952471, "learning_rate": 4.616724738675958e-05, "logits/chosen": -8.625, "logits/rejected": -8.1875, "logps/chosen": -460.0, "logps/rejected": -384.0, "loss": 0.7412, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.75, "rewards/margins": 0.62890625, "rewards/rejected": -5.375, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 12.243471295986348, "learning_rate": 4.703832752613241e-05, "logits/chosen": -8.375, "logits/rejected": -7.6875, "logps/chosen": -458.0, "logps/rejected": -432.0, "loss": 0.6623, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.4375, "rewards/margins": 1.0, "rewards/rejected": -6.4375, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 8.28734090693043, "learning_rate": 4.7909407665505226e-05, "logits/chosen": -8.25, "logits/rejected": -7.375, "logps/chosen": -460.0, "logps/rejected": -432.0, "loss": 0.8084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.53125, "rewards/margins": 0.455078125, "rewards/rejected": -5.0, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 8.12376561727762, "learning_rate": 4.878048780487805e-05, "logits/chosen": -7.53125, "logits/rejected": -7.125, "logps/chosen": -416.0, "logps/rejected": -404.0, "loss": 0.7299, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.21875, "rewards/margins": 0.5234375, "rewards/rejected": -4.75, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 12.391849545439836, "learning_rate": 4.965156794425087e-05, "logits/chosen": -8.5, "logits/rejected": -7.5625, "logps/chosen": -440.0, "logps/rejected": -394.0, "loss": 0.8111, "rewards/accuracies": 0.5625, "rewards/chosen": -5.03125, "rewards/margins": -0.00567626953125, "rewards/rejected": -5.0, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 10.880713710599428, "learning_rate": 4.999983312905697e-05, "logits/chosen": -7.40625, "logits/rejected": -6.4375, "logps/chosen": -408.0, "logps/rejected": -356.0, "loss": 0.6451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.0, "rewards/margins": 0.73046875, "rewards/rejected": -5.75, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 19.233804877733014, "learning_rate": 4.9998813370250145e-05, "logits/chosen": -5.1875, "logits/rejected": -4.53125, "logps/chosen": -436.0, "logps/rejected": -418.0, "loss": 0.7623, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.65625, "rewards/margins": 0.5859375, "rewards/rejected": -7.21875, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 6.44253234493605, "learning_rate": 4.999686659648518e-05, "logits/chosen": -5.59375, "logits/rejected": -5.5, "logps/chosen": -466.0, "logps/rejected": -468.0, "loss": 0.7429, "rewards/accuracies": 0.625, "rewards/chosen": -6.375, "rewards/margins": 0.47265625, "rewards/rejected": -6.84375, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 12.099417587710867, "learning_rate": 4.999399287995303e-05, "logits/chosen": -6.875, "logits/rejected": -5.75, "logps/chosen": -390.0, "logps/rejected": -366.0, "loss": 0.7365, "rewards/accuracies": 0.625, "rewards/chosen": -5.0625, "rewards/margins": 0.5546875, "rewards/rejected": -5.625, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 8.715383868761732, "learning_rate": 4.9990192327217914e-05, "logits/chosen": -8.1875, "logits/rejected": -5.96875, "logps/chosen": -520.0, "logps/rejected": -390.0, "loss": 0.7497, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.0, "rewards/margins": 0.88671875, "rewards/rejected": -6.875, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 8.279369051319126, "learning_rate": 4.998546507921325e-05, "logits/chosen": -6.25, "logits/rejected": -5.90625, "logps/chosen": -384.0, "logps/rejected": -406.0, "loss": 0.8175, "rewards/accuracies": 0.625, "rewards/chosen": -6.25, "rewards/margins": 0.388671875, "rewards/rejected": -6.65625, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 9.292602707728573, "learning_rate": 4.997981131123657e-05, "logits/chosen": -7.15625, "logits/rejected": -6.53125, "logps/chosen": -444.0, "logps/rejected": -418.0, "loss": 0.7555, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.71875, "rewards/margins": 0.62109375, "rewards/rejected": -6.34375, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 7.05082008534396, "learning_rate": 4.9973231232942906e-05, "logits/chosen": -7.21875, "logits/rejected": -6.625, "logps/chosen": -434.0, "logps/rejected": -398.0, "loss": 0.6927, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.15625, "rewards/margins": 0.578125, "rewards/rejected": -5.71875, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 9.395818298567765, "learning_rate": 4.9965725088337103e-05, "logits/chosen": -5.9375, "logits/rejected": -5.21875, "logps/chosen": -378.0, "logps/rejected": -368.0, "loss": 0.7448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.34375, "rewards/margins": 0.62109375, "rewards/rejected": -5.96875, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 11.966195250971206, "learning_rate": 4.995729315576468e-05, "logits/chosen": -6.25, "logits/rejected": -4.84375, "logps/chosen": -440.0, "logps/rejected": -416.0, "loss": 0.6844, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.5, "rewards/margins": 0.66015625, "rewards/rejected": -7.15625, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 8.22871148505434, "learning_rate": 4.994793574790161e-05, "logits/chosen": -6.125, "logits/rejected": -5.5, "logps/chosen": -448.0, "logps/rejected": -410.0, "loss": 0.6867, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.625, "rewards/margins": 0.59375, "rewards/rejected": -7.1875, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 9.281604668745217, "learning_rate": 4.993765321174262e-05, "logits/chosen": -3.90625, "logits/rejected": -2.71875, "logps/chosen": -512.0, "logps/rejected": -480.0, "loss": 0.7281, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.25, "rewards/margins": 0.98046875, "rewards/rejected": -9.25, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 9.358944680779736, "learning_rate": 4.992644592858842e-05, "logits/chosen": -6.65625, "logits/rejected": -5.9375, "logps/chosen": -482.0, "logps/rejected": -422.0, "loss": 0.7949, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.75, "rewards/margins": 0.27734375, "rewards/rejected": -7.03125, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 14.819279276785178, "learning_rate": 4.9914314314031484e-05, "logits/chosen": -7.03125, "logits/rejected": -6.0, "logps/chosen": -500.0, "logps/rejected": -464.0, "loss": 0.7073, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.15625, "rewards/margins": 0.97265625, "rewards/rejected": -8.125, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 9.493964933081799, "learning_rate": 4.990125881794071e-05, "logits/chosen": -5.5, "logits/rejected": -5.1875, "logps/chosen": -430.0, "logps/rejected": -428.0, "loss": 0.8044, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.96875, "rewards/margins": 0.55859375, "rewards/rejected": -7.53125, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 9.549194684405798, "learning_rate": 4.988727992444467e-05, "logits/chosen": -6.875, "logits/rejected": -6.375, "logps/chosen": -448.0, "logps/rejected": -436.0, "loss": 0.7151, "rewards/accuracies": 0.625, "rewards/chosen": -6.1875, "rewards/margins": 0.69140625, "rewards/rejected": -6.875, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 11.119599516244255, "learning_rate": 4.987237815191371e-05, "logits/chosen": -6.5, "logits/rejected": -5.9375, "logps/chosen": -442.0, "logps/rejected": -404.0, "loss": 0.6481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.875, "rewards/margins": 0.78125, "rewards/rejected": -7.65625, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 29.084911099228044, "learning_rate": 4.9856554052940705e-05, "logits/chosen": -3.890625, "logits/rejected": -3.40625, "logps/chosen": -476.0, "logps/rejected": -496.0, "loss": 0.7091, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.4375, "rewards/margins": 0.86328125, "rewards/rejected": -9.3125, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 11.3604312102586, "learning_rate": 4.983980821432055e-05, "logits/chosen": -3.890625, "logits/rejected": -2.84375, "logps/chosen": -442.0, "logps/rejected": -424.0, "loss": 0.7856, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -7.40625, "rewards/margins": 0.63671875, "rewards/rejected": -8.0, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 7.746503351769591, "learning_rate": 4.982214125702845e-05, "logits/chosen": -4.84375, "logits/rejected": -4.65625, "logps/chosen": -482.0, "logps/rejected": -488.0, "loss": 1.0012, "rewards/accuracies": 0.625, "rewards/chosen": -7.375, "rewards/margins": 0.56640625, "rewards/rejected": -7.9375, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 11.766587625625025, "learning_rate": 4.9803553836196845e-05, "logits/chosen": -3.484375, "logits/rejected": -1.890625, "logps/chosen": -474.0, "logps/rejected": -444.0, "loss": 0.6736, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.5, "rewards/margins": 0.578125, "rewards/rejected": -8.0625, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 9.097797617055331, "learning_rate": 4.978404664109113e-05, "logits/chosen": -4.21875, "logits/rejected": -3.21875, "logps/chosen": -426.0, "logps/rejected": -448.0, "loss": 0.7096, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.4375, "rewards/margins": 1.0703125, "rewards/rejected": -8.5, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 11.540849585926436, "learning_rate": 4.976362039508411e-05, "logits/chosen": -8.1875, "logits/rejected": -7.46875, "logps/chosen": -504.0, "logps/rejected": -472.0, "loss": 0.7436, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.84375, "rewards/margins": 0.609375, "rewards/rejected": -7.46875, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 10.390351331388345, "learning_rate": 4.9742275855629164e-05, "logits/chosen": -7.75, "logits/rejected": -7.0, "logps/chosen": -468.0, "logps/rejected": -446.0, "loss": 0.7135, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.6875, "rewards/margins": 0.7890625, "rewards/rejected": -7.5, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 10.670994286644559, "learning_rate": 4.9720013814232146e-05, "logits/chosen": -4.75, "logits/rejected": -3.453125, "logps/chosen": -492.0, "logps/rejected": -454.0, "loss": 0.7545, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.75, "rewards/margins": 0.61328125, "rewards/rejected": -9.3125, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 10.630956069066174, "learning_rate": 4.969683509642207e-05, "logits/chosen": -5.90625, "logits/rejected": -4.96875, "logps/chosen": -456.0, "logps/rejected": -436.0, "loss": 0.8482, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.1875, "rewards/margins": 0.24609375, "rewards/rejected": -8.4375, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 8.101257247067318, "learning_rate": 4.967274056172044e-05, "logits/chosen": -10.875, "logits/rejected": -10.5, "logps/chosen": -520.0, "logps/rejected": -462.0, "loss": 0.7421, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.8125, "rewards/margins": 0.91015625, "rewards/rejected": -7.71875, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 10.944762300383431, "learning_rate": 4.964773110360944e-05, "logits/chosen": -10.75, "logits/rejected": -10.875, "logps/chosen": -456.0, "logps/rejected": -416.0, "loss": 0.8146, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -7.375, "rewards/margins": 0.6171875, "rewards/rejected": -8.0, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 8.913339988594448, "learning_rate": 4.9621807649498764e-05, "logits/chosen": -11.4375, "logits/rejected": -11.125, "logps/chosen": -432.0, "logps/rejected": -472.0, "loss": 0.6892, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.4375, "rewards/margins": 0.8046875, "rewards/rejected": -7.25, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 7.134016274199884, "learning_rate": 4.9594971160691226e-05, "logits/chosen": -10.5625, "logits/rejected": -10.4375, "logps/chosen": -466.0, "logps/rejected": -426.0, "loss": 0.7858, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.84375, "rewards/margins": 0.5390625, "rewards/rejected": -7.40625, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 10.328569383276303, "learning_rate": 4.9567222632347116e-05, "logits/chosen": -11.3125, "logits/rejected": -11.5, "logps/chosen": -520.0, "logps/rejected": -458.0, "loss": 0.6878, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.125, "rewards/margins": 0.828125, "rewards/rejected": -9.0, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 9.62796454911049, "learning_rate": 4.953856309344731e-05, "logits/chosen": -11.625, "logits/rejected": -11.6875, "logps/chosen": -540.0, "logps/rejected": -496.0, "loss": 0.7736, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.875, "rewards/margins": 0.609375, "rewards/rejected": -9.5, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 10.482736030956378, "learning_rate": 4.9508993606755115e-05, "logits/chosen": -10.9375, "logits/rejected": -10.875, "logps/chosen": -460.0, "logps/rejected": -488.0, "loss": 0.8452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.09375, "rewards/margins": 1.1328125, "rewards/rejected": -8.1875, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 12.475233890425232, "learning_rate": 4.947851526877682e-05, "logits/chosen": -12.5625, "logits/rejected": -12.625, "logps/chosen": -424.0, "logps/rejected": -410.0, "loss": 0.8266, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -7.40625, "rewards/margins": 0.734375, "rewards/rejected": -8.125, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 9.454823173352347, "learning_rate": 4.944712920972109e-05, "logits/chosen": -12.4375, "logits/rejected": -12.625, "logps/chosen": -494.0, "logps/rejected": -462.0, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": -7.6875, "rewards/margins": 0.3359375, "rewards/rejected": -8.0, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 18.342099499785732, "learning_rate": 4.9414836593457004e-05, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -488.0, "logps/rejected": -452.0, "loss": 0.6957, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -7.71875, "rewards/margins": 0.8125, "rewards/rejected": -8.5625, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 7.4614916483387335, "learning_rate": 4.938163861747095e-05, "logits/chosen": -13.625, "logits/rejected": -13.875, "logps/chosen": -488.0, "logps/rejected": -446.0, "loss": 0.6317, "rewards/accuracies": 0.625, "rewards/chosen": -7.8125, "rewards/margins": 0.890625, "rewards/rejected": -8.6875, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 10.561175013265123, "learning_rate": 4.934753651282216e-05, "logits/chosen": -13.0625, "logits/rejected": -13.5, "logps/chosen": -468.0, "logps/rejected": -454.0, "loss": 0.7105, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.34375, "rewards/margins": 0.8203125, "rewards/rejected": -8.1875, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 8.854443288097167, "learning_rate": 4.9312531544097107e-05, "logits/chosen": -13.875, "logits/rejected": -14.25, "logps/chosen": -484.0, "logps/rejected": -484.0, "loss": 0.657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.5625, "rewards/margins": 1.4296875, "rewards/rejected": -10.0, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 14.652422394907061, "learning_rate": 4.92766250093626e-05, "logits/chosen": -13.0625, "logits/rejected": -13.625, "logps/chosen": -552.0, "logps/rejected": -488.0, "loss": 0.7733, "rewards/accuracies": 0.6875, "rewards/chosen": -9.1875, "rewards/margins": 1.4453125, "rewards/rejected": -10.625, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 7.681622448151709, "learning_rate": 4.923981824011761e-05, "logits/chosen": -12.0, "logits/rejected": -12.625, "logps/chosen": -544.0, "logps/rejected": -500.0, "loss": 0.6913, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.375, "rewards/margins": 1.5546875, "rewards/rejected": -9.875, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 9.667593919000552, "learning_rate": 4.9202112601243956e-05, "logits/chosen": -12.875, "logits/rejected": -13.5, "logps/chosen": -480.0, "logps/rejected": -440.0, "loss": 0.7261, "rewards/accuracies": 0.6875, "rewards/chosen": -8.375, "rewards/margins": 0.8671875, "rewards/rejected": -9.25, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 9.808142730894566, "learning_rate": 4.916350949095566e-05, "logits/chosen": -14.375, "logits/rejected": -14.625, "logps/chosen": -472.0, "logps/rejected": -452.0, "loss": 0.7067, "rewards/accuracies": 0.625, "rewards/chosen": -8.8125, "rewards/margins": 0.78125, "rewards/rejected": -9.625, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 9.81071285242805, "learning_rate": 4.9124010340747084e-05, "logits/chosen": -14.25, "logits/rejected": -14.3125, "logps/chosen": -512.0, "logps/rejected": -516.0, "loss": 0.7869, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.4375, "rewards/margins": 0.8359375, "rewards/rejected": -9.25, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 9.281663389122608, "learning_rate": 4.908361661533989e-05, "logits/chosen": -14.1875, "logits/rejected": -14.5625, "logps/chosen": -510.0, "logps/rejected": -474.0, "loss": 0.7217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.875, "rewards/margins": 1.4140625, "rewards/rejected": -10.25, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 6.703363126668062, "learning_rate": 4.904232981262866e-05, "logits/chosen": -13.75, "logits/rejected": -14.0625, "logps/chosen": -528.0, "logps/rejected": -464.0, "loss": 0.7286, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -9.4375, "rewards/margins": 0.65234375, "rewards/rejected": -10.0625, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 7.751957354860659, "learning_rate": 4.900015146362544e-05, "logits/chosen": -12.8125, "logits/rejected": -12.6875, "logps/chosen": -478.0, "logps/rejected": -508.0, "loss": 0.7434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.5625, "rewards/margins": 1.0078125, "rewards/rejected": -9.5625, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 9.518426255540042, "learning_rate": 4.895708313240286e-05, "logits/chosen": -12.5, "logits/rejected": -12.5625, "logps/chosen": -476.0, "logps/rejected": -478.0, "loss": 0.9173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.0, "rewards/margins": 0.8359375, "rewards/rejected": -8.875, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 9.615827437190525, "learning_rate": 4.891312641603623e-05, "logits/chosen": -12.4375, "logits/rejected": -12.875, "logps/chosen": -492.0, "logps/rejected": -460.0, "loss": 0.7663, "rewards/accuracies": 0.6875, "rewards/chosen": -8.25, "rewards/margins": 0.6953125, "rewards/rejected": -8.9375, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 7.981258184209591, "learning_rate": 4.8868282944544266e-05, "logits/chosen": -11.75, "logits/rejected": -11.8125, "logps/chosen": -516.0, "logps/rejected": -482.0, "loss": 0.6175, "rewards/accuracies": 0.6875, "rewards/chosen": -7.28125, "rewards/margins": 0.8984375, "rewards/rejected": -8.1875, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 9.025058114682, "learning_rate": 4.882255438082863e-05, "logits/chosen": -11.4375, "logits/rejected": -11.9375, "logps/chosen": -508.0, "logps/rejected": -472.0, "loss": 0.7627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.0, "rewards/margins": 0.9375, "rewards/rejected": -9.875, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 15.956761347357418, "learning_rate": 4.877594242061234e-05, "logits/chosen": -11.375, "logits/rejected": -11.875, "logps/chosen": -540.0, "logps/rejected": -466.0, "loss": 0.9185, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.0625, "rewards/margins": 0.76953125, "rewards/rejected": -10.8125, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 11.243288973766504, "learning_rate": 4.87284487923768e-05, "logits/chosen": -11.75, "logits/rejected": -11.875, "logps/chosen": -474.0, "logps/rejected": -450.0, "loss": 0.7132, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.9375, "rewards/margins": 0.58984375, "rewards/rejected": -9.5, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 9.173477529825552, "learning_rate": 4.868007525729775e-05, "logits/chosen": -12.375, "logits/rejected": -12.6875, "logps/chosen": -466.0, "logps/rejected": -456.0, "loss": 0.8585, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.5, "rewards/margins": 0.41796875, "rewards/rejected": -8.9375, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 8.587651387175145, "learning_rate": 4.8630823609179975e-05, "logits/chosen": -12.4375, "logits/rejected": -12.6875, "logps/chosen": -560.0, "logps/rejected": -492.0, "loss": 0.9801, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.375, "rewards/margins": 0.0250244140625, "rewards/rejected": -9.375, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 10.121602680832632, "learning_rate": 4.858069567439073e-05, "logits/chosen": -12.125, "logits/rejected": -12.1875, "logps/chosen": -472.0, "logps/rejected": -486.0, "loss": 0.8725, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.125, "rewards/margins": 0.7734375, "rewards/rejected": -8.875, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 6.012474856608194, "learning_rate": 4.852969331179206e-05, "logits/chosen": -12.375, "logits/rejected": -12.9375, "logps/chosen": -552.0, "logps/rejected": -500.0, "loss": 0.7115, "rewards/accuracies": 0.625, "rewards/chosen": -9.875, "rewards/margins": 0.671875, "rewards/rejected": -10.5625, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 9.660457794042633, "learning_rate": 4.847781841267186e-05, "logits/chosen": -12.75, "logits/rejected": -13.125, "logps/chosen": -506.0, "logps/rejected": -496.0, "loss": 0.6596, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -10.125, "rewards/margins": 1.3828125, "rewards/rejected": -11.5, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 7.623993751180003, "learning_rate": 4.842507290067374e-05, "logits/chosen": -13.625, "logits/rejected": -13.8125, "logps/chosen": -516.0, "logps/rejected": -450.0, "loss": 0.7645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.0625, "rewards/margins": 0.77734375, "rewards/rejected": -11.875, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 8.389337355599595, "learning_rate": 4.8371458731725676e-05, "logits/chosen": -12.6875, "logits/rejected": -13.0, "logps/chosen": -532.0, "logps/rejected": -544.0, "loss": 0.6616, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -10.25, "rewards/margins": 0.8125, "rewards/rejected": -11.125, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 6.931265574049484, "learning_rate": 4.83169778939675e-05, "logits/chosen": -12.8125, "logits/rejected": -13.4375, "logps/chosen": -528.0, "logps/rejected": -494.0, "loss": 0.6737, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -9.9375, "rewards/margins": 0.8828125, "rewards/rejected": -10.8125, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 8.050651620193062, "learning_rate": 4.8261632407677174e-05, "logits/chosen": -13.5625, "logits/rejected": -14.0, "logps/chosen": -512.0, "logps/rejected": -498.0, "loss": 0.5961, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.875, "rewards/margins": 0.953125, "rewards/rejected": -10.875, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 6.392594343316606, "learning_rate": 4.820542432519583e-05, "logits/chosen": -13.3125, "logits/rejected": -13.5, "logps/chosen": -506.0, "logps/rejected": -512.0, "loss": 0.6625, "rewards/accuracies": 0.75, "rewards/chosen": -9.5, "rewards/margins": 1.4140625, "rewards/rejected": -10.9375, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 8.843935462259351, "learning_rate": 4.814835573085177e-05, "logits/chosen": -13.3125, "logits/rejected": -13.6875, "logps/chosen": -564.0, "logps/rejected": -494.0, "loss": 0.7504, "rewards/accuracies": 0.6875, "rewards/chosen": -9.875, "rewards/margins": 1.2734375, "rewards/rejected": -11.125, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 10.684751793528747, "learning_rate": 4.809042874088304e-05, "logits/chosen": -12.8125, "logits/rejected": -13.3125, "logps/chosen": -544.0, "logps/rejected": -494.0, "loss": 0.805, "rewards/accuracies": 0.625, "rewards/chosen": -9.25, "rewards/margins": 0.921875, "rewards/rejected": -10.1875, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 8.122057886743049, "learning_rate": 4.803164550335906e-05, "logits/chosen": -12.5, "logits/rejected": -13.0, "logps/chosen": -540.0, "logps/rejected": -496.0, "loss": 0.7183, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -8.625, "rewards/margins": 2.03125, "rewards/rejected": -10.6875, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 8.217900203907949, "learning_rate": 4.79720081981009e-05, "logits/chosen": -12.5, "logits/rejected": -12.5, "logps/chosen": -450.0, "logps/rejected": -450.0, "loss": 0.7273, "rewards/accuracies": 0.625, "rewards/chosen": -8.875, "rewards/margins": 0.859375, "rewards/rejected": -9.75, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 9.65521490229528, "learning_rate": 4.79115190366005e-05, "logits/chosen": -12.5625, "logits/rejected": -12.6875, "logps/chosen": -520.0, "logps/rejected": -510.0, "loss": 0.7829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -10.1875, "rewards/margins": 0.9375, "rewards/rejected": -11.125, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 6.471532717972355, "learning_rate": 4.785018026193863e-05, "logits/chosen": -12.25, "logits/rejected": -12.6875, "logps/chosen": -520.0, "logps/rejected": -468.0, "loss": 0.6638, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.6875, "rewards/margins": 1.4375, "rewards/rejected": -11.125, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 8.558298845858685, "learning_rate": 4.778799414870171e-05, "logits/chosen": -11.875, "logits/rejected": -12.3125, "logps/chosen": -520.0, "logps/rejected": -490.0, "loss": 0.8021, "rewards/accuracies": 0.625, "rewards/chosen": -9.9375, "rewards/margins": 0.9453125, "rewards/rejected": -10.875, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 10.859527406335348, "learning_rate": 4.772496300289748e-05, "logits/chosen": -12.0, "logits/rejected": -12.0, "logps/chosen": -502.0, "logps/rejected": -464.0, "loss": 0.7863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.4375, "rewards/margins": 0.94140625, "rewards/rejected": -10.375, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 12.60115094547818, "learning_rate": 4.76610891618695e-05, "logits/chosen": -11.75, "logits/rejected": -11.875, "logps/chosen": -486.0, "logps/rejected": -504.0, "loss": 0.7492, "rewards/accuracies": 0.5625, "rewards/chosen": -9.1875, "rewards/margins": 0.75390625, "rewards/rejected": -9.9375, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 8.716034881986472, "learning_rate": 4.7596374994210424e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -504.0, "logps/rejected": -520.0, "loss": 0.7127, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.4375, "rewards/margins": 1.015625, "rewards/rejected": -10.4375, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 9.364442782095258, "learning_rate": 4.753082289967421e-05, "logits/chosen": -12.1875, "logits/rejected": -12.375, "logps/chosen": -544.0, "logps/rejected": -502.0, "loss": 0.7904, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -9.875, "rewards/margins": 0.7578125, "rewards/rejected": -10.625, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 6.165484242150408, "learning_rate": 4.746443530908714e-05, "logits/chosen": -12.3125, "logits/rejected": -12.5, "logps/chosen": -548.0, "logps/rejected": -536.0, "loss": 0.7193, "rewards/accuracies": 0.6875, "rewards/chosen": -10.9375, "rewards/margins": 1.0546875, "rewards/rejected": -12.0, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 10.533560780263379, "learning_rate": 4.7397214684257636e-05, "logits/chosen": -12.0, "logits/rejected": -11.625, "logps/chosen": -552.0, "logps/rejected": -584.0, "loss": 0.7145, "rewards/accuracies": 0.6875, "rewards/chosen": -10.3125, "rewards/margins": 1.25, "rewards/rejected": -11.5625, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 8.838646171009014, "learning_rate": 4.7329163517885e-05, "logits/chosen": -12.5, "logits/rejected": -12.6875, "logps/chosen": -548.0, "logps/rejected": -510.0, "loss": 0.7471, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -10.125, "rewards/margins": 0.453125, "rewards/rejected": -10.625, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 7.05986771230505, "learning_rate": 4.726028433346697e-05, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -584.0, "logps/rejected": -576.0, "loss": 0.7426, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -11.125, "rewards/margins": 1.1171875, "rewards/rejected": -12.25, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 9.672518916674647, "learning_rate": 4.7190579685206175e-05, "logits/chosen": -12.5625, "logits/rejected": -12.75, "logps/chosen": -620.0, "logps/rejected": -588.0, "loss": 0.9889, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.9375, "rewards/margins": 0.875, "rewards/rejected": -12.8125, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 11.391966596496053, "learning_rate": 4.712005215791535e-05, "logits/chosen": -11.9375, "logits/rejected": -12.1875, "logps/chosen": -564.0, "logps/rejected": -510.0, "loss": 0.6305, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.25, "rewards/margins": 0.6015625, "rewards/rejected": -11.8125, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 6.417146855452024, "learning_rate": 4.704870436692154e-05, "logits/chosen": -12.25, "logits/rejected": -12.6875, "logps/chosen": -504.0, "logps/rejected": -486.0, "loss": 0.6748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -9.8125, "rewards/margins": 1.7109375, "rewards/rejected": -11.5625, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 8.005396824784661, "learning_rate": 4.697653895796912e-05, "logits/chosen": -12.125, "logits/rejected": -12.1875, "logps/chosen": -502.0, "logps/rejected": -498.0, "loss": 0.6587, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.25, "rewards/margins": 1.2734375, "rewards/rejected": -11.5, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 9.511953260913776, "learning_rate": 4.6903558607121634e-05, "logits/chosen": -12.5, "logits/rejected": -12.75, "logps/chosen": -520.0, "logps/rejected": -508.0, "loss": 0.7308, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.875, "rewards/margins": 1.1015625, "rewards/rejected": -12.0, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 9.796525964563228, "learning_rate": 4.682976602066263e-05, "logits/chosen": -12.1875, "logits/rejected": -12.3125, "logps/chosen": -516.0, "logps/rejected": -508.0, "loss": 0.8031, "rewards/accuracies": 0.625, "rewards/chosen": -10.9375, "rewards/margins": 0.76171875, "rewards/rejected": -11.6875, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 10.015711312302077, "learning_rate": 4.6755163934995226e-05, "logits/chosen": -11.0625, "logits/rejected": -11.375, "logps/chosen": -486.0, "logps/rejected": -468.0, "loss": 0.8148, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.9375, "rewards/margins": 1.078125, "rewards/rejected": -10.0, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 6.246681815455791, "learning_rate": 4.6679755116540726e-05, "logits/chosen": -11.9375, "logits/rejected": -12.1875, "logps/chosen": -510.0, "logps/rejected": -462.0, "loss": 0.7297, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -9.5625, "rewards/margins": 0.2470703125, "rewards/rejected": -9.875, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 9.65158699681941, "learning_rate": 4.660354236163596e-05, "logits/chosen": -12.125, "logits/rejected": -12.5625, "logps/chosen": -568.0, "logps/rejected": -520.0, "loss": 0.7922, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.0625, "rewards/margins": 0.9921875, "rewards/rejected": -11.0625, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 8.442087483469699, "learning_rate": 4.652652849642961e-05, "logits/chosen": -12.625, "logits/rejected": -12.875, "logps/chosen": -540.0, "logps/rejected": -520.0, "loss": 0.7822, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -10.9375, "rewards/margins": 0.66015625, "rewards/rejected": -11.5625, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 8.897264080440657, "learning_rate": 4.644871637677746e-05, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -516.0, "logps/rejected": -472.0, "loss": 0.7131, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -10.125, "rewards/margins": 0.57421875, "rewards/rejected": -10.75, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 6.696938194802173, "learning_rate": 4.637010888813639e-05, "logits/chosen": -13.125, "logits/rejected": -13.3125, "logps/chosen": -504.0, "logps/rejected": -480.0, "loss": 0.6593, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -9.75, "rewards/margins": 1.0078125, "rewards/rejected": -10.75, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 8.791582206884076, "learning_rate": 4.6290708945457494e-05, "logits/chosen": -13.0625, "logits/rejected": -13.0, "logps/chosen": -512.0, "logps/rejected": -512.0, "loss": 0.8411, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.3125, "rewards/margins": 0.77734375, "rewards/rejected": -12.0625, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 10.696923852598411, "learning_rate": 4.6210519493077895e-05, "logits/chosen": -12.625, "logits/rejected": -12.8125, "logps/chosen": -516.0, "logps/rejected": -502.0, "loss": 0.7563, "rewards/accuracies": 0.625, "rewards/chosen": -10.3125, "rewards/margins": 0.80078125, "rewards/rejected": -11.125, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 7.4467241861253735, "learning_rate": 4.612954350461161e-05, "logits/chosen": -13.0625, "logits/rejected": -12.875, "logps/chosen": -468.0, "logps/rejected": -508.0, "loss": 0.569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -10.3125, "rewards/margins": 1.390625, "rewards/rejected": -11.6875, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 7.183432507030938, "learning_rate": 4.6047783982839274e-05, "logits/chosen": -13.5, "logits/rejected": -13.375, "logps/chosen": -524.0, "logps/rejected": -540.0, "loss": 0.7395, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.0625, "rewards/margins": 0.82421875, "rewards/rejected": -11.875, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 6.780334689025147, "learning_rate": 4.5965243959596785e-05, "logits/chosen": -13.1875, "logits/rejected": -13.5, "logps/chosen": -472.0, "logps/rejected": -464.0, "loss": 0.6941, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -9.9375, "rewards/margins": 0.94140625, "rewards/rejected": -10.875, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 12.754455794530319, "learning_rate": 4.5881926495662854e-05, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -572.0, "logps/rejected": -648.0, "loss": 0.7111, "rewards/accuracies": 0.5625, "rewards/chosen": -11.5, "rewards/margins": 0.8828125, "rewards/rejected": -12.375, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 7.504670748944528, "learning_rate": 4.579783468064556e-05, "logits/chosen": -12.5, "logits/rejected": -12.625, "logps/chosen": -580.0, "logps/rejected": -564.0, "loss": 0.7643, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.0, "rewards/margins": 0.62109375, "rewards/rejected": -11.625, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 9.695054676896365, "learning_rate": 4.5712971632867715e-05, "logits/chosen": -12.75, "logits/rejected": -13.0625, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.7573, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.625, "rewards/margins": 1.1640625, "rewards/rejected": -10.75, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 7.970194532217434, "learning_rate": 4.5627340499251294e-05, "logits/chosen": -12.1875, "logits/rejected": -12.5625, "logps/chosen": -572.0, "logps/rejected": -548.0, "loss": 0.7577, "rewards/accuracies": 0.5625, "rewards/chosen": -10.8125, "rewards/margins": 0.66015625, "rewards/rejected": -11.5, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 7.8570308161394085, "learning_rate": 4.5540944455200666e-05, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -510.0, "logps/rejected": -502.0, "loss": 0.7306, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.9375, "rewards/margins": 1.0859375, "rewards/rejected": -12.0, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 8.679285528741042, "learning_rate": 4.545378670448492e-05, "logits/chosen": -12.5625, "logits/rejected": -12.9375, "logps/chosen": -592.0, "logps/rejected": -528.0, "loss": 0.7772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.75, "rewards/margins": 0.94921875, "rewards/rejected": -11.6875, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 7.294489262151519, "learning_rate": 4.536587047911901e-05, "logits/chosen": -11.9375, "logits/rejected": -12.1875, "logps/chosen": -528.0, "logps/rejected": -496.0, "loss": 0.6925, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -9.875, "rewards/margins": 1.109375, "rewards/rejected": -11.0, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 10.134656229651304, "learning_rate": 4.527719903924392e-05, "logits/chosen": -11.5625, "logits/rejected": -11.5, "logps/chosen": -508.0, "logps/rejected": -544.0, "loss": 0.7312, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -10.375, "rewards/margins": 0.81640625, "rewards/rejected": -11.125, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 8.860873669647612, "learning_rate": 4.518777567300575e-05, "logits/chosen": -11.125, "logits/rejected": -11.625, "logps/chosen": -596.0, "logps/rejected": -552.0, "loss": 0.7317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.8125, "rewards/margins": 1.2578125, "rewards/rejected": -12.0625, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 6.680364487046663, "learning_rate": 4.5097603696433845e-05, "logits/chosen": -11.875, "logits/rejected": -12.25, "logps/chosen": -536.0, "logps/rejected": -490.0, "loss": 0.8224, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -11.625, "rewards/margins": 0.53125, "rewards/rejected": -12.1875, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 8.12581157579285, "learning_rate": 4.5006686453317734e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5625, "logps/chosen": -556.0, "logps/rejected": -564.0, "loss": 0.8097, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.0625, "rewards/margins": 1.015625, "rewards/rejected": -12.0625, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 8.268449988760125, "learning_rate": 4.4915027315083246e-05, "logits/chosen": -12.0, "logits/rejected": -12.375, "logps/chosen": -588.0, "logps/rejected": -548.0, "loss": 0.7346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.375, "rewards/margins": 0.9453125, "rewards/rejected": -11.3125, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 9.632043499380252, "learning_rate": 4.4822629680667375e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -524.0, "logps/rejected": -528.0, "loss": 0.7777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -10.25, "rewards/margins": 0.9140625, "rewards/rejected": -11.1875, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 6.63480458290659, "learning_rate": 4.472949697639233e-05, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -480.0, "logps/rejected": -478.0, "loss": 0.7749, "rewards/accuracies": 0.6875, "rewards/chosen": -9.6875, "rewards/margins": 1.078125, "rewards/rejected": -10.75, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 8.946660909641642, "learning_rate": 4.463563265583843e-05, "logits/chosen": -12.6875, "logits/rejected": -13.0625, "logps/chosen": -556.0, "logps/rejected": -532.0, "loss": 0.7023, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.5, "rewards/margins": 1.3359375, "rewards/rejected": -11.875, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 8.79339110053614, "learning_rate": 4.4541040199716066e-05, "logits/chosen": -13.3125, "logits/rejected": -13.5, "logps/chosen": -508.0, "logps/rejected": -532.0, "loss": 0.6736, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.125, "rewards/margins": 1.4296875, "rewards/rejected": -11.5625, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 7.324747045901078, "learning_rate": 4.444572311573659e-05, "logits/chosen": -13.3125, "logits/rejected": -13.375, "logps/chosen": -536.0, "logps/rejected": -506.0, "loss": 0.6513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.125, "rewards/margins": 0.73046875, "rewards/rejected": -11.875, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 9.433520118340347, "learning_rate": 4.4349684938482286e-05, "logits/chosen": -13.125, "logits/rejected": -13.6875, "logps/chosen": -564.0, "logps/rejected": -498.0, "loss": 0.5843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -10.8125, "rewards/margins": 1.5546875, "rewards/rejected": -12.375, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 7.781493747070478, "learning_rate": 4.4252929229275255e-05, "logits/chosen": -12.8125, "logits/rejected": -13.1875, "logps/chosen": -576.0, "logps/rejected": -556.0, "loss": 0.7485, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.6875, "rewards/margins": 1.1328125, "rewards/rejected": -12.875, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 7.028647219407263, "learning_rate": 4.41554595760454e-05, "logits/chosen": -13.0, "logits/rejected": -13.375, "logps/chosen": -568.0, "logps/rejected": -536.0, "loss": 0.6974, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.9375, "rewards/margins": 0.953125, "rewards/rejected": -12.9375, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 14.387364821525113, "learning_rate": 4.405727959319733e-05, "logits/chosen": -12.875, "logits/rejected": -13.0, "logps/chosen": -528.0, "logps/rejected": -536.0, "loss": 0.7899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.0625, "rewards/margins": 1.5, "rewards/rejected": -12.5625, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 8.271769950792597, "learning_rate": 4.3958392921476376e-05, "logits/chosen": -12.625, "logits/rejected": -13.125, "logps/chosen": -576.0, "logps/rejected": -532.0, "loss": 0.8312, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.8125, "rewards/margins": 0.8359375, "rewards/rejected": -12.625, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 8.418576821960974, "learning_rate": 4.385880322783353e-05, "logits/chosen": -12.9375, "logits/rejected": -13.0625, "logps/chosen": -604.0, "logps/rejected": -592.0, "loss": 0.7557, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.5, "rewards/margins": 1.2578125, "rewards/rejected": -12.75, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 12.101413536769304, "learning_rate": 4.375851420528952e-05, "logits/chosen": -12.75, "logits/rejected": -12.8125, "logps/chosen": -552.0, "logps/rejected": -584.0, "loss": 0.7447, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.1875, "rewards/margins": 0.95703125, "rewards/rejected": -12.125, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 6.9704706163859225, "learning_rate": 4.3657529572797804e-05, "logits/chosen": -12.125, "logits/rejected": -12.4375, "logps/chosen": -576.0, "logps/rejected": -568.0, "loss": 0.8642, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.5, "rewards/margins": 0.640625, "rewards/rejected": -12.1875, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 7.989756027543109, "learning_rate": 4.355585307510675e-05, "logits/chosen": -12.8125, "logits/rejected": -12.875, "logps/chosen": -568.0, "logps/rejected": -568.0, "loss": 0.7395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.125, "rewards/margins": 1.6796875, "rewards/rejected": -12.8125, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 7.4392668707427525, "learning_rate": 4.345348848262068e-05, "logits/chosen": -12.125, "logits/rejected": -12.375, "logps/chosen": -588.0, "logps/rejected": -584.0, "loss": 0.7134, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.125, "rewards/margins": 1.2109375, "rewards/rejected": -12.375, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 8.83048121979174, "learning_rate": 4.3350439591260105e-05, "logits/chosen": -12.8125, "logits/rejected": -12.8125, "logps/chosen": -536.0, "logps/rejected": -572.0, "loss": 0.8313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.75, "rewards/margins": 1.125, "rewards/rejected": -12.875, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 7.162765830891214, "learning_rate": 4.3246710222320956e-05, "logits/chosen": -12.8125, "logits/rejected": -12.75, "logps/chosen": -486.0, "logps/rejected": -492.0, "loss": 0.7147, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -10.9375, "rewards/margins": 1.2578125, "rewards/rejected": -12.1875, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 7.105352599980392, "learning_rate": 4.314230422233286e-05, "logits/chosen": -12.5, "logits/rejected": -12.5, "logps/chosen": -450.0, "logps/rejected": -448.0, "loss": 0.7855, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -9.9375, "rewards/margins": 0.65625, "rewards/rejected": -10.625, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 10.300313199279667, "learning_rate": 4.303722546291656e-05, "logits/chosen": -12.3125, "logits/rejected": -12.625, "logps/chosen": -544.0, "logps/rejected": -510.0, "loss": 0.7179, "rewards/accuracies": 0.6875, "rewards/chosen": -11.125, "rewards/margins": 0.8203125, "rewards/rejected": -11.9375, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 7.351746246119899, "learning_rate": 4.293147784064025e-05, "logits/chosen": -12.8125, "logits/rejected": -13.1875, "logps/chosen": -568.0, "logps/rejected": -544.0, "loss": 0.6308, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.5, "rewards/margins": 1.28125, "rewards/rejected": -12.75, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 9.715474841922566, "learning_rate": 4.282506527687518e-05, "logits/chosen": -12.1875, "logits/rejected": -12.5625, "logps/chosen": -640.0, "logps/rejected": -576.0, "loss": 0.7679, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.9375, "rewards/margins": 1.0390625, "rewards/rejected": -13.0, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 7.494793387712396, "learning_rate": 4.2717991717650164e-05, "logits/chosen": -12.1875, "logits/rejected": -12.125, "logps/chosen": -560.0, "logps/rejected": -520.0, "loss": 0.8072, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.6875, "rewards/margins": 0.61328125, "rewards/rejected": -12.25, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 9.106930582299938, "learning_rate": 4.261026113350532e-05, "logits/chosen": -12.1875, "logits/rejected": -12.5625, "logps/chosen": -516.0, "logps/rejected": -472.0, "loss": 0.7537, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -10.5, "rewards/margins": 0.53125, "rewards/rejected": -11.0, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 6.646188139442894, "learning_rate": 4.25018775193448e-05, "logits/chosen": -12.125, "logits/rejected": -11.9375, "logps/chosen": -540.0, "logps/rejected": -556.0, "loss": 0.7246, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -11.1875, "rewards/margins": 0.875, "rewards/rejected": -12.125, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 10.495153682929477, "learning_rate": 4.239284489428861e-05, "logits/chosen": -12.25, "logits/rejected": -12.4375, "logps/chosen": -636.0, "logps/rejected": -620.0, "loss": 0.6875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -11.75, "rewards/margins": 0.9453125, "rewards/rejected": -12.6875, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 8.896250743023822, "learning_rate": 4.2283167301523636e-05, "logits/chosen": -12.125, "logits/rejected": -12.0625, "logps/chosen": -540.0, "logps/rejected": -540.0, "loss": 0.7295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -11.625, "rewards/margins": 0.8515625, "rewards/rejected": -12.4375, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 9.076036052295438, "learning_rate": 4.217284880815369e-05, "logits/chosen": -12.3125, "logits/rejected": -12.3125, "logps/chosen": -608.0, "logps/rejected": -560.0, "loss": 0.7494, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.875, "rewards/margins": 0.6796875, "rewards/rejected": -12.5625, "step": 1910 }, { "epoch": 1.0, "eval_logits/chosen": -12.5625, "eval_logits/rejected": -12.8125, "eval_logps/chosen": -552.0, "eval_logps/rejected": -548.0, "eval_loss": 0.7878593802452087, "eval_rewards/accuracies": 0.6796875, "eval_rewards/chosen": -11.6875, "eval_rewards/margins": 1.1796875, "eval_rewards/rejected": -12.8125, "eval_runtime": 47.5543, "eval_samples_per_second": 42.057, "eval_steps_per_second": 0.673, "step": 1911 }, { "epoch": 1.0047095761381475, "grad_norm": 5.025796276275169, "learning_rate": 4.20618935050487e-05, "logits/chosen": -12.6875, "logits/rejected": -12.875, "logps/chosen": -482.0, "logps/rejected": -528.0, "loss": 0.3374, "rewards/accuracies": 0.875, "rewards/chosen": -10.125, "rewards/margins": 3.65625, "rewards/rejected": -13.75, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 2.617047145095191, "learning_rate": 4.195030550669297e-05, "logits/chosen": -12.5, "logits/rejected": -12.875, "logps/chosen": -576.0, "logps/rejected": -624.0, "loss": 0.1885, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.3125, "rewards/margins": 6.96875, "rewards/rejected": -16.25, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 3.8017233289248216, "learning_rate": 4.1838088951032665e-05, "logits/chosen": -12.5, "logits/rejected": -12.4375, "logps/chosen": -528.0, "logps/rejected": -660.0, "loss": 0.1563, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.5, "rewards/margins": 7.28125, "rewards/rejected": -16.75, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 2.1972464246724654, "learning_rate": 4.1725247999322316e-05, "logits/chosen": -12.3125, "logits/rejected": -12.125, "logps/chosen": -502.0, "logps/rejected": -588.0, "loss": 0.1881, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.6875, "rewards/margins": 5.1875, "rewards/rejected": -15.875, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 2.912719312978293, "learning_rate": 4.161178683597054e-05, "logits/chosen": -12.125, "logits/rejected": -11.6875, "logps/chosen": -528.0, "logps/rejected": -568.0, "loss": 0.1273, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.375, "rewards/margins": 5.90625, "rewards/rejected": -16.25, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 2.7302757245440983, "learning_rate": 4.149770966838489e-05, "logits/chosen": -12.4375, "logits/rejected": -12.375, "logps/chosen": -580.0, "logps/rejected": -680.0, "loss": 0.1697, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.1875, "rewards/margins": 9.1875, "rewards/rejected": -18.375, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 2.7591867060534776, "learning_rate": 4.1383020726815744e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5, "logps/chosen": -484.0, "logps/rejected": -572.0, "loss": 0.13, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.8125, "rewards/margins": 5.84375, "rewards/rejected": -15.6875, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 3.8734929943590486, "learning_rate": 4.1267724264199595e-05, "logits/chosen": -13.1875, "logits/rejected": -13.125, "logps/chosen": -560.0, "logps/rejected": -672.0, "loss": 0.1567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.375, "rewards/margins": 7.40625, "rewards/rejected": -16.75, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 2.5264805572715905, "learning_rate": 4.115182455600115e-05, "logits/chosen": -13.125, "logits/rejected": -13.4375, "logps/chosen": -532.0, "logps/rejected": -592.0, "loss": 0.2246, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.9375, "rewards/margins": 5.71875, "rewards/rejected": -15.6875, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 1.4136788093992387, "learning_rate": 4.103532590005496e-05, "logits/chosen": -13.375, "logits/rejected": -13.4375, "logps/chosen": -524.0, "logps/rejected": -620.0, "loss": 0.1506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5, "rewards/margins": 7.3125, "rewards/rejected": -17.75, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 5.8472125628304905, "learning_rate": 4.0918232616405925e-05, "logits/chosen": -13.375, "logits/rejected": -13.4375, "logps/chosen": -540.0, "logps/rejected": -604.0, "loss": 0.1525, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.5625, "rewards/margins": 6.84375, "rewards/rejected": -17.375, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 3.7593254661033937, "learning_rate": 4.080054904714917e-05, "logits/chosen": -13.75, "logits/rejected": -13.4375, "logps/chosen": -544.0, "logps/rejected": -680.0, "loss": 0.168, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.3125, "rewards/margins": 7.53125, "rewards/rejected": -18.875, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 2.4096345005426985, "learning_rate": 4.0682279556269e-05, "logits/chosen": -13.75, "logits/rejected": -13.75, "logps/chosen": -612.0, "logps/rejected": -704.0, "loss": 0.155, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.0625, "rewards/margins": 7.5625, "rewards/rejected": -18.625, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 3.736602458261602, "learning_rate": 4.056342852947706e-05, "logits/chosen": -13.4375, "logits/rejected": -13.75, "logps/chosen": -564.0, "logps/rejected": -640.0, "loss": 0.1078, "rewards/accuracies": 0.9375, "rewards/chosen": -10.0625, "rewards/margins": 7.625, "rewards/rejected": -17.75, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 5.699200852229238, "learning_rate": 4.044400037404974e-05, "logits/chosen": -14.125, "logits/rejected": -13.75, "logps/chosen": -488.0, "logps/rejected": -616.0, "loss": 0.165, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -9.5625, "rewards/margins": 7.0625, "rewards/rejected": -16.625, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 9.084933877985728, "learning_rate": 4.032399951866469e-05, "logits/chosen": -13.5, "logits/rejected": -13.5, "logps/chosen": -456.0, "logps/rejected": -528.0, "loss": 0.1929, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.375, "rewards/margins": 5.46875, "rewards/rejected": -14.875, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 2.1048784655877046, "learning_rate": 4.020343041323664e-05, "logits/chosen": -12.5, "logits/rejected": -12.75, "logps/chosen": -540.0, "logps/rejected": -632.0, "loss": 0.2385, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.5625, "rewards/margins": 6.9375, "rewards/rejected": -16.5, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 1.3259403661494187, "learning_rate": 4.008229752875241e-05, "logits/chosen": -12.0625, "logits/rejected": -12.125, "logps/chosen": -470.0, "logps/rejected": -588.0, "loss": 0.134, "rewards/accuracies": 0.9375, "rewards/chosen": -9.375, "rewards/margins": 6.71875, "rewards/rejected": -16.125, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 5.124076145163906, "learning_rate": 3.996060535710501e-05, "logits/chosen": -11.6875, "logits/rejected": -11.875, "logps/chosen": -524.0, "logps/rejected": -612.0, "loss": 0.1352, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.6875, "rewards/margins": 7.4375, "rewards/rejected": -17.125, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 1.8708820089615186, "learning_rate": 3.9838358410927165e-05, "logits/chosen": -11.625, "logits/rejected": -12.0, "logps/chosen": -560.0, "logps/rejected": -620.0, "loss": 0.1389, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.4375, "rewards/margins": 6.75, "rewards/rejected": -17.125, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 8.468865024177259, "learning_rate": 3.9715561223423984e-05, "logits/chosen": -12.0625, "logits/rejected": -12.25, "logps/chosen": -512.0, "logps/rejected": -636.0, "loss": 0.1779, "rewards/accuracies": 0.9375, "rewards/chosen": -10.8125, "rewards/margins": 7.1875, "rewards/rejected": -18.0, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 4.553312461630311, "learning_rate": 3.959221834820477e-05, "logits/chosen": -12.0, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -572.0, "loss": 0.2247, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.4375, "rewards/margins": 5.625, "rewards/rejected": -16.125, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 4.725944299007761, "learning_rate": 3.946833435911424e-05, "logits/chosen": -12.875, "logits/rejected": -12.875, "logps/chosen": -544.0, "logps/rejected": -680.0, "loss": 0.155, "rewards/accuracies": 0.9375, "rewards/chosen": -11.5, "rewards/margins": 7.5, "rewards/rejected": -19.0, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 5.280153035514204, "learning_rate": 3.9343913850062855e-05, "logits/chosen": -13.3125, "logits/rejected": -12.9375, "logps/chosen": -496.0, "logps/rejected": -668.0, "loss": 0.1808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.0625, "rewards/margins": 7.25, "rewards/rejected": -18.375, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 2.203699427092351, "learning_rate": 3.921896143485657e-05, "logits/chosen": -13.0, "logits/rejected": -13.25, "logps/chosen": -556.0, "logps/rejected": -624.0, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": -11.1875, "rewards/margins": 6.375, "rewards/rejected": -17.5, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 5.252548484606017, "learning_rate": 3.909348174702562e-05, "logits/chosen": -13.375, "logits/rejected": -13.4375, "logps/chosen": -544.0, "logps/rejected": -636.0, "loss": 0.1515, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.0625, "rewards/margins": 6.9375, "rewards/rejected": -17.0, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 3.319289830814653, "learning_rate": 3.8967479439652755e-05, "logits/chosen": -13.125, "logits/rejected": -13.0, "logps/chosen": -504.0, "logps/rejected": -648.0, "loss": 0.1031, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.125, "rewards/margins": 7.6875, "rewards/rejected": -17.75, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 2.762256013302049, "learning_rate": 3.884095918520072e-05, "logits/chosen": -13.125, "logits/rejected": -13.0625, "logps/chosen": -516.0, "logps/rejected": -632.0, "loss": 0.1179, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.125, "rewards/margins": 6.6875, "rewards/rejected": -16.875, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 11.413493442922952, "learning_rate": 3.871392567533893e-05, "logits/chosen": -12.75, "logits/rejected": -12.875, "logps/chosen": -556.0, "logps/rejected": -648.0, "loss": 0.1835, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.8125, "rewards/margins": 7.8125, "rewards/rejected": -17.625, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 4.885527079207722, "learning_rate": 3.8586383620769536e-05, "logits/chosen": -12.8125, "logits/rejected": -12.9375, "logps/chosen": -548.0, "logps/rejected": -620.0, "loss": 0.1468, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.75, "rewards/margins": 6.9375, "rewards/rejected": -17.75, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 4.344316843378256, "learning_rate": 3.845833775105272e-05, "logits/chosen": -13.125, "logits/rejected": -13.125, "logps/chosen": -524.0, "logps/rejected": -672.0, "loss": 0.1325, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.875, "rewards/margins": 8.4375, "rewards/rejected": -19.25, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 8.466061149421654, "learning_rate": 3.832979281443133e-05, "logits/chosen": -13.0, "logits/rejected": -13.0, "logps/chosen": -596.0, "logps/rejected": -676.0, "loss": 0.1534, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.0, "rewards/margins": 7.25, "rewards/rejected": -18.25, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 8.33172408761989, "learning_rate": 3.8200753577654766e-05, "logits/chosen": -13.5, "logits/rejected": -13.4375, "logps/chosen": -504.0, "logps/rejected": -648.0, "loss": 0.1957, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.75, "rewards/margins": 7.65625, "rewards/rejected": -18.375, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 2.8067450193311405, "learning_rate": 3.807122482580228e-05, "logits/chosen": -13.0, "logits/rejected": -13.1875, "logps/chosen": -540.0, "logps/rejected": -644.0, "loss": 0.1298, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.0, "rewards/margins": 6.40625, "rewards/rejected": -16.375, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 2.6171778099575698, "learning_rate": 3.794121136210546e-05, "logits/chosen": -13.5625, "logits/rejected": -13.5625, "logps/chosen": -544.0, "logps/rejected": -660.0, "loss": 0.1174, "rewards/accuracies": 0.9375, "rewards/chosen": -10.375, "rewards/margins": 7.40625, "rewards/rejected": -17.75, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 4.4393184562284045, "learning_rate": 3.7810718007770175e-05, "logits/chosen": -13.8125, "logits/rejected": -13.8125, "logps/chosen": -580.0, "logps/rejected": -712.0, "loss": 0.1337, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.1875, "rewards/margins": 8.625, "rewards/rejected": -18.875, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 2.8975735844666466, "learning_rate": 3.7679749601797765e-05, "logits/chosen": -13.6875, "logits/rejected": -13.75, "logps/chosen": -544.0, "logps/rejected": -648.0, "loss": 0.1665, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.875, "rewards/margins": 8.0625, "rewards/rejected": -17.875, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 4.705036027569537, "learning_rate": 3.754831100080561e-05, "logits/chosen": -14.125, "logits/rejected": -14.0, "logps/chosen": -500.0, "logps/rejected": -660.0, "loss": 0.1207, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.5, "rewards/margins": 8.0625, "rewards/rejected": -17.625, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 3.3554354031784976, "learning_rate": 3.741640707884702e-05, "logits/chosen": -13.625, "logits/rejected": -13.75, "logps/chosen": -516.0, "logps/rejected": -640.0, "loss": 0.1299, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.125, "rewards/margins": 7.40625, "rewards/rejected": -16.5, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 4.380253542568911, "learning_rate": 3.728404272723051e-05, "logits/chosen": -13.75, "logits/rejected": -13.875, "logps/chosen": -492.0, "logps/rejected": -596.0, "loss": 0.1698, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.8125, "rewards/margins": 6.96875, "rewards/rejected": -16.75, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 5.863469122548758, "learning_rate": 3.715122285433842e-05, "logits/chosen": -13.5625, "logits/rejected": -13.875, "logps/chosen": -560.0, "logps/rejected": -632.0, "loss": 0.1684, "rewards/accuracies": 0.9375, "rewards/chosen": -11.0, "rewards/margins": 6.5625, "rewards/rejected": -17.5, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 3.85026586750782, "learning_rate": 3.701795238544488e-05, "logits/chosen": -13.3125, "logits/rejected": -13.25, "logps/chosen": -564.0, "logps/rejected": -644.0, "loss": 0.2142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.0, "rewards/margins": 6.75, "rewards/rejected": -17.75, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 2.8862551859168333, "learning_rate": 3.6884236262533187e-05, "logits/chosen": -13.1875, "logits/rejected": -12.875, "logps/chosen": -508.0, "logps/rejected": -608.0, "loss": 0.1627, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.1875, "rewards/margins": 6.53125, "rewards/rejected": -17.625, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 1.7267058938947848, "learning_rate": 3.6750079444112535e-05, "logits/chosen": -13.5625, "logits/rejected": -13.25, "logps/chosen": -548.0, "logps/rejected": -636.0, "loss": 0.2021, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.4375, "rewards/margins": 6.46875, "rewards/rejected": -17.875, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 1.8094583649915732, "learning_rate": 3.661548690503417e-05, "logits/chosen": -13.0625, "logits/rejected": -12.75, "logps/chosen": -552.0, "logps/rejected": -652.0, "loss": 0.1449, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.3125, "rewards/margins": 7.125, "rewards/rejected": -18.375, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 6.973218769623946, "learning_rate": 3.648046363630685e-05, "logits/chosen": -13.5, "logits/rejected": -13.375, "logps/chosen": -588.0, "logps/rejected": -684.0, "loss": 0.1426, "rewards/accuracies": 0.9375, "rewards/chosen": -12.0, "rewards/margins": 7.4375, "rewards/rejected": -19.375, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 2.724033971663383, "learning_rate": 3.6345014644911835e-05, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -544.0, "logps/rejected": -640.0, "loss": 0.1231, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.0625, "rewards/margins": 7.34375, "rewards/rejected": -18.375, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 3.3380336722140207, "learning_rate": 3.620914495361718e-05, "logits/chosen": -12.875, "logits/rejected": -12.3125, "logps/chosen": -616.0, "logps/rejected": -736.0, "loss": 0.1705, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.0625, "rewards/margins": 8.25, "rewards/rejected": -19.25, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 5.007066809175749, "learning_rate": 3.607285960079146e-05, "logits/chosen": -12.875, "logits/rejected": -12.8125, "logps/chosen": -556.0, "logps/rejected": -664.0, "loss": 0.1405, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -9.75, "rewards/margins": 8.0, "rewards/rejected": -17.75, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 1.9696214011970825, "learning_rate": 3.5936163640217014e-05, "logits/chosen": -12.75, "logits/rejected": -13.25, "logps/chosen": -576.0, "logps/rejected": -676.0, "loss": 0.2191, "rewards/accuracies": 0.9375, "rewards/chosen": -9.3125, "rewards/margins": 7.78125, "rewards/rejected": -17.125, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 5.480736608377085, "learning_rate": 3.5799062140902417e-05, "logits/chosen": -12.875, "logits/rejected": -13.5625, "logps/chosen": -592.0, "logps/rejected": -612.0, "loss": 0.1494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.5625, "rewards/margins": 7.59375, "rewards/rejected": -17.125, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 3.9789353089091826, "learning_rate": 3.566156018689463e-05, "logits/chosen": -13.3125, "logits/rejected": -13.4375, "logps/chosen": -556.0, "logps/rejected": -608.0, "loss": 0.246, "rewards/accuracies": 0.875, "rewards/chosen": -11.0625, "rewards/margins": 6.5, "rewards/rejected": -17.5, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 3.161435784298808, "learning_rate": 3.552366287709038e-05, "logits/chosen": -13.25, "logits/rejected": -13.25, "logps/chosen": -536.0, "logps/rejected": -640.0, "loss": 0.1567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.3125, "rewards/margins": 6.8125, "rewards/rejected": -17.125, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 4.24755187840357, "learning_rate": 3.5385375325047166e-05, "logits/chosen": -13.0, "logits/rejected": -12.9375, "logps/chosen": -532.0, "logps/rejected": -592.0, "loss": 0.1187, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.25, "rewards/margins": 5.5625, "rewards/rejected": -15.8125, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 9.226106460297151, "learning_rate": 3.524670265879354e-05, "logits/chosen": -13.125, "logits/rejected": -13.0625, "logps/chosen": -488.0, "logps/rejected": -568.0, "loss": 0.1767, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.5, "rewards/margins": 5.875, "rewards/rejected": -16.375, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 3.368430888172939, "learning_rate": 3.5107650020639014e-05, "logits/chosen": -13.1875, "logits/rejected": -12.75, "logps/chosen": -494.0, "logps/rejected": -600.0, "loss": 0.1158, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -9.875, "rewards/margins": 5.875, "rewards/rejected": -15.75, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 6.787912182490308, "learning_rate": 3.496822256698337e-05, "logits/chosen": -13.0, "logits/rejected": -12.5625, "logps/chosen": -532.0, "logps/rejected": -624.0, "loss": 0.1839, "rewards/accuracies": 0.9375, "rewards/chosen": -10.75, "rewards/margins": 6.84375, "rewards/rejected": -17.625, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 3.0940351195501674, "learning_rate": 3.482842546812544e-05, "logits/chosen": -12.75, "logits/rejected": -13.0, "logps/chosen": -604.0, "logps/rejected": -648.0, "loss": 0.0897, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.875, "rewards/margins": 7.8125, "rewards/rejected": -17.75, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 6.558994468239535, "learning_rate": 3.468826390807131e-05, "logits/chosen": -13.0625, "logits/rejected": -12.9375, "logps/chosen": -488.0, "logps/rejected": -592.0, "loss": 0.1485, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.3125, "rewards/margins": 5.96875, "rewards/rejected": -16.25, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 1.2318773732626507, "learning_rate": 3.454774308434222e-05, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -492.0, "logps/rejected": -656.0, "loss": 0.1238, "rewards/accuracies": 0.9375, "rewards/chosen": -10.375, "rewards/margins": 6.71875, "rewards/rejected": -17.125, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 5.803207144946933, "learning_rate": 3.4406868207781725e-05, "logits/chosen": -12.6875, "logits/rejected": -12.75, "logps/chosen": -548.0, "logps/rejected": -604.0, "loss": 0.1611, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.0625, "rewards/margins": 6.15625, "rewards/rejected": -17.25, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 2.294633376201315, "learning_rate": 3.4265644502362495e-05, "logits/chosen": -12.6875, "logits/rejected": -13.0625, "logps/chosen": -580.0, "logps/rejected": -680.0, "loss": 0.1449, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.125, "rewards/margins": 7.65625, "rewards/rejected": -18.75, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 4.04713234571419, "learning_rate": 3.4124077204992576e-05, "logits/chosen": -12.8125, "logits/rejected": -12.875, "logps/chosen": -492.0, "logps/rejected": -600.0, "loss": 0.2189, "rewards/accuracies": 0.875, "rewards/chosen": -10.8125, "rewards/margins": 6.15625, "rewards/rejected": -17.0, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 3.405504777424004, "learning_rate": 3.398217156532125e-05, "logits/chosen": -12.875, "logits/rejected": -13.0625, "logps/chosen": -552.0, "logps/rejected": -632.0, "loss": 0.1636, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.4375, "rewards/margins": 6.40625, "rewards/rejected": -17.875, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 3.1168289976514116, "learning_rate": 3.383993284554431e-05, "logits/chosen": -12.5625, "logits/rejected": -12.75, "logps/chosen": -544.0, "logps/rejected": -608.0, "loss": 0.1437, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.4375, "rewards/margins": 6.6875, "rewards/rejected": -17.125, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 3.505753194889169, "learning_rate": 3.3697366320208955e-05, "logits/chosen": -12.875, "logits/rejected": -13.0, "logps/chosen": -552.0, "logps/rejected": -640.0, "loss": 0.136, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.3125, "rewards/margins": 7.0625, "rewards/rejected": -17.375, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 4.005823523601315, "learning_rate": 3.355447727601816e-05, "logits/chosen": -12.25, "logits/rejected": -12.1875, "logps/chosen": -528.0, "logps/rejected": -636.0, "loss": 0.2811, "rewards/accuracies": 0.9375, "rewards/chosen": -10.1875, "rewards/margins": 7.3125, "rewards/rejected": -17.5, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 4.516125334834659, "learning_rate": 3.34112710116347e-05, "logits/chosen": -12.5, "logits/rejected": -12.4375, "logps/chosen": -568.0, "logps/rejected": -704.0, "loss": 0.2014, "rewards/accuracies": 0.9375, "rewards/chosen": -10.4375, "rewards/margins": 7.4375, "rewards/rejected": -17.875, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 2.4451528489453866, "learning_rate": 3.326775283748459e-05, "logits/chosen": -12.5, "logits/rejected": -12.25, "logps/chosen": -492.0, "logps/rejected": -608.0, "loss": 0.1692, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.4375, "rewards/margins": 6.5, "rewards/rejected": -17.0, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 5.747504543634415, "learning_rate": 3.3123928075560204e-05, "logits/chosen": -12.5625, "logits/rejected": -12.625, "logps/chosen": -560.0, "logps/rejected": -628.0, "loss": 0.1808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.25, "rewards/margins": 6.625, "rewards/rejected": -16.875, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 2.096012785984857, "learning_rate": 3.297980205922294e-05, "logits/chosen": -12.875, "logits/rejected": -13.0, "logps/chosen": -532.0, "logps/rejected": -600.0, "loss": 0.1871, "rewards/accuracies": 0.9375, "rewards/chosen": -10.5625, "rewards/margins": 6.0625, "rewards/rejected": -16.625, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 2.4760284272107365, "learning_rate": 3.2835380133005375e-05, "logits/chosen": -13.125, "logits/rejected": -12.8125, "logps/chosen": -482.0, "logps/rejected": -616.0, "loss": 0.1288, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.625, "rewards/margins": 6.375, "rewards/rejected": -17.0, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 1.9511277267799827, "learning_rate": 3.269066765241314e-05, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -536.0, "logps/rejected": -600.0, "loss": 0.1738, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.1875, "rewards/margins": 5.71875, "rewards/rejected": -16.875, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 2.9900866237679318, "learning_rate": 3.254566998372634e-05, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -506.0, "logps/rejected": -640.0, "loss": 0.1671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.3125, "rewards/margins": 6.34375, "rewards/rejected": -17.625, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 4.0853159087986075, "learning_rate": 3.240039250380048e-05, "logits/chosen": -12.75, "logits/rejected": -12.625, "logps/chosen": -580.0, "logps/rejected": -760.0, "loss": 0.1026, "rewards/accuracies": 0.9375, "rewards/chosen": -10.9375, "rewards/margins": 8.4375, "rewards/rejected": -19.375, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 3.301818111899913, "learning_rate": 3.225484059986715e-05, "logits/chosen": -12.6875, "logits/rejected": -12.625, "logps/chosen": -560.0, "logps/rejected": -624.0, "loss": 0.1356, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.875, "rewards/margins": 6.4375, "rewards/rejected": -18.25, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 3.5961658429484795, "learning_rate": 3.2109019669334216e-05, "logits/chosen": -12.75, "logits/rejected": -12.6875, "logps/chosen": -572.0, "logps/rejected": -720.0, "loss": 0.1695, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.5, "rewards/margins": 9.6875, "rewards/rejected": -20.25, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 4.222385432033824, "learning_rate": 3.1962935119585705e-05, "logits/chosen": -12.875, "logits/rejected": -12.875, "logps/chosen": -484.0, "logps/rejected": -636.0, "loss": 0.1752, "rewards/accuracies": 0.9375, "rewards/chosen": -10.625, "rewards/margins": 6.625, "rewards/rejected": -17.25, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 3.6697103875093746, "learning_rate": 3.181659236778124e-05, "logits/chosen": -12.4375, "logits/rejected": -12.8125, "logps/chosen": -580.0, "logps/rejected": -656.0, "loss": 0.1405, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.75, "rewards/margins": 7.5625, "rewards/rejected": -18.25, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 5.4446370048728, "learning_rate": 3.166999684065521e-05, "logits/chosen": -12.4375, "logits/rejected": -12.5625, "logps/chosen": -552.0, "logps/rejected": -632.0, "loss": 0.1835, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.625, "rewards/margins": 7.0625, "rewards/rejected": -17.625, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 1.4971937298054474, "learning_rate": 3.15231539743155e-05, "logits/chosen": -12.625, "logits/rejected": -12.75, "logps/chosen": -524.0, "logps/rejected": -616.0, "loss": 0.166, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.75, "rewards/margins": 6.3125, "rewards/rejected": -17.0, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 7.798045925589297, "learning_rate": 3.1376069214041913e-05, "logits/chosen": -12.875, "logits/rejected": -13.0625, "logps/chosen": -540.0, "logps/rejected": -636.0, "loss": 0.2303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.5625, "rewards/margins": 6.34375, "rewards/rejected": -17.875, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 7.892586302760298, "learning_rate": 3.1228748014084246e-05, "logits/chosen": -12.5625, "logits/rejected": -12.75, "logps/chosen": -552.0, "logps/rejected": -644.0, "loss": 0.2323, "rewards/accuracies": 0.875, "rewards/chosen": -11.875, "rewards/margins": 6.65625, "rewards/rejected": -18.5, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 2.2260286497847126, "learning_rate": 3.1081195837460055e-05, "logits/chosen": -12.625, "logits/rejected": -12.5, "logps/chosen": -528.0, "logps/rejected": -688.0, "loss": 0.1634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.6875, "rewards/margins": 6.90625, "rewards/rejected": -18.5, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 3.902080264497474, "learning_rate": 3.0933418155752026e-05, "logits/chosen": -12.625, "logits/rejected": -12.75, "logps/chosen": -580.0, "logps/rejected": -604.0, "loss": 0.1366, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.375, "rewards/margins": 6.96875, "rewards/rejected": -18.375, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 6.565793274642643, "learning_rate": 3.0785420448905134e-05, "logits/chosen": -12.5, "logits/rejected": -12.6875, "logps/chosen": -576.0, "logps/rejected": -652.0, "loss": 0.1804, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.875, "rewards/margins": 7.125, "rewards/rejected": -18.0, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 4.78069477411253, "learning_rate": 3.063720820502339e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -516.0, "logps/rejected": -580.0, "loss": 0.1679, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.625, "rewards/margins": 6.15625, "rewards/rejected": -16.75, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 2.9641760959749854, "learning_rate": 3.0488786920166345e-05, "logits/chosen": -12.3125, "logits/rejected": -12.1875, "logps/chosen": -548.0, "logps/rejected": -700.0, "loss": 0.1477, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.9375, "rewards/margins": 7.3125, "rewards/rejected": -18.25, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 3.1703555226123155, "learning_rate": 3.03401620981453e-05, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -588.0, "logps/rejected": -724.0, "loss": 0.1567, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.0625, "rewards/margins": 8.125, "rewards/rejected": -19.25, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 1.6396098144570719, "learning_rate": 3.019133925031915e-05, "logits/chosen": -12.25, "logits/rejected": -12.0625, "logps/chosen": -572.0, "logps/rejected": -692.0, "loss": 0.1122, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.125, "rewards/margins": 6.875, "rewards/rejected": -19.0, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 3.24108170871626, "learning_rate": 3.004232389539011e-05, "logits/chosen": -12.25, "logits/rejected": -12.0, "logps/chosen": -524.0, "logps/rejected": -640.0, "loss": 0.1385, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.25, "rewards/margins": 7.09375, "rewards/rejected": -18.375, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 1.6066493200954974, "learning_rate": 2.9893121559198983e-05, "logits/chosen": -12.6875, "logits/rejected": -12.875, "logps/chosen": -528.0, "logps/rejected": -632.0, "loss": 0.1041, "rewards/accuracies": 0.875, "rewards/chosen": -10.75, "rewards/margins": 6.75, "rewards/rejected": -17.5, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 1.4015525250582879, "learning_rate": 2.974373777452027e-05, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -588.0, "logps/rejected": -664.0, "loss": 0.2043, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.5625, "rewards/margins": 6.90625, "rewards/rejected": -17.5, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 3.1847220201497715, "learning_rate": 2.959417808085702e-05, "logits/chosen": -12.375, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -636.0, "loss": 0.1418, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.5625, "rewards/margins": 7.5, "rewards/rejected": -18.0, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 6.675033635871589, "learning_rate": 2.9444448024235422e-05, "logits/chosen": -11.875, "logits/rejected": -12.0, "logps/chosen": -544.0, "logps/rejected": -696.0, "loss": 0.1554, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.3125, "rewards/margins": 8.6875, "rewards/rejected": -18.0, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 1.8005269219564757, "learning_rate": 2.9294553156999082e-05, "logits/chosen": -11.75, "logits/rejected": -11.8125, "logps/chosen": -552.0, "logps/rejected": -648.0, "loss": 0.1726, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.25, "rewards/margins": 6.53125, "rewards/rejected": -16.75, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 3.5018889880690436, "learning_rate": 2.9144499037603207e-05, "logits/chosen": -11.5625, "logits/rejected": -11.5625, "logps/chosen": -540.0, "logps/rejected": -652.0, "loss": 0.1742, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.75, "rewards/margins": 8.0625, "rewards/rejected": -17.75, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 4.176359733461575, "learning_rate": 2.8994291230408432e-05, "logits/chosen": -11.4375, "logits/rejected": -11.6875, "logps/chosen": -600.0, "logps/rejected": -684.0, "loss": 0.1528, "rewards/accuracies": 0.9375, "rewards/chosen": -11.0, "rewards/margins": 8.1875, "rewards/rejected": -19.125, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 3.263054937351028, "learning_rate": 2.8843935305474524e-05, "logits/chosen": -12.5, "logits/rejected": -12.375, "logps/chosen": -588.0, "logps/rejected": -764.0, "loss": 0.1019, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.8125, "rewards/margins": 10.625, "rewards/rejected": -21.5, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 6.019092926637461, "learning_rate": 2.869343683835376e-05, "logits/chosen": -12.75, "logits/rejected": -12.4375, "logps/chosen": -536.0, "logps/rejected": -676.0, "loss": 0.174, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.75, "rewards/margins": 7.53125, "rewards/rejected": -19.25, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 3.2240174666369628, "learning_rate": 2.8542801409884258e-05, "logits/chosen": -12.3125, "logits/rejected": -12.1875, "logps/chosen": -588.0, "logps/rejected": -736.0, "loss": 0.1138, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.75, "rewards/margins": 7.40625, "rewards/rejected": -19.125, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 3.2530918121387717, "learning_rate": 2.839203460598297e-05, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -600.0, "logps/rejected": -752.0, "loss": 0.1355, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.25, "rewards/margins": 7.6875, "rewards/rejected": -19.0, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 4.134756743952236, "learning_rate": 2.824114201743856e-05, "logits/chosen": -12.375, "logits/rejected": -12.3125, "logps/chosen": -564.0, "logps/rejected": -716.0, "loss": 0.1474, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.5625, "rewards/margins": 9.1875, "rewards/rejected": -19.75, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 5.566935384470763, "learning_rate": 2.8090129239704083e-05, "logits/chosen": -12.5, "logits/rejected": -12.6875, "logps/chosen": -536.0, "logps/rejected": -592.0, "loss": 0.1594, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.3125, "rewards/margins": 6.9375, "rewards/rejected": -18.25, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 1.021451185458759, "learning_rate": 2.7939001872689498e-05, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -520.0, "logps/rejected": -604.0, "loss": 0.147, "rewards/accuracies": 0.875, "rewards/chosen": -11.0, "rewards/margins": 6.4375, "rewards/rejected": -17.5, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 4.827200174694787, "learning_rate": 2.7787765520553984e-05, "logits/chosen": -12.125, "logits/rejected": -12.375, "logps/chosen": -536.0, "logps/rejected": -632.0, "loss": 0.1299, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.6875, "rewards/margins": 7.21875, "rewards/rejected": -17.875, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 4.491667565433733, "learning_rate": 2.7636425791498178e-05, "logits/chosen": -12.1875, "logits/rejected": -12.25, "logps/chosen": -592.0, "logps/rejected": -672.0, "loss": 0.161, "rewards/accuracies": 0.9375, "rewards/chosen": -11.3125, "rewards/margins": 6.6875, "rewards/rejected": -18.0, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 3.4830523290144852, "learning_rate": 2.748498829755615e-05, "logits/chosen": -12.0, "logits/rejected": -11.8125, "logps/chosen": -532.0, "logps/rejected": -696.0, "loss": 0.1582, "rewards/accuracies": 0.9375, "rewards/chosen": -11.3125, "rewards/margins": 7.6875, "rewards/rejected": -19.0, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 5.529987237479786, "learning_rate": 2.7333458654387345e-05, "logits/chosen": -12.0, "logits/rejected": -12.125, "logps/chosen": -592.0, "logps/rejected": -688.0, "loss": 0.1609, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.75, "rewards/margins": 7.40625, "rewards/rejected": -18.125, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 7.669263543730199, "learning_rate": 2.7181842481068282e-05, "logits/chosen": -11.875, "logits/rejected": -11.9375, "logps/chosen": -584.0, "logps/rejected": -716.0, "loss": 0.1702, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.6875, "rewards/margins": 7.78125, "rewards/rejected": -18.5, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 5.58838804324731, "learning_rate": 2.703014539988428e-05, "logits/chosen": -12.375, "logits/rejected": -12.5625, "logps/chosen": -572.0, "logps/rejected": -636.0, "loss": 0.1976, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.6875, "rewards/margins": 5.65625, "rewards/rejected": -17.375, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 4.214778326380665, "learning_rate": 2.6878373036120852e-05, "logits/chosen": -11.875, "logits/rejected": -12.0625, "logps/chosen": -596.0, "logps/rejected": -648.0, "loss": 0.1694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.6875, "rewards/margins": 5.84375, "rewards/rejected": -17.5, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 4.08908410186959, "learning_rate": 2.6726531017855194e-05, "logits/chosen": -12.1875, "logits/rejected": -12.125, "logps/chosen": -524.0, "logps/rejected": -668.0, "loss": 0.1309, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.125, "rewards/margins": 7.75, "rewards/rejected": -17.875, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 2.1197256668242597, "learning_rate": 2.657462497574747e-05, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -498.0, "logps/rejected": -576.0, "loss": 0.1233, "rewards/accuracies": 0.9375, "rewards/chosen": -10.75, "rewards/margins": 6.5, "rewards/rejected": -17.25, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 2.5029724664557276, "learning_rate": 2.642266054283198e-05, "logits/chosen": -11.9375, "logits/rejected": -12.375, "logps/chosen": -600.0, "logps/rejected": -660.0, "loss": 0.1141, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.6875, "rewards/margins": 7.8125, "rewards/rejected": -18.5, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 3.899947414356875, "learning_rate": 2.6270643354308288e-05, "logits/chosen": -11.9375, "logits/rejected": -12.0, "logps/chosen": -564.0, "logps/rejected": -684.0, "loss": 0.1493, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.5, "rewards/margins": 7.65625, "rewards/rejected": -18.25, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 2.3195694442688977, "learning_rate": 2.611857904733227e-05, "logits/chosen": -11.8125, "logits/rejected": -12.3125, "logps/chosen": -560.0, "logps/rejected": -620.0, "loss": 0.1386, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.1875, "rewards/margins": 7.28125, "rewards/rejected": -17.5, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 2.0725325059582, "learning_rate": 2.5966473260807078e-05, "logits/chosen": -11.875, "logits/rejected": -11.9375, "logps/chosen": -576.0, "logps/rejected": -720.0, "loss": 0.125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.375, "rewards/margins": 8.625, "rewards/rejected": -19.0, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 4.113993754263742, "learning_rate": 2.5814331635173987e-05, "logits/chosen": -11.6875, "logits/rejected": -11.8125, "logps/chosen": -560.0, "logps/rejected": -656.0, "loss": 0.172, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.8125, "rewards/margins": 6.28125, "rewards/rejected": -17.125, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 2.678743740836168, "learning_rate": 2.5662159812203313e-05, "logits/chosen": -12.0, "logits/rejected": -11.875, "logps/chosen": -512.0, "logps/rejected": -636.0, "loss": 0.1411, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.4375, "rewards/margins": 6.5625, "rewards/rejected": -17.0, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 2.9143799299622337, "learning_rate": 2.550996343478514e-05, "logits/chosen": -11.5625, "logits/rejected": -12.125, "logps/chosen": -568.0, "logps/rejected": -652.0, "loss": 0.1315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.3125, "rewards/margins": 8.375, "rewards/rejected": -17.75, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 6.560949719147845, "learning_rate": 2.535774814672008e-05, "logits/chosen": -11.5, "logits/rejected": -12.0, "logps/chosen": -512.0, "logps/rejected": -580.0, "loss": 0.1572, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.5625, "rewards/margins": 6.8125, "rewards/rejected": -16.375, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 5.931912626292325, "learning_rate": 2.5205519592509995e-05, "logits/chosen": -11.9375, "logits/rejected": -12.1875, "logps/chosen": -502.0, "logps/rejected": -624.0, "loss": 0.1467, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.3125, "rewards/margins": 7.125, "rewards/rejected": -17.375, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 7.463777806346751, "learning_rate": 2.505328341714873e-05, "logits/chosen": -12.0625, "logits/rejected": -12.375, "logps/chosen": -536.0, "logps/rejected": -592.0, "loss": 0.1254, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.4375, "rewards/margins": 6.53125, "rewards/rejected": -17.0, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 3.2753809334776762, "learning_rate": 2.490104526591269e-05, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -544.0, "logps/rejected": -644.0, "loss": 0.1273, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.1875, "rewards/margins": 6.875, "rewards/rejected": -17.0, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 2.3621905847019034, "learning_rate": 2.474881078415156e-05, "logits/chosen": -12.625, "logits/rejected": -12.9375, "logps/chosen": -532.0, "logps/rejected": -600.0, "loss": 0.0922, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.3125, "rewards/margins": 6.6875, "rewards/rejected": -17.0, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 6.109434642218217, "learning_rate": 2.4596585617078982e-05, "logits/chosen": -12.4375, "logits/rejected": -12.25, "logps/chosen": -544.0, "logps/rejected": -672.0, "loss": 0.1383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.75, "rewards/margins": 6.96875, "rewards/rejected": -17.75, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 5.313432208482434, "learning_rate": 2.4444375409563145e-05, "logits/chosen": -12.6875, "logits/rejected": -12.75, "logps/chosen": -544.0, "logps/rejected": -668.0, "loss": 0.2005, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.875, "rewards/margins": 8.125, "rewards/rejected": -19.0, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 9.780521211739286, "learning_rate": 2.429218580591753e-05, "logits/chosen": -12.625, "logits/rejected": -12.625, "logps/chosen": -588.0, "logps/rejected": -620.0, "loss": 0.2085, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.1875, "rewards/margins": 5.4375, "rewards/rejected": -16.625, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 6.9752987720127475, "learning_rate": 2.4140022449691583e-05, "logits/chosen": -12.0625, "logits/rejected": -12.125, "logps/chosen": -580.0, "logps/rejected": -692.0, "loss": 0.1815, "rewards/accuracies": 0.9375, "rewards/chosen": -10.9375, "rewards/margins": 7.625, "rewards/rejected": -18.5, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 2.104780380076254, "learning_rate": 2.3987890983461407e-05, "logits/chosen": -12.1875, "logits/rejected": -12.375, "logps/chosen": -580.0, "logps/rejected": -744.0, "loss": 0.1257, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.5, "rewards/margins": 9.875, "rewards/rejected": -20.375, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 4.920138400963844, "learning_rate": 2.3835797048620567e-05, "logits/chosen": -12.375, "logits/rejected": -12.625, "logps/chosen": -552.0, "logps/rejected": -616.0, "loss": 0.2009, "rewards/accuracies": 0.9375, "rewards/chosen": -11.125, "rewards/margins": 6.25, "rewards/rejected": -17.375, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 6.088610610369365, "learning_rate": 2.368374628517088e-05, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -532.0, "logps/rejected": -624.0, "loss": 0.1616, "rewards/accuracies": 0.9375, "rewards/chosen": -10.5, "rewards/margins": 6.40625, "rewards/rejected": -16.875, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 0.9648785608001027, "learning_rate": 2.353174433151325e-05, "logits/chosen": -12.1875, "logits/rejected": -12.25, "logps/chosen": -490.0, "logps/rejected": -644.0, "loss": 0.1135, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.4375, "rewards/margins": 7.96875, "rewards/rejected": -17.375, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 7.125968557878401, "learning_rate": 2.3379796824238608e-05, "logits/chosen": -12.0, "logits/rejected": -12.3125, "logps/chosen": -520.0, "logps/rejected": -612.0, "loss": 0.1642, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.8125, "rewards/margins": 6.28125, "rewards/rejected": -17.0, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 2.1436506196796894, "learning_rate": 2.3227909397918897e-05, "logits/chosen": -11.6875, "logits/rejected": -11.6875, "logps/chosen": -548.0, "logps/rejected": -772.0, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": -8.875, "rewards/margins": 10.8125, "rewards/rejected": -19.625, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 3.8126804432159385, "learning_rate": 2.307608768489808e-05, "logits/chosen": -11.8125, "logits/rejected": -12.125, "logps/chosen": -564.0, "logps/rejected": -700.0, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.375, "rewards/margins": 8.9375, "rewards/rejected": -19.25, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 5.170043751490522, "learning_rate": 2.2924337315083356e-05, "logits/chosen": -11.9375, "logits/rejected": -12.25, "logps/chosen": -552.0, "logps/rejected": -680.0, "loss": 0.1538, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.375, "rewards/margins": 7.25, "rewards/rejected": -18.625, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 3.086020388242938, "learning_rate": 2.277266391573633e-05, "logits/chosen": -11.875, "logits/rejected": -12.0625, "logps/chosen": -568.0, "logps/rejected": -676.0, "loss": 0.1303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.625, "rewards/margins": 8.6875, "rewards/rejected": -19.25, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 6.888685972751711, "learning_rate": 2.262107311126436e-05, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -540.0, "logps/rejected": -640.0, "loss": 0.1396, "rewards/accuracies": 0.9375, "rewards/chosen": -11.0, "rewards/margins": 7.96875, "rewards/rejected": -19.0, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 3.3674197456767434, "learning_rate": 2.2469570523011996e-05, "logits/chosen": -12.125, "logits/rejected": -12.0625, "logps/chosen": -560.0, "logps/rejected": -704.0, "loss": 0.1428, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.9375, "rewards/margins": 7.46875, "rewards/rejected": -19.375, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 1.53918095587136, "learning_rate": 2.2318161769052525e-05, "logits/chosen": -11.875, "logits/rejected": -12.0, "logps/chosen": -592.0, "logps/rejected": -676.0, "loss": 0.1163, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -12.0, "rewards/margins": 7.3125, "rewards/rejected": -19.375, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 1.6924601836899695, "learning_rate": 2.2166852463979625e-05, "logits/chosen": -12.625, "logits/rejected": -12.5, "logps/chosen": -548.0, "logps/rejected": -672.0, "loss": 0.1679, "rewards/accuracies": 0.9375, "rewards/chosen": -11.375, "rewards/margins": 7.5625, "rewards/rejected": -19.0, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 3.439235498676025, "learning_rate": 2.2015648218699202e-05, "logits/chosen": -12.375, "logits/rejected": -12.375, "logps/chosen": -492.0, "logps/rejected": -608.0, "loss": 0.1573, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.0, "rewards/margins": 7.875, "rewards/rejected": -17.875, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 2.5725031256973536, "learning_rate": 2.1864554640221245e-05, "logits/chosen": -12.25, "logits/rejected": -12.25, "logps/chosen": -520.0, "logps/rejected": -632.0, "loss": 0.1078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.375, "rewards/margins": 6.5625, "rewards/rejected": -18.0, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 2.3780858254933848, "learning_rate": 2.1713577331452017e-05, "logits/chosen": -12.25, "logits/rejected": -12.6875, "logps/chosen": -608.0, "logps/rejected": -648.0, "loss": 0.1472, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.9375, "rewards/margins": 6.8125, "rewards/rejected": -17.75, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 3.788476799560272, "learning_rate": 2.1562721890986202e-05, "logits/chosen": -12.25, "logits/rejected": -12.5625, "logps/chosen": -536.0, "logps/rejected": -636.0, "loss": 0.1387, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.9375, "rewards/margins": 7.53125, "rewards/rejected": -18.5, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 3.0735850422446442, "learning_rate": 2.1411993912899285e-05, "logits/chosen": -12.0625, "logits/rejected": -11.9375, "logps/chosen": -516.0, "logps/rejected": -676.0, "loss": 0.1236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.875, "rewards/margins": 7.0625, "rewards/rejected": -18.0, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 5.819898842195535, "learning_rate": 2.126139898654021e-05, "logits/chosen": -12.375, "logits/rejected": -12.5625, "logps/chosen": -552.0, "logps/rejected": -628.0, "loss": 0.1581, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.8125, "rewards/margins": 7.5625, "rewards/rejected": -18.375, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 3.1283675613331243, "learning_rate": 2.1110942696324017e-05, "logits/chosen": -11.6875, "logits/rejected": -11.8125, "logps/chosen": -548.0, "logps/rejected": -664.0, "loss": 0.1817, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.0625, "rewards/margins": 8.0, "rewards/rejected": -18.125, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 5.783953308594588, "learning_rate": 2.0960630621524763e-05, "logits/chosen": -11.625, "logits/rejected": -12.25, "logps/chosen": -604.0, "logps/rejected": -616.0, "loss": 0.2123, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.5625, "rewards/margins": 6.78125, "rewards/rejected": -17.375, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 4.261494497992645, "learning_rate": 2.0810468336068696e-05, "logits/chosen": -12.0625, "logits/rejected": -12.25, "logps/chosen": -508.0, "logps/rejected": -584.0, "loss": 0.1938, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.9375, "rewards/margins": 5.25, "rewards/rejected": -16.25, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 1.741639534150338, "learning_rate": 2.0660461408327536e-05, "logits/chosen": -11.5625, "logits/rejected": -11.75, "logps/chosen": -600.0, "logps/rejected": -716.0, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.3125, "rewards/margins": 9.0625, "rewards/rejected": -19.375, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 1.2924995018776002, "learning_rate": 2.051061540091191e-05, "logits/chosen": -11.8125, "logits/rejected": -12.1875, "logps/chosen": -548.0, "logps/rejected": -628.0, "loss": 0.1469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.25, "rewards/margins": 6.53125, "rewards/rejected": -17.75, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 6.814794323471826, "learning_rate": 2.0360935870465188e-05, "logits/chosen": -11.875, "logits/rejected": -12.25, "logps/chosen": -584.0, "logps/rejected": -680.0, "loss": 0.137, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.25, "rewards/margins": 7.59375, "rewards/rejected": -18.875, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 5.821728263769117, "learning_rate": 2.021142836745739e-05, "logits/chosen": -12.0, "logits/rejected": -12.375, "logps/chosen": -572.0, "logps/rejected": -656.0, "loss": 0.1666, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.5, "rewards/margins": 7.8125, "rewards/rejected": -19.25, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 4.713492097008847, "learning_rate": 2.006209843597931e-05, "logits/chosen": -12.25, "logits/rejected": -12.0, "logps/chosen": -604.0, "logps/rejected": -716.0, "loss": 0.1517, "rewards/accuracies": 0.9375, "rewards/chosen": -11.875, "rewards/margins": 8.0, "rewards/rejected": -19.875, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 3.673927096473944, "learning_rate": 1.9912951613537e-05, "logits/chosen": -11.6875, "logits/rejected": -12.125, "logps/chosen": -564.0, "logps/rejected": -636.0, "loss": 0.1386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.3125, "rewards/margins": 7.34375, "rewards/rejected": -17.625, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 7.301159459916146, "learning_rate": 1.9763993430846395e-05, "logits/chosen": -12.0625, "logits/rejected": -12.5625, "logps/chosen": -564.0, "logps/rejected": -624.0, "loss": 0.1443, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.4375, "rewards/margins": 5.90625, "rewards/rejected": -17.375, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 3.656823847113072, "learning_rate": 1.9615229411628215e-05, "logits/chosen": -12.0, "logits/rejected": -12.0625, "logps/chosen": -498.0, "logps/rejected": -620.0, "loss": 0.1425, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.3125, "rewards/margins": 5.65625, "rewards/rejected": -17.0, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 3.2318169109546697, "learning_rate": 1.9466665072403142e-05, "logits/chosen": -11.75, "logits/rejected": -12.0625, "logps/chosen": -588.0, "logps/rejected": -672.0, "loss": 0.117, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.5625, "rewards/margins": 7.125, "rewards/rejected": -18.75, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 1.5645281327092355, "learning_rate": 1.931830592228727e-05, "logits/chosen": -11.6875, "logits/rejected": -11.6875, "logps/chosen": -524.0, "logps/rejected": -640.0, "loss": 0.1387, "rewards/accuracies": 0.9375, "rewards/chosen": -10.25, "rewards/margins": 6.65625, "rewards/rejected": -16.875, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 2.110816123401139, "learning_rate": 1.9170157462787764e-05, "logits/chosen": -11.75, "logits/rejected": -12.125, "logps/chosen": -608.0, "logps/rejected": -648.0, "loss": 0.1096, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.3125, "rewards/margins": 7.40625, "rewards/rejected": -17.75, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 6.374688050556006, "learning_rate": 1.902222518759891e-05, "logits/chosen": -11.4375, "logits/rejected": -11.8125, "logps/chosen": -604.0, "logps/rejected": -724.0, "loss": 0.1465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.5, "rewards/margins": 8.4375, "rewards/rejected": -18.875, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 6.999097914110718, "learning_rate": 1.887451458239837e-05, "logits/chosen": -11.5625, "logits/rejected": -11.6875, "logps/chosen": -564.0, "logps/rejected": -696.0, "loss": 0.161, "rewards/accuracies": 0.9375, "rewards/chosen": -10.6875, "rewards/margins": 7.71875, "rewards/rejected": -18.375, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 4.0625683050938, "learning_rate": 1.872703112464374e-05, "logits/chosen": -12.0, "logits/rejected": -12.3125, "logps/chosen": -520.0, "logps/rejected": -624.0, "loss": 0.1101, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.375, "rewards/margins": 6.65625, "rewards/rejected": -17.0, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 2.9250058221000166, "learning_rate": 1.8579780283369475e-05, "logits/chosen": -11.875, "logits/rejected": -12.125, "logps/chosen": -556.0, "logps/rejected": -640.0, "loss": 0.1699, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -10.4375, "rewards/margins": 7.40625, "rewards/rejected": -17.875, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 3.3917682629483004, "learning_rate": 1.8432767518984047e-05, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -556.0, "logps/rejected": -664.0, "loss": 0.1231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.0, "rewards/margins": 7.5625, "rewards/rejected": -18.625, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 4.3214481399637386, "learning_rate": 1.828599828306748e-05, "logits/chosen": -12.5, "logits/rejected": -12.625, "logps/chosen": -506.0, "logps/rejected": -604.0, "loss": 0.1113, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -10.8125, "rewards/margins": 7.0625, "rewards/rejected": -17.875, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 5.266498753372491, "learning_rate": 1.8139478018169197e-05, "logits/chosen": -12.4375, "logits/rejected": -12.375, "logps/chosen": -520.0, "logps/rejected": -640.0, "loss": 0.1884, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.0625, "rewards/margins": 7.5625, "rewards/rejected": -18.625, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 1.5962079455355056, "learning_rate": 1.7993212157606172e-05, "logits/chosen": -12.1875, "logits/rejected": -12.0625, "logps/chosen": -516.0, "logps/rejected": -624.0, "loss": 0.1699, "rewards/accuracies": 0.9375, "rewards/chosen": -10.75, "rewards/margins": 6.25, "rewards/rejected": -17.0, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 4.621607845404804, "learning_rate": 1.784720612526148e-05, "logits/chosen": -12.875, "logits/rejected": -12.875, "logps/chosen": -540.0, "logps/rejected": -664.0, "loss": 0.1533, "rewards/accuracies": 0.9375, "rewards/chosen": -11.9375, "rewards/margins": 6.3125, "rewards/rejected": -18.25, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 5.216061893758242, "learning_rate": 1.770146533538315e-05, "logits/chosen": -12.75, "logits/rejected": -12.875, "logps/chosen": -544.0, "logps/rejected": -608.0, "loss": 0.1264, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -11.3125, "rewards/margins": 6.875, "rewards/rejected": -18.25, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 1.709150410480554, "learning_rate": 1.755599519238338e-05, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -500.0, "logps/rejected": -712.0, "loss": 0.1324, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.9375, "rewards/margins": 8.3125, "rewards/rejected": -19.25, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 5.0117582881977425, "learning_rate": 1.741080109063817e-05, "logits/chosen": -12.9375, "logits/rejected": -12.625, "logps/chosen": -544.0, "logps/rejected": -644.0, "loss": 0.1836, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -12.1875, "rewards/margins": 6.25, "rewards/rejected": -18.5, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 3.8566029873849295, "learning_rate": 1.7265888414287247e-05, "logits/chosen": -12.4375, "logits/rejected": -12.875, "logps/chosen": -580.0, "logps/rejected": -652.0, "loss": 0.157, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.25, "rewards/margins": 6.9375, "rewards/rejected": -18.25, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 5.667257724840644, "learning_rate": 1.7121262537034397e-05, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -604.0, "logps/rejected": -688.0, "loss": 0.1454, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.75, "rewards/margins": 7.34375, "rewards/rejected": -19.125, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 3.0310912811799913, "learning_rate": 1.6976928821948263e-05, "logits/chosen": -13.0625, "logits/rejected": -13.0625, "logps/chosen": -504.0, "logps/rejected": -620.0, "loss": 0.1329, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.9375, "rewards/margins": 6.5625, "rewards/rejected": -17.5, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 5.731239526582649, "learning_rate": 1.6832892621263407e-05, "logits/chosen": -12.6875, "logits/rejected": -12.875, "logps/chosen": -588.0, "logps/rejected": -708.0, "loss": 0.1494, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.1875, "rewards/margins": 8.8125, "rewards/rejected": -20.0, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 4.90791941322355, "learning_rate": 1.6689159276181832e-05, "logits/chosen": -13.25, "logits/rejected": -13.1875, "logps/chosen": -516.0, "logps/rejected": -628.0, "loss": 0.1598, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.1875, "rewards/margins": 6.84375, "rewards/rejected": -18.0, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 5.067966638143667, "learning_rate": 1.6545734116674966e-05, "logits/chosen": -12.25, "logits/rejected": -12.625, "logps/chosen": -556.0, "logps/rejected": -612.0, "loss": 0.1497, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.8125, "rewards/margins": 6.84375, "rewards/rejected": -17.75, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 1.0617423449341068, "learning_rate": 1.6402622461286003e-05, "logits/chosen": -12.8125, "logits/rejected": -13.25, "logps/chosen": -588.0, "logps/rejected": -656.0, "loss": 0.1265, "rewards/accuracies": 0.9375, "rewards/chosen": -10.9375, "rewards/margins": 7.25, "rewards/rejected": -18.125, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 4.244712168957068, "learning_rate": 1.625982961693262e-05, "logits/chosen": -12.625, "logits/rejected": -12.75, "logps/chosen": -584.0, "logps/rejected": -688.0, "loss": 0.1025, "rewards/accuracies": 0.9375, "rewards/chosen": -10.125, "rewards/margins": 8.6875, "rewards/rejected": -18.75, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 6.34435554599616, "learning_rate": 1.6117360878710265e-05, "logits/chosen": -12.9375, "logits/rejected": -12.875, "logps/chosen": -536.0, "logps/rejected": -640.0, "loss": 0.1722, "rewards/accuracies": 0.9375, "rewards/chosen": -10.5, "rewards/margins": 7.125, "rewards/rejected": -17.625, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 4.415617650295368, "learning_rate": 1.5975221529695774e-05, "logits/chosen": -12.125, "logits/rejected": -12.5, "logps/chosen": -556.0, "logps/rejected": -628.0, "loss": 0.1358, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -11.0, "rewards/margins": 7.3125, "rewards/rejected": -18.25, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 3.059644737728734, "learning_rate": 1.583341684075141e-05, "logits/chosen": -12.6875, "logits/rejected": -12.9375, "logps/chosen": -510.0, "logps/rejected": -612.0, "loss": 0.1268, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.875, "rewards/margins": 6.71875, "rewards/rejected": -17.5, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 5.247635738453357, "learning_rate": 1.5691952070329495e-05, "logits/chosen": -12.5, "logits/rejected": -12.6875, "logps/chosen": -576.0, "logps/rejected": -708.0, "loss": 0.1451, "rewards/accuracies": 0.9375, "rewards/chosen": -10.6875, "rewards/margins": 8.875, "rewards/rejected": -19.625, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 7.712781298345494, "learning_rate": 1.555083246427734e-05, "logits/chosen": -12.5, "logits/rejected": -12.625, "logps/chosen": -600.0, "logps/rejected": -704.0, "loss": 0.0989, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.125, "rewards/margins": 7.875, "rewards/rejected": -19.0, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 1.7203767588512628, "learning_rate": 1.541006325564277e-05, "logits/chosen": -12.5, "logits/rejected": -12.8125, "logps/chosen": -560.0, "logps/rejected": -680.0, "loss": 0.1498, "rewards/accuracies": 0.875, "rewards/chosen": -11.8125, "rewards/margins": 7.5, "rewards/rejected": -19.375, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 0.9271784942387864, "learning_rate": 1.5269649664480038e-05, "logits/chosen": -12.9375, "logits/rejected": -13.125, "logps/chosen": -580.0, "logps/rejected": -680.0, "loss": 0.1312, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.0625, "rewards/margins": 8.25, "rewards/rejected": -19.25, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 1.754148795500601, "learning_rate": 1.5129596897656257e-05, "logits/chosen": -13.1875, "logits/rejected": -13.0625, "logps/chosen": -560.0, "logps/rejected": -660.0, "loss": 0.0999, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.25, "rewards/margins": 7.9375, "rewards/rejected": -19.25, "step": 3820 }, { "epoch": 2.0, "eval_logits/chosen": -13.0, "eval_logits/rejected": -13.0625, "eval_logps/chosen": -604.0, "eval_logps/rejected": -608.0, "eval_loss": 0.8018984198570251, "eval_rewards/accuracies": 0.69921875, "eval_rewards/chosen": -14.125, "eval_rewards/margins": 1.8125, "eval_rewards/rejected": -15.9375, "eval_runtime": 46.7492, "eval_samples_per_second": 42.781, "eval_steps_per_second": 0.685, "step": 3822 }, { "epoch": 2.004186289900576, "grad_norm": 0.44069773718681493, "learning_rate": 1.4989910148658325e-05, "logits/chosen": -12.8125, "logits/rejected": -12.375, "logps/chosen": -584.0, "logps/rejected": -720.0, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -11.3125, "rewards/margins": 9.375, "rewards/rejected": -20.625, "step": 3830 }, { "epoch": 2.009419152276295, "grad_norm": 0.5769119803775666, "learning_rate": 1.4850594597400352e-05, "logits/chosen": -12.75, "logits/rejected": -12.5625, "logps/chosen": -592.0, "logps/rejected": -744.0, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.625, "rewards/margins": 11.9375, "rewards/rejected": -22.5, "step": 3840 }, { "epoch": 2.0146520146520146, "grad_norm": 0.5107802441858381, "learning_rate": 1.4711655410031538e-05, "logits/chosen": -12.8125, "logits/rejected": -12.5, "logps/chosen": -532.0, "logps/rejected": -684.0, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -10.625, "rewards/margins": 10.1875, "rewards/rejected": -20.875, "step": 3850 }, { "epoch": 2.0198848770277342, "grad_norm": 0.5560067424928957, "learning_rate": 1.4573097738744623e-05, "logits/chosen": -13.0, "logits/rejected": -12.3125, "logps/chosen": -498.0, "logps/rejected": -700.0, "loss": 0.0208, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.6875, "rewards/margins": 10.4375, "rewards/rejected": -21.125, "step": 3860 }, { "epoch": 2.025117739403454, "grad_norm": 0.691304079384525, "learning_rate": 1.4434926721584865e-05, "logits/chosen": -12.9375, "logits/rejected": -12.875, "logps/chosen": -532.0, "logps/rejected": -704.0, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -10.3125, "rewards/margins": 10.25, "rewards/rejected": -20.625, "step": 3870 }, { "epoch": 2.030350601779173, "grad_norm": 4.441051778595759, "learning_rate": 1.4297147482259424e-05, "logits/chosen": -13.25, "logits/rejected": -12.5625, "logps/chosen": -512.0, "logps/rejected": -684.0, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.3125, "rewards/margins": 11.0, "rewards/rejected": -21.375, "step": 3880 }, { "epoch": 2.0355834641548927, "grad_norm": 0.2629076990932687, "learning_rate": 1.4159765129947445e-05, "logits/chosen": -12.625, "logits/rejected": -12.25, "logps/chosen": -552.0, "logps/rejected": -660.0, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -10.3125, "rewards/margins": 10.375, "rewards/rejected": -20.75, "step": 3890 }, { "epoch": 2.0408163265306123, "grad_norm": 2.621446854010498, "learning_rate": 1.4022784759110577e-05, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -484.0, "logps/rejected": -688.0, "loss": 0.0371, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.3125, "rewards/margins": 10.25, "rewards/rejected": -20.5, "step": 3900 }, { "epoch": 2.046049188906332, "grad_norm": 0.3214305037411537, "learning_rate": 1.3886211449304005e-05, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -508.0, "logps/rejected": -704.0, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -10.125, "rewards/margins": 9.375, "rewards/rejected": -19.5, "step": 3910 }, { "epoch": 2.051282051282051, "grad_norm": 0.26615797826258786, "learning_rate": 1.3750050264988173e-05, "logits/chosen": -13.125, "logits/rejected": -12.6875, "logps/chosen": -472.0, "logps/rejected": -676.0, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -9.875, "rewards/margins": 10.5625, "rewards/rejected": -20.375, "step": 3920 }, { "epoch": 2.0565149136577707, "grad_norm": 0.1629150297796315, "learning_rate": 1.361430625534092e-05, "logits/chosen": -13.25, "logits/rejected": -13.125, "logps/chosen": -560.0, "logps/rejected": -720.0, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.125, "rewards/margins": 12.4375, "rewards/rejected": -22.5, "step": 3930 }, { "epoch": 2.0617477760334904, "grad_norm": 0.4338741888563191, "learning_rate": 1.3478984454070274e-05, "logits/chosen": -13.3125, "logits/rejected": -12.8125, "logps/chosen": -492.0, "logps/rejected": -752.0, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.125, "rewards/margins": 13.0, "rewards/rejected": -22.125, "step": 3940 }, { "epoch": 2.06698063840921, "grad_norm": 0.6659403344792019, "learning_rate": 1.334408987922777e-05, "logits/chosen": -13.3125, "logits/rejected": -12.5, "logps/chosen": -544.0, "logps/rejected": -728.0, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -10.375, "rewards/margins": 11.8125, "rewards/rejected": -22.125, "step": 3950 }, { "epoch": 2.072213500784929, "grad_norm": 0.2383534684121332, "learning_rate": 1.3209627533022396e-05, "logits/chosen": -12.875, "logits/rejected": -12.25, "logps/chosen": -532.0, "logps/rejected": -744.0, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 12.25, "rewards/rejected": -22.75, "step": 3960 }, { "epoch": 2.077446363160649, "grad_norm": 0.17771246387777764, "learning_rate": 1.3075602401635056e-05, "logits/chosen": -13.1875, "logits/rejected": -12.5, "logps/chosen": -520.0, "logps/rejected": -664.0, "loss": 0.0137, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.0625, "rewards/margins": 10.25, "rewards/rejected": -21.25, "step": 3970 }, { "epoch": 2.0826792255363684, "grad_norm": 0.329137920392516, "learning_rate": 1.2942019455033715e-05, "logits/chosen": -12.8125, "logits/rejected": -12.125, "logps/chosen": -580.0, "logps/rejected": -800.0, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0625, "rewards/margins": 13.125, "rewards/rejected": -23.25, "step": 3980 }, { "epoch": 2.087912087912088, "grad_norm": 0.13778616370169536, "learning_rate": 1.2808883646789089e-05, "logits/chosen": -13.5, "logits/rejected": -12.875, "logps/chosen": -496.0, "logps/rejected": -688.0, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -10.375, "rewards/margins": 11.3125, "rewards/rejected": -21.75, "step": 3990 }, { "epoch": 2.0931449502878072, "grad_norm": 0.24055482258031696, "learning_rate": 1.2676199913890935e-05, "logits/chosen": -13.125, "logits/rejected": -12.75, "logps/chosen": -524.0, "logps/rejected": -736.0, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -9.9375, "rewards/margins": 12.875, "rewards/rejected": -22.75, "step": 4000 }, { "epoch": 2.098377812663527, "grad_norm": 0.12739837504095347, "learning_rate": 1.2543973176565014e-05, "logits/chosen": -13.0, "logits/rejected": -12.1875, "logps/chosen": -520.0, "logps/rejected": -756.0, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -10.8125, "rewards/margins": 13.125, "rewards/rejected": -24.0, "step": 4010 }, { "epoch": 2.1036106750392465, "grad_norm": 0.11079988686594298, "learning_rate": 1.2412208338090566e-05, "logits/chosen": -13.125, "logits/rejected": -12.5, "logps/chosen": -596.0, "logps/rejected": -828.0, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 13.75, "rewards/rejected": -24.0, "step": 4020 }, { "epoch": 2.108843537414966, "grad_norm": 0.9326656347824286, "learning_rate": 1.2280910284618583e-05, "logits/chosen": -13.0, "logits/rejected": -12.5, "logps/chosen": -528.0, "logps/rejected": -752.0, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 13.5, "rewards/rejected": -23.75, "step": 4030 }, { "epoch": 2.1140763997906853, "grad_norm": 1.5759110453700604, "learning_rate": 1.2150083884990538e-05, "logits/chosen": -13.25, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -764.0, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -11.25, "rewards/margins": 11.75, "rewards/rejected": -23.0, "step": 4040 }, { "epoch": 2.119309262166405, "grad_norm": 1.1917503318443903, "learning_rate": 1.201973399055788e-05, "logits/chosen": -13.125, "logits/rejected": -12.75, "logps/chosen": -548.0, "logps/rejected": -724.0, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -9.625, "rewards/margins": 12.125, "rewards/rejected": -21.75, "step": 4050 }, { "epoch": 2.1245421245421245, "grad_norm": 3.1551012011885393, "learning_rate": 1.1889865435002117e-05, "logits/chosen": -13.5, "logits/rejected": -12.75, "logps/chosen": -528.0, "logps/rejected": -724.0, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -10.5625, "rewards/margins": 11.75, "rewards/rejected": -22.375, "step": 4060 }, { "epoch": 2.129774986917844, "grad_norm": 1.2341246483692032, "learning_rate": 1.176048303415559e-05, "logits/chosen": -12.625, "logits/rejected": -12.1875, "logps/chosen": -496.0, "logps/rejected": -728.0, "loss": 0.0164, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.375, "rewards/margins": 12.1875, "rewards/rejected": -22.5, "step": 4070 }, { "epoch": 2.1350078492935634, "grad_norm": 0.12092505668455128, "learning_rate": 1.1631591585822841e-05, "logits/chosen": -13.1875, "logits/rejected": -12.75, "logps/chosen": -516.0, "logps/rejected": -760.0, "loss": 0.0248, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.0, "rewards/margins": 12.75, "rewards/rejected": -22.75, "step": 4080 }, { "epoch": 2.140240711669283, "grad_norm": 0.9346832597811591, "learning_rate": 1.1503195869602767e-05, "logits/chosen": -13.0625, "logits/rejected": -12.8125, "logps/chosen": -552.0, "logps/rejected": -728.0, "loss": 0.0156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.25, "rewards/margins": 11.3125, "rewards/rejected": -22.625, "step": 4090 }, { "epoch": 2.1454735740450026, "grad_norm": 0.26958900501293714, "learning_rate": 1.137530064671135e-05, "logits/chosen": -12.875, "logits/rejected": -12.25, "logps/chosen": -536.0, "logps/rejected": -748.0, "loss": 0.0172, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.75, "rewards/margins": 11.9375, "rewards/rejected": -22.75, "step": 4100 }, { "epoch": 2.1507064364207222, "grad_norm": 1.2068543614594405, "learning_rate": 1.1247910659805064e-05, "logits/chosen": -12.875, "logits/rejected": -12.4375, "logps/chosen": -532.0, "logps/rejected": -672.0, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -9.4375, "rewards/margins": 12.0625, "rewards/rejected": -21.5, "step": 4110 }, { "epoch": 2.155939298796442, "grad_norm": 0.07230196530823124, "learning_rate": 1.112103063280509e-05, "logits/chosen": -12.9375, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -804.0, "loss": 0.0082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.5625, "rewards/margins": 13.125, "rewards/rejected": -22.75, "step": 4120 }, { "epoch": 2.161172161172161, "grad_norm": 0.9441548421966137, "learning_rate": 1.0994665270722071e-05, "logits/chosen": -13.1875, "logits/rejected": -12.6875, "logps/chosen": -484.0, "logps/rejected": -756.0, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 12.5, "rewards/rejected": -22.375, "step": 4130 }, { "epoch": 2.1664050235478807, "grad_norm": 1.6049171624147622, "learning_rate": 1.0868819259481639e-05, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -560.0, "logps/rejected": -720.0, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -11.5, "rewards/margins": 11.0, "rewards/rejected": -22.5, "step": 4140 }, { "epoch": 2.1716378859236003, "grad_norm": 0.05234835972167443, "learning_rate": 1.0743497265750702e-05, "logits/chosen": -13.25, "logits/rejected": -12.375, "logps/chosen": -544.0, "logps/rejected": -724.0, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.875, "rewards/margins": 11.5625, "rewards/rejected": -22.375, "step": 4150 }, { "epoch": 2.17687074829932, "grad_norm": 0.1969413294246078, "learning_rate": 1.061870393676436e-05, "logits/chosen": -12.9375, "logits/rejected": -12.75, "logps/chosen": -556.0, "logps/rejected": -804.0, "loss": 0.0207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.75, "rewards/margins": 13.5625, "rewards/rejected": -23.25, "step": 4160 }, { "epoch": 2.182103610675039, "grad_norm": 0.6075595881302668, "learning_rate": 1.0494443900153558e-05, "logits/chosen": -13.0, "logits/rejected": -12.75, "logps/chosen": -576.0, "logps/rejected": -752.0, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -10.625, "rewards/margins": 11.625, "rewards/rejected": -22.25, "step": 4170 }, { "epoch": 2.1873364730507587, "grad_norm": 0.06443679481327427, "learning_rate": 1.0370721763773508e-05, "logits/chosen": -13.25, "logits/rejected": -12.75, "logps/chosen": -548.0, "logps/rejected": -752.0, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 12.125, "rewards/rejected": -22.375, "step": 4180 }, { "epoch": 2.1925693354264784, "grad_norm": 1.122736909133277, "learning_rate": 1.0247542115532846e-05, "logits/chosen": -13.0, "logits/rejected": -12.75, "logps/chosen": -510.0, "logps/rejected": -728.0, "loss": 0.0396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.625, "rewards/margins": 12.625, "rewards/rejected": -22.25, "step": 4190 }, { "epoch": 2.197802197802198, "grad_norm": 0.6888171252410781, "learning_rate": 1.0124909523223419e-05, "logits/chosen": -13.0, "logits/rejected": -12.5, "logps/chosen": -564.0, "logps/rejected": -760.0, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -9.875, "rewards/margins": 12.625, "rewards/rejected": -22.5, "step": 4200 }, { "epoch": 2.203035060177917, "grad_norm": 0.19411528574032383, "learning_rate": 1.0002828534350989e-05, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -552.0, "logps/rejected": -724.0, "loss": 0.0172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.875, "rewards/margins": 11.75, "rewards/rejected": -21.625, "step": 4210 }, { "epoch": 2.208267922553637, "grad_norm": 0.18615879946631778, "learning_rate": 9.881303675966525e-06, "logits/chosen": -12.875, "logits/rejected": -12.375, "logps/chosen": -544.0, "logps/rejected": -736.0, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5625, "rewards/margins": 12.1875, "rewards/rejected": -22.75, "step": 4220 }, { "epoch": 2.2135007849293564, "grad_norm": 0.06276505039432473, "learning_rate": 9.760339454498393e-06, "logits/chosen": -13.0625, "logits/rejected": -12.3125, "logps/chosen": -498.0, "logps/rejected": -720.0, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 12.0625, "rewards/rejected": -22.375, "step": 4230 }, { "epoch": 2.218733647305076, "grad_norm": 0.2906987272091601, "learning_rate": 9.639940355585219e-06, "logits/chosen": -13.0, "logits/rejected": -12.625, "logps/chosen": -584.0, "logps/rejected": -760.0, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.125, "rewards/margins": 11.3125, "rewards/rejected": -22.375, "step": 4240 }, { "epoch": 2.2239665096807952, "grad_norm": 1.8677189187726966, "learning_rate": 9.520110843909542e-06, "logits/chosen": -12.8125, "logits/rejected": -12.3125, "logps/chosen": -524.0, "logps/rejected": -700.0, "loss": 0.0168, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0, "rewards/margins": 10.6875, "rewards/rejected": -20.625, "step": 4250 }, { "epoch": 2.229199372056515, "grad_norm": 0.5031535847376246, "learning_rate": 9.400855363032262e-06, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -544.0, "logps/rejected": -764.0, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.75, "rewards/margins": 12.8125, "rewards/rejected": -22.5, "step": 4260 }, { "epoch": 2.2344322344322345, "grad_norm": 0.3299828837281531, "learning_rate": 9.282178335227884e-06, "logits/chosen": -12.625, "logits/rejected": -12.1875, "logps/chosen": -502.0, "logps/rejected": -772.0, "loss": 0.0247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.75, "rewards/margins": 13.0, "rewards/rejected": -22.75, "step": 4270 }, { "epoch": 2.239665096807954, "grad_norm": 0.8321963118058464, "learning_rate": 9.164084161320471e-06, "logits/chosen": -12.625, "logits/rejected": -12.625, "logps/chosen": -506.0, "logps/rejected": -740.0, "loss": 0.017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.125, "rewards/margins": 13.625, "rewards/rejected": -22.75, "step": 4280 }, { "epoch": 2.2448979591836733, "grad_norm": 0.6943875876177056, "learning_rate": 9.04657722052052e-06, "logits/chosen": -12.75, "logits/rejected": -12.8125, "logps/chosen": -556.0, "logps/rejected": -764.0, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -9.5, "rewards/margins": 13.25, "rewards/rejected": -22.75, "step": 4290 }, { "epoch": 2.250130821559393, "grad_norm": 6.799804412241437, "learning_rate": 8.929661870262526e-06, "logits/chosen": -12.3125, "logits/rejected": -12.0, "logps/chosen": -616.0, "logps/rejected": -816.0, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -10.4375, "rewards/margins": 13.0625, "rewards/rejected": -23.5, "step": 4300 }, { "epoch": 2.2553636839351126, "grad_norm": 0.19463379281176907, "learning_rate": 8.813342446043424e-06, "logits/chosen": -12.625, "logits/rejected": -12.375, "logps/chosen": -512.0, "logps/rejected": -752.0, "loss": 0.0206, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.6875, "rewards/margins": 12.0625, "rewards/rejected": -21.75, "step": 4310 }, { "epoch": 2.260596546310832, "grad_norm": 0.2697052020128489, "learning_rate": 8.697623261261789e-06, "logits/chosen": -12.75, "logits/rejected": -12.3125, "logps/chosen": -528.0, "logps/rejected": -764.0, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -10.125, "rewards/margins": 12.8125, "rewards/rejected": -22.875, "step": 4320 }, { "epoch": 2.2658294086865514, "grad_norm": 0.40883057591483557, "learning_rate": 8.58250860705792e-06, "logits/chosen": -12.5, "logits/rejected": -12.75, "logps/chosen": -588.0, "logps/rejected": -700.0, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.75, "rewards/margins": 10.25, "rewards/rejected": -21.0, "step": 4330 }, { "epoch": 2.271062271062271, "grad_norm": 0.1423644882664783, "learning_rate": 8.468002752154672e-06, "logits/chosen": -12.375, "logits/rejected": -12.4375, "logps/chosen": -540.0, "logps/rejected": -716.0, "loss": 0.0173, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0, "rewards/margins": 11.8125, "rewards/rejected": -21.75, "step": 4340 }, { "epoch": 2.2762951334379906, "grad_norm": 0.2527420977259, "learning_rate": 8.35410994269921e-06, "logits/chosen": -12.5625, "logits/rejected": -12.5625, "logps/chosen": -612.0, "logps/rejected": -756.0, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -10.5625, "rewards/margins": 11.8125, "rewards/rejected": -22.375, "step": 4350 }, { "epoch": 2.2815279958137102, "grad_norm": 0.9800221783762282, "learning_rate": 8.240834402105524e-06, "logits/chosen": -12.5625, "logits/rejected": -12.25, "logps/chosen": -532.0, "logps/rejected": -736.0, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.375, "rewards/margins": 11.875, "rewards/rejected": -21.25, "step": 4360 }, { "epoch": 2.2867608581894294, "grad_norm": 0.5775266185707517, "learning_rate": 8.128180330897791e-06, "logits/chosen": -12.5625, "logits/rejected": -12.1875, "logps/chosen": -498.0, "logps/rejected": -752.0, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.6875, "rewards/margins": 13.0625, "rewards/rejected": -21.75, "step": 4370 }, { "epoch": 2.291993720565149, "grad_norm": 0.10327636037785584, "learning_rate": 8.016151906554683e-06, "logits/chosen": -12.9375, "logits/rejected": -12.4375, "logps/chosen": -516.0, "logps/rejected": -772.0, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.125, "rewards/margins": 12.0625, "rewards/rejected": -22.125, "step": 4380 }, { "epoch": 2.2972265829408687, "grad_norm": 0.1282213740055652, "learning_rate": 7.90475328335439e-06, "logits/chosen": -12.6875, "logits/rejected": -12.6875, "logps/chosen": -548.0, "logps/rejected": -700.0, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.5, "rewards/margins": 12.0625, "rewards/rejected": -21.625, "step": 4390 }, { "epoch": 2.3024594453165883, "grad_norm": 0.05266887192828813, "learning_rate": 7.793988592220569e-06, "logits/chosen": -12.8125, "logits/rejected": -12.25, "logps/chosen": -488.0, "logps/rejected": -640.0, "loss": 0.018, "rewards/accuracies": 0.9375, "rewards/chosen": -10.0, "rewards/margins": 10.125, "rewards/rejected": -20.125, "step": 4400 }, { "epoch": 2.3076923076923075, "grad_norm": 0.04530104525562387, "learning_rate": 7.683861940569218e-06, "logits/chosen": -12.5625, "logits/rejected": -12.1875, "logps/chosen": -584.0, "logps/rejected": -728.0, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -11.375, "rewards/margins": 10.875, "rewards/rejected": -22.25, "step": 4410 }, { "epoch": 2.312925170068027, "grad_norm": 0.20466612976766344, "learning_rate": 7.574377412156292e-06, "logits/chosen": -12.8125, "logits/rejected": -12.375, "logps/chosen": -494.0, "logps/rejected": -664.0, "loss": 0.0206, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.9375, "rewards/margins": 11.0625, "rewards/rejected": -21.0, "step": 4420 }, { "epoch": 2.3181580324437467, "grad_norm": 1.081779518073677, "learning_rate": 7.465539066926322e-06, "logits/chosen": -12.8125, "logits/rejected": -12.6875, "logps/chosen": -520.0, "logps/rejected": -712.0, "loss": 0.017, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.4375, "rewards/margins": 12.375, "rewards/rejected": -21.75, "step": 4430 }, { "epoch": 2.3233908948194664, "grad_norm": 0.24020374605900488, "learning_rate": 7.357350940861845e-06, "logits/chosen": -12.5625, "logits/rejected": -12.375, "logps/chosen": -576.0, "logps/rejected": -808.0, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.625, "rewards/margins": 12.9375, "rewards/rejected": -22.625, "step": 4440 }, { "epoch": 2.328623757195186, "grad_norm": 0.21378756606796173, "learning_rate": 7.249817045833726e-06, "logits/chosen": -12.875, "logits/rejected": -12.9375, "logps/chosen": -528.0, "logps/rejected": -728.0, "loss": 0.014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.3125, "rewards/margins": 13.0, "rewards/rejected": -22.375, "step": 4450 }, { "epoch": 2.333856619570905, "grad_norm": 0.06488277056020401, "learning_rate": 7.142941369452411e-06, "logits/chosen": -12.625, "logits/rejected": -12.5, "logps/chosen": -512.0, "logps/rejected": -712.0, "loss": 0.0154, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.125, "rewards/margins": 10.875, "rewards/rejected": -21.0, "step": 4460 }, { "epoch": 2.339089481946625, "grad_norm": 0.7536243870528055, "learning_rate": 7.036727874920043e-06, "logits/chosen": -12.75, "logits/rejected": -12.0625, "logps/chosen": -508.0, "logps/rejected": -764.0, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -9.3125, "rewards/margins": 13.5625, "rewards/rejected": -22.875, "step": 4470 }, { "epoch": 2.3443223443223444, "grad_norm": 0.1250439583457857, "learning_rate": 6.931180500883486e-06, "logits/chosen": -12.8125, "logits/rejected": -12.3125, "logps/chosen": -470.0, "logps/rejected": -676.0, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0, "rewards/margins": 11.0625, "rewards/rejected": -21.125, "step": 4480 }, { "epoch": 2.3495552066980636, "grad_norm": 0.0633273759969605, "learning_rate": 6.826303161288303e-06, "logits/chosen": -12.5, "logits/rejected": -12.25, "logps/chosen": -500.0, "logps/rejected": -728.0, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.8125, "rewards/margins": 13.0, "rewards/rejected": -22.875, "step": 4490 }, { "epoch": 2.3547880690737832, "grad_norm": 0.07953697544602377, "learning_rate": 6.722099745233595e-06, "logits/chosen": -12.5, "logits/rejected": -12.1875, "logps/chosen": -596.0, "logps/rejected": -756.0, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -11.0, "rewards/margins": 12.125, "rewards/rejected": -23.125, "step": 4500 }, { "epoch": 2.360020931449503, "grad_norm": 0.100823164008195, "learning_rate": 6.618574116827786e-06, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -520.0, "logps/rejected": -764.0, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -9.6875, "rewards/margins": 13.1875, "rewards/rejected": -22.875, "step": 4510 }, { "epoch": 2.3652537938252225, "grad_norm": 0.04037595387640658, "learning_rate": 6.51573011504534e-06, "logits/chosen": -12.5, "logits/rejected": -12.3125, "logps/chosen": -640.0, "logps/rejected": -848.0, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -11.1875, "rewards/margins": 13.6875, "rewards/rejected": -24.875, "step": 4520 }, { "epoch": 2.370486656200942, "grad_norm": 0.041466180476313466, "learning_rate": 6.4135715535844e-06, "logits/chosen": -12.875, "logits/rejected": -12.8125, "logps/chosen": -528.0, "logps/rejected": -724.0, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -9.5625, "rewards/margins": 13.0, "rewards/rejected": -22.625, "step": 4530 }, { "epoch": 2.3757195185766613, "grad_norm": 2.2267009793422337, "learning_rate": 6.312102220725347e-06, "logits/chosen": -12.625, "logits/rejected": -12.375, "logps/chosen": -568.0, "logps/rejected": -792.0, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.25, "rewards/margins": 13.875, "rewards/rejected": -24.125, "step": 4540 }, { "epoch": 2.380952380952381, "grad_norm": 3.6750631389711232, "learning_rate": 6.21132587919036e-06, "logits/chosen": -12.5625, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -756.0, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.125, "rewards/margins": 12.8125, "rewards/rejected": -22.875, "step": 4550 }, { "epoch": 2.3861852433281006, "grad_norm": 0.0988592319744124, "learning_rate": 6.111246266003859e-06, "logits/chosen": -12.3125, "logits/rejected": -12.0, "logps/chosen": -560.0, "logps/rejected": -820.0, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -10.4375, "rewards/margins": 13.25, "rewards/rejected": -23.625, "step": 4560 }, { "epoch": 2.3914181057038197, "grad_norm": 2.33950433690266, "learning_rate": 6.011867092353934e-06, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -552.0, "logps/rejected": -756.0, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -9.5625, "rewards/margins": 13.375, "rewards/rejected": -23.0, "step": 4570 }, { "epoch": 2.3966509680795394, "grad_norm": 0.0681730768269089, "learning_rate": 5.913192043454724e-06, "logits/chosen": -12.3125, "logits/rejected": -12.125, "logps/chosen": -520.0, "logps/rejected": -768.0, "loss": 0.0249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.125, "rewards/margins": 12.375, "rewards/rejected": -22.5, "step": 4580 }, { "epoch": 2.401883830455259, "grad_norm": 0.42266101198548967, "learning_rate": 5.815224778409767e-06, "logits/chosen": -12.5625, "logits/rejected": -12.5, "logps/chosen": -560.0, "logps/rejected": -772.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -10.5, "rewards/margins": 12.75, "rewards/rejected": -23.25, "step": 4590 }, { "epoch": 2.4071166928309786, "grad_norm": 0.3853290474720451, "learning_rate": 5.71796893007629e-06, "logits/chosen": -12.375, "logits/rejected": -11.875, "logps/chosen": -496.0, "logps/rejected": -760.0, "loss": 0.0072, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.625, "rewards/margins": 13.75, "rewards/rejected": -23.375, "step": 4600 }, { "epoch": 2.4123495552066982, "grad_norm": 0.10250705226256952, "learning_rate": 5.621428104930529e-06, "logits/chosen": -12.6875, "logits/rejected": -12.1875, "logps/chosen": -494.0, "logps/rejected": -728.0, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.25, "rewards/margins": 12.375, "rewards/rejected": -23.625, "step": 4610 }, { "epoch": 2.4175824175824174, "grad_norm": 0.13828507154871758, "learning_rate": 5.525605882933965e-06, "logits/chosen": -12.5625, "logits/rejected": -12.4375, "logps/chosen": -540.0, "logps/rejected": -804.0, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -10.0625, "rewards/margins": 12.5625, "rewards/rejected": -22.75, "step": 4620 }, { "epoch": 2.422815279958137, "grad_norm": 5.641455891192051, "learning_rate": 5.430505817400586e-06, "logits/chosen": -12.6875, "logits/rejected": -12.75, "logps/chosen": -576.0, "logps/rejected": -816.0, "loss": 0.0214, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.625, "rewards/margins": 14.125, "rewards/rejected": -23.75, "step": 4630 }, { "epoch": 2.4280481423338567, "grad_norm": 0.42243425995189116, "learning_rate": 5.33613143486511e-06, "logits/chosen": -12.5, "logits/rejected": -12.4375, "logps/chosen": -548.0, "logps/rejected": -728.0, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -9.0625, "rewards/margins": 13.5625, "rewards/rejected": -22.625, "step": 4640 }, { "epoch": 2.4332810047095763, "grad_norm": 1.4777844624098502, "learning_rate": 5.2424862349522065e-06, "logits/chosen": -12.5625, "logits/rejected": -12.25, "logps/chosen": -548.0, "logps/rejected": -736.0, "loss": 0.0204, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.3125, "rewards/margins": 12.625, "rewards/rejected": -23.0, "step": 4650 }, { "epoch": 2.4385138670852955, "grad_norm": 0.05115813621262884, "learning_rate": 5.149573690246759e-06, "logits/chosen": -12.5625, "logits/rejected": -12.0, "logps/chosen": -580.0, "logps/rejected": -772.0, "loss": 0.0106, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.75, "rewards/margins": 12.5625, "rewards/rejected": -23.375, "step": 4660 }, { "epoch": 2.443746729461015, "grad_norm": 0.06549014862611358, "learning_rate": 5.0573972461650524e-06, "logits/chosen": -12.25, "logits/rejected": -12.25, "logps/chosen": -636.0, "logps/rejected": -824.0, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.25, "rewards/margins": 13.25, "rewards/rejected": -24.5, "step": 4670 }, { "epoch": 2.4489795918367347, "grad_norm": 0.17158367161151364, "learning_rate": 4.965960320827018e-06, "logits/chosen": -12.375, "logits/rejected": -12.1875, "logps/chosen": -608.0, "logps/rejected": -784.0, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -10.5, "rewards/margins": 12.375, "rewards/rejected": -22.875, "step": 4680 }, { "epoch": 2.4542124542124544, "grad_norm": 0.08224298980083439, "learning_rate": 4.875266304929496e-06, "logits/chosen": -12.6875, "logits/rejected": -12.1875, "logps/chosen": -528.0, "logps/rejected": -712.0, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -10.5625, "rewards/margins": 11.75, "rewards/rejected": -22.25, "step": 4690 }, { "epoch": 2.4594453165881736, "grad_norm": 2.1347869965151687, "learning_rate": 4.7853185616205105e-06, "logits/chosen": -12.8125, "logits/rejected": -12.3125, "logps/chosen": -536.0, "logps/rejected": -756.0, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.75, "rewards/margins": 11.6875, "rewards/rejected": -22.375, "step": 4700 }, { "epoch": 2.464678178963893, "grad_norm": 0.9619277161341164, "learning_rate": 4.696120426374504e-06, "logits/chosen": -12.4375, "logits/rejected": -12.25, "logps/chosen": -516.0, "logps/rejected": -780.0, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.8125, "rewards/margins": 15.25, "rewards/rejected": -25.0, "step": 4710 }, { "epoch": 2.469911041339613, "grad_norm": 1.6743219521030817, "learning_rate": 4.607675206868706e-06, "logits/chosen": -12.5, "logits/rejected": -12.3125, "logps/chosen": -490.0, "logps/rejected": -668.0, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -10.3125, "rewards/margins": 11.4375, "rewards/rejected": -21.75, "step": 4720 }, { "epoch": 2.4751439037153324, "grad_norm": 0.1333846845539627, "learning_rate": 4.5199861828604525e-06, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -532.0, "logps/rejected": -752.0, "loss": 0.0232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.875, "rewards/margins": 13.125, "rewards/rejected": -23.0, "step": 4730 }, { "epoch": 2.4803767660910516, "grad_norm": 0.06235123879594944, "learning_rate": 4.433056606065553e-06, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -512.0, "logps/rejected": -740.0, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -9.75, "rewards/margins": 13.4375, "rewards/rejected": -23.25, "step": 4740 }, { "epoch": 2.4856096284667712, "grad_norm": 0.026777284454816707, "learning_rate": 4.346889700037743e-06, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -478.0, "logps/rejected": -720.0, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -9.4375, "rewards/margins": 13.5, "rewards/rejected": -23.0, "step": 4750 }, { "epoch": 2.490842490842491, "grad_norm": 0.6344174994119484, "learning_rate": 4.261488660049112e-06, "logits/chosen": -13.125, "logits/rejected": -12.4375, "logps/chosen": -520.0, "logps/rejected": -816.0, "loss": 0.0102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.75, "rewards/margins": 15.375, "rewards/rejected": -25.125, "step": 4760 }, { "epoch": 2.4960753532182105, "grad_norm": 4.869230110057092, "learning_rate": 4.176856652971642e-06, "logits/chosen": -12.8125, "logits/rejected": -12.625, "logps/chosen": -508.0, "logps/rejected": -696.0, "loss": 0.0234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.9375, "rewards/margins": 12.0625, "rewards/rejected": -22.0, "step": 4770 }, { "epoch": 2.50130821559393, "grad_norm": 0.12396452174190684, "learning_rate": 4.092996817159752e-06, "logits/chosen": -12.875, "logits/rejected": -13.0, "logps/chosen": -512.0, "logps/rejected": -700.0, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.875, "rewards/margins": 12.5, "rewards/rejected": -22.375, "step": 4780 }, { "epoch": 2.5065410779696493, "grad_norm": 0.706327597363468, "learning_rate": 4.009912262333942e-06, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -536.0, "logps/rejected": -772.0, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -10.375, "rewards/margins": 13.1875, "rewards/rejected": -23.5, "step": 4790 }, { "epoch": 2.511773940345369, "grad_norm": 0.7436869258928149, "learning_rate": 3.927606069465442e-06, "logits/chosen": -12.6875, "logits/rejected": -12.4375, "logps/chosen": -564.0, "logps/rejected": -784.0, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 14.5, "rewards/rejected": -24.375, "step": 4800 }, { "epoch": 2.5170068027210886, "grad_norm": 1.309773829889326, "learning_rate": 3.8460812906620045e-06, "logits/chosen": -12.75, "logits/rejected": -12.625, "logps/chosen": -588.0, "logps/rejected": -760.0, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5, "rewards/margins": 11.75, "rewards/rejected": -22.25, "step": 4810 }, { "epoch": 2.5222396650968077, "grad_norm": 0.3227734134583067, "learning_rate": 3.7653409490546963e-06, "logits/chosen": -12.5, "logits/rejected": -12.6875, "logps/chosen": -540.0, "logps/rejected": -740.0, "loss": 0.0197, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.6875, "rewards/margins": 13.125, "rewards/rejected": -22.875, "step": 4820 }, { "epoch": 2.5274725274725274, "grad_norm": 0.2844434389328289, "learning_rate": 3.6853880386858107e-06, "logits/chosen": -12.75, "logits/rejected": -12.5, "logps/chosen": -532.0, "logps/rejected": -760.0, "loss": 0.0163, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.0, "rewards/margins": 13.25, "rewards/rejected": -22.25, "step": 4830 }, { "epoch": 2.532705389848247, "grad_norm": 1.4407739685644243, "learning_rate": 3.60622552439783e-06, "logits/chosen": -12.4375, "logits/rejected": -12.125, "logps/chosen": -492.0, "logps/rejected": -720.0, "loss": 0.0212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.0625, "rewards/margins": 13.625, "rewards/rejected": -22.75, "step": 4840 }, { "epoch": 2.5379382522239666, "grad_norm": 0.23520810629298425, "learning_rate": 3.527856341723479e-06, "logits/chosen": -12.8125, "logits/rejected": -12.6875, "logps/chosen": -524.0, "logps/rejected": -756.0, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5, "rewards/margins": 12.4375, "rewards/rejected": -23.0, "step": 4850 }, { "epoch": 2.5431711145996863, "grad_norm": 0.49363019070075204, "learning_rate": 3.4502833967768822e-06, "logits/chosen": -12.5625, "logits/rejected": -12.5, "logps/chosen": -560.0, "logps/rejected": -756.0, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -9.5625, "rewards/margins": 13.625, "rewards/rejected": -23.25, "step": 4860 }, { "epoch": 2.5484039769754054, "grad_norm": 0.06056475449691753, "learning_rate": 3.373509566145794e-06, "logits/chosen": -12.9375, "logits/rejected": -12.8125, "logps/chosen": -556.0, "logps/rejected": -728.0, "loss": 0.0294, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.0625, "rewards/margins": 12.25, "rewards/rejected": -22.375, "step": 4870 }, { "epoch": 2.553636839351125, "grad_norm": 4.573310943983801, "learning_rate": 3.297537696784911e-06, "logits/chosen": -13.0625, "logits/rejected": -12.6875, "logps/chosen": -490.0, "logps/rejected": -760.0, "loss": 0.0225, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.1875, "rewards/margins": 12.5, "rewards/rejected": -22.75, "step": 4880 }, { "epoch": 2.5588697017268447, "grad_norm": 0.06128290637528687, "learning_rate": 3.2223706059103324e-06, "logits/chosen": -12.5625, "logits/rejected": -12.375, "logps/chosen": -564.0, "logps/rejected": -744.0, "loss": 0.0115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.1875, "rewards/margins": 12.3125, "rewards/rejected": -22.5, "step": 4890 }, { "epoch": 2.564102564102564, "grad_norm": 0.12260215298875937, "learning_rate": 3.1480110808950747e-06, "logits/chosen": -12.5625, "logits/rejected": -12.25, "logps/chosen": -532.0, "logps/rejected": -780.0, "loss": 0.0115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.1875, "rewards/margins": 13.0625, "rewards/rejected": -23.25, "step": 4900 }, { "epoch": 2.5693354264782835, "grad_norm": 0.08368072058556565, "learning_rate": 3.07446187916568e-06, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -552.0, "logps/rejected": -756.0, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -10.0, "rewards/margins": 12.0625, "rewards/rejected": -22.0, "step": 4910 }, { "epoch": 2.574568288854003, "grad_norm": 2.918695964115184, "learning_rate": 3.0017257281000216e-06, "logits/chosen": -12.75, "logits/rejected": -12.875, "logps/chosen": -568.0, "logps/rejected": -744.0, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -9.5, "rewards/margins": 12.75, "rewards/rejected": -22.25, "step": 4920 }, { "epoch": 2.5798011512297228, "grad_norm": 0.06527695856423694, "learning_rate": 2.9298053249261244e-06, "logits/chosen": -12.8125, "logits/rejected": -12.625, "logps/chosen": -468.0, "logps/rejected": -752.0, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.0625, "rewards/margins": 14.0, "rewards/rejected": -23.0, "step": 4930 }, { "epoch": 2.5850340136054424, "grad_norm": 0.46980953722988, "learning_rate": 2.858703336622154e-06, "logits/chosen": -13.0625, "logits/rejected": -12.75, "logps/chosen": -480.0, "logps/rejected": -720.0, "loss": 0.0525, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.6875, "rewards/margins": 12.3125, "rewards/rejected": -22.0, "step": 4940 }, { "epoch": 2.5902668759811616, "grad_norm": 0.28837857550195123, "learning_rate": 2.788422399817525e-06, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -512.0, "logps/rejected": -812.0, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -10.625, "rewards/margins": 13.5, "rewards/rejected": -24.125, "step": 4950 }, { "epoch": 2.595499738356881, "grad_norm": 0.06281973988927571, "learning_rate": 2.718965120695141e-06, "logits/chosen": -12.625, "logits/rejected": -12.4375, "logps/chosen": -552.0, "logps/rejected": -776.0, "loss": 0.0185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.375, "rewards/margins": 11.6875, "rewards/rejected": -22.0, "step": 4960 }, { "epoch": 2.600732600732601, "grad_norm": 0.5587610901639003, "learning_rate": 2.6503340748947086e-06, "logits/chosen": -12.8125, "logits/rejected": -12.5, "logps/chosen": -548.0, "logps/rejected": -852.0, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.4375, "rewards/margins": 14.8125, "rewards/rejected": -24.25, "step": 4970 }, { "epoch": 2.60596546310832, "grad_norm": 0.03319417097882791, "learning_rate": 2.5825318074172765e-06, "logits/chosen": -12.25, "logits/rejected": -12.5625, "logps/chosen": -536.0, "logps/rejected": -776.0, "loss": 0.0131, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.125, "rewards/margins": 13.0625, "rewards/rejected": -23.25, "step": 4980 }, { "epoch": 2.6111983254840396, "grad_norm": 0.11213587678104356, "learning_rate": 2.515560832530836e-06, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -490.0, "logps/rejected": -784.0, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.75, "rewards/margins": 16.0, "rewards/rejected": -24.75, "step": 4990 }, { "epoch": 2.6164311878597593, "grad_norm": 0.09685923373345516, "learning_rate": 2.4494236336770697e-06, "logits/chosen": -12.5, "logits/rejected": -12.3125, "logps/chosen": -510.0, "logps/rejected": -740.0, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 12.375, "rewards/rejected": -22.125, "step": 5000 }, { "epoch": 2.621664050235479, "grad_norm": 2.993062977560315, "learning_rate": 2.3841226633792983e-06, "logits/chosen": -12.3125, "logits/rejected": -12.125, "logps/chosen": -588.0, "logps/rejected": -756.0, "loss": 0.0113, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 12.9375, "rewards/rejected": -23.25, "step": 5010 }, { "epoch": 2.6268969126111985, "grad_norm": 0.2803893599147877, "learning_rate": 2.319660343151511e-06, "logits/chosen": -12.75, "logits/rejected": -12.75, "logps/chosen": -536.0, "logps/rejected": -704.0, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -10.625, "rewards/margins": 11.0625, "rewards/rejected": -21.75, "step": 5020 }, { "epoch": 2.6321297749869177, "grad_norm": 0.19574905752849175, "learning_rate": 2.2560390634085714e-06, "logits/chosen": -12.9375, "logits/rejected": -12.75, "logps/chosen": -494.0, "logps/rejected": -764.0, "loss": 0.0152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.875, "rewards/margins": 12.875, "rewards/rejected": -22.75, "step": 5030 }, { "epoch": 2.6373626373626373, "grad_norm": 0.11308648254691579, "learning_rate": 2.1932611833775846e-06, "logits/chosen": -12.6875, "logits/rejected": -12.625, "logps/chosen": -556.0, "logps/rejected": -772.0, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -10.1875, "rewards/margins": 14.125, "rewards/rejected": -24.25, "step": 5040 }, { "epoch": 2.642595499738357, "grad_norm": 0.19180938005745587, "learning_rate": 2.13132903101039e-06, "logits/chosen": -13.0, "logits/rejected": -12.5625, "logps/chosen": -496.0, "logps/rejected": -744.0, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -9.9375, "rewards/margins": 13.0625, "rewards/rejected": -23.0, "step": 5050 }, { "epoch": 2.647828362114076, "grad_norm": 0.0824809238337483, "learning_rate": 2.0702449028972698e-06, "logits/chosen": -12.75, "logits/rejected": -12.5625, "logps/chosen": -528.0, "logps/rejected": -776.0, "loss": 0.0164, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.625, "rewards/margins": 14.3125, "rewards/rejected": -24.0, "step": 5060 }, { "epoch": 2.6530612244897958, "grad_norm": 0.19257105032977728, "learning_rate": 2.0100110641817548e-06, "logits/chosen": -12.5625, "logits/rejected": -12.125, "logps/chosen": -532.0, "logps/rejected": -752.0, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.25, "rewards/margins": 11.5625, "rewards/rejected": -22.75, "step": 5070 }, { "epoch": 2.6582940868655154, "grad_norm": 0.6376462030458858, "learning_rate": 1.9506297484766427e-06, "logits/chosen": -12.6875, "logits/rejected": -12.9375, "logps/chosen": -564.0, "logps/rejected": -744.0, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -9.6875, "rewards/margins": 14.0625, "rewards/rejected": -23.75, "step": 5080 }, { "epoch": 2.663526949241235, "grad_norm": 1.2368487640960626, "learning_rate": 1.8921031577811693e-06, "logits/chosen": -12.625, "logits/rejected": -12.5625, "logps/chosen": -556.0, "logps/rejected": -752.0, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 11.8125, "rewards/rejected": -22.25, "step": 5090 }, { "epoch": 2.6687598116169546, "grad_norm": 0.2769381517332766, "learning_rate": 1.8344334623993515e-06, "logits/chosen": -12.625, "logits/rejected": -12.5, "logps/chosen": -520.0, "logps/rejected": -760.0, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0625, "rewards/margins": 13.375, "rewards/rejected": -23.5, "step": 5100 }, { "epoch": 2.6739926739926743, "grad_norm": 0.10428401411066154, "learning_rate": 1.7776228008594965e-06, "logits/chosen": -13.0, "logits/rejected": -12.875, "logps/chosen": -560.0, "logps/rejected": -828.0, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -9.9375, "rewards/margins": 14.6875, "rewards/rejected": -24.625, "step": 5110 }, { "epoch": 2.6792255363683934, "grad_norm": 0.5981436652803144, "learning_rate": 1.721673279834926e-06, "logits/chosen": -13.0, "logits/rejected": -12.1875, "logps/chosen": -520.0, "logps/rejected": -720.0, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.25, "rewards/margins": 11.1875, "rewards/rejected": -22.375, "step": 5120 }, { "epoch": 2.684458398744113, "grad_norm": 0.03180320517493719, "learning_rate": 1.6665869740658312e-06, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -608.0, "logps/rejected": -868.0, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -10.0, "rewards/margins": 15.375, "rewards/rejected": -25.375, "step": 5130 }, { "epoch": 2.6896912611198327, "grad_norm": 0.07580243042957731, "learning_rate": 1.6123659262823498e-06, "logits/chosen": -12.625, "logits/rejected": -12.625, "logps/chosen": -506.0, "logps/rejected": -700.0, "loss": 0.0124, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -9.9375, "rewards/margins": 12.5625, "rewards/rejected": -22.5, "step": 5140 }, { "epoch": 2.694924123495552, "grad_norm": 1.4111733913030347, "learning_rate": 1.5590121471288106e-06, "logits/chosen": -12.6875, "logits/rejected": -12.5, "logps/chosen": -520.0, "logps/rejected": -688.0, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 11.1875, "rewards/rejected": -21.5, "step": 5150 }, { "epoch": 2.7001569858712715, "grad_norm": 0.08758065150759772, "learning_rate": 1.5065276150891788e-06, "logits/chosen": -12.625, "logits/rejected": -12.125, "logps/chosen": -524.0, "logps/rejected": -748.0, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5625, "rewards/margins": 13.3125, "rewards/rejected": -23.875, "step": 5160 }, { "epoch": 2.705389848246991, "grad_norm": 0.05585072015141439, "learning_rate": 1.4549142764136769e-06, "logits/chosen": -13.1875, "logits/rejected": -12.6875, "logps/chosen": -486.0, "logps/rejected": -732.0, "loss": 0.0132, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.0, "rewards/margins": 12.8125, "rewards/rejected": -22.75, "step": 5170 }, { "epoch": 2.7106227106227108, "grad_norm": 2.383493892083904, "learning_rate": 1.4041740450466385e-06, "logits/chosen": -13.0625, "logits/rejected": -13.0625, "logps/chosen": -544.0, "logps/rejected": -760.0, "loss": 0.02, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.8125, "rewards/margins": 13.0625, "rewards/rejected": -23.875, "step": 5180 }, { "epoch": 2.7158555729984304, "grad_norm": 0.4685409939387069, "learning_rate": 1.3543088025555095e-06, "logits/chosen": -12.8125, "logits/rejected": -12.375, "logps/chosen": -490.0, "logps/rejected": -684.0, "loss": 0.0125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.1875, "rewards/margins": 11.5625, "rewards/rejected": -21.75, "step": 5190 }, { "epoch": 2.7210884353741496, "grad_norm": 0.024282596932399626, "learning_rate": 1.3053203980610746e-06, "logits/chosen": -12.6875, "logits/rejected": -12.1875, "logps/chosen": -556.0, "logps/rejected": -824.0, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.5625, "rewards/margins": 15.4375, "rewards/rejected": -25.0, "step": 5200 }, { "epoch": 2.726321297749869, "grad_norm": 0.08325518642920167, "learning_rate": 1.2572106481689245e-06, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -474.0, "logps/rejected": -728.0, "loss": 0.0157, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 12.4375, "rewards/rejected": -22.75, "step": 5210 }, { "epoch": 2.731554160125589, "grad_norm": 0.48506294422392854, "learning_rate": 1.2099813369020468e-06, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -536.0, "logps/rejected": -780.0, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 13.875, "rewards/rejected": -23.75, "step": 5220 }, { "epoch": 2.736787022501308, "grad_norm": 2.6864742223740325, "learning_rate": 1.1636342156346846e-06, "logits/chosen": -12.75, "logits/rejected": -12.3125, "logps/chosen": -572.0, "logps/rejected": -776.0, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -11.375, "rewards/margins": 12.0625, "rewards/rejected": -23.375, "step": 5230 }, { "epoch": 2.7420198848770276, "grad_norm": 0.23707346252631317, "learning_rate": 1.1181710030274046e-06, "logits/chosen": -13.125, "logits/rejected": -12.5625, "logps/chosen": -474.0, "logps/rejected": -696.0, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -10.625, "rewards/margins": 11.5625, "rewards/rejected": -22.25, "step": 5240 }, { "epoch": 2.7472527472527473, "grad_norm": 2.157658970277411, "learning_rate": 1.073593384963356e-06, "logits/chosen": -12.6875, "logits/rejected": -12.375, "logps/chosen": -544.0, "logps/rejected": -756.0, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 14.125, "rewards/rejected": -24.375, "step": 5250 }, { "epoch": 2.752485609628467, "grad_norm": 0.3102770843154215, "learning_rate": 1.0299030144857446e-06, "logits/chosen": -13.3125, "logits/rejected": -12.75, "logps/chosen": -490.0, "logps/rejected": -792.0, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.875, "rewards/margins": 13.125, "rewards/rejected": -24.0, "step": 5260 }, { "epoch": 2.7577184720041865, "grad_norm": 0.07324163406898264, "learning_rate": 9.871015117365518e-07, "logits/chosen": -12.6875, "logits/rejected": -12.5625, "logps/chosen": -528.0, "logps/rejected": -728.0, "loss": 0.0106, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.8125, "rewards/margins": 12.3125, "rewards/rejected": -23.125, "step": 5270 }, { "epoch": 2.7629513343799057, "grad_norm": 0.06042974721060957, "learning_rate": 9.451904638964448e-07, "logits/chosen": -12.8125, "logits/rejected": -12.4375, "logps/chosen": -572.0, "logps/rejected": -764.0, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 12.5, "rewards/rejected": -22.875, "step": 5280 }, { "epoch": 2.7681841967556253, "grad_norm": 0.3758142300940554, "learning_rate": 9.041714251259215e-07, "logits/chosen": -12.8125, "logits/rejected": -12.8125, "logps/chosen": -560.0, "logps/rejected": -728.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 12.25, "rewards/rejected": -22.0, "step": 5290 }, { "epoch": 2.773417059131345, "grad_norm": 0.0433372475756288, "learning_rate": 8.640459165076858e-07, "logits/chosen": -12.625, "logits/rejected": -12.4375, "logps/chosen": -500.0, "logps/rejected": -732.0, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -10.5625, "rewards/margins": 11.8125, "rewards/rejected": -22.375, "step": 5300 }, { "epoch": 2.778649921507064, "grad_norm": 0.022121147398926958, "learning_rate": 8.248154259902247e-07, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -516.0, "logps/rejected": -684.0, "loss": 0.0119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.0625, "rewards/margins": 11.25, "rewards/rejected": -21.375, "step": 5310 }, { "epoch": 2.7838827838827838, "grad_norm": 0.36018606872198927, "learning_rate": 7.86481408332651e-07, "logits/chosen": -12.6875, "logits/rejected": -12.1875, "logps/chosen": -496.0, "logps/rejected": -724.0, "loss": 0.0143, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.875, "rewards/margins": 12.5, "rewards/rejected": -22.375, "step": 5320 }, { "epoch": 2.7891156462585034, "grad_norm": 0.2828337088179076, "learning_rate": 7.490452850507507e-07, "logits/chosen": -13.1875, "logits/rejected": -12.8125, "logps/chosen": -470.0, "logps/rejected": -648.0, "loss": 0.0509, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.875, "rewards/margins": 11.375, "rewards/rejected": -21.25, "step": 5330 }, { "epoch": 2.794348508634223, "grad_norm": 0.1829888250339699, "learning_rate": 7.125084443642654e-07, "logits/chosen": -12.75, "logits/rejected": -12.375, "logps/chosen": -548.0, "logps/rejected": -752.0, "loss": 0.01, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.9375, "rewards/margins": 13.625, "rewards/rejected": -23.5, "step": 5340 }, { "epoch": 2.7995813710099426, "grad_norm": 0.3036718148331244, "learning_rate": 6.768722411454154e-07, "logits/chosen": -13.0625, "logits/rejected": -12.8125, "logps/chosen": -540.0, "logps/rejected": -760.0, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 12.875, "rewards/rejected": -23.25, "step": 5350 }, { "epoch": 2.804814233385662, "grad_norm": 1.8143984030436722, "learning_rate": 6.421379968686664e-07, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -588.0, "logps/rejected": -792.0, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -9.875, "rewards/margins": 13.875, "rewards/rejected": -23.75, "step": 5360 }, { "epoch": 2.8100470957613815, "grad_norm": 0.9839252753214983, "learning_rate": 6.083069995617113e-07, "logits/chosen": -12.875, "logits/rejected": -12.6875, "logps/chosen": -520.0, "logps/rejected": -712.0, "loss": 0.0175, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.5, "rewards/margins": 11.8125, "rewards/rejected": -22.25, "step": 5370 }, { "epoch": 2.815279958137101, "grad_norm": 0.03769369288362027, "learning_rate": 5.753805037577193e-07, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -502.0, "logps/rejected": -732.0, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -10.125, "rewards/margins": 12.25, "rewards/rejected": -22.375, "step": 5380 }, { "epoch": 2.8205128205128203, "grad_norm": 1.3710061295371272, "learning_rate": 5.433597304488114e-07, "logits/chosen": -12.6875, "logits/rejected": -12.625, "logps/chosen": -568.0, "logps/rejected": -800.0, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.5, "rewards/margins": 14.0, "rewards/rejected": -23.5, "step": 5390 }, { "epoch": 2.82574568288854, "grad_norm": 0.04654503797633383, "learning_rate": 5.122458670407837e-07, "logits/chosen": -12.875, "logits/rejected": -12.5625, "logps/chosen": -510.0, "logps/rejected": -716.0, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -9.9375, "rewards/margins": 12.3125, "rewards/rejected": -22.25, "step": 5400 }, { "epoch": 2.8309785452642595, "grad_norm": 0.07293005563672031, "learning_rate": 4.820400673090669e-07, "logits/chosen": -12.625, "logits/rejected": -12.375, "logps/chosen": -592.0, "logps/rejected": -764.0, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -11.1875, "rewards/margins": 11.25, "rewards/rejected": -22.5, "step": 5410 }, { "epoch": 2.836211407639979, "grad_norm": 0.10617523379533803, "learning_rate": 4.527434513559553e-07, "logits/chosen": -12.9375, "logits/rejected": -12.5625, "logps/chosen": -560.0, "logps/rejected": -740.0, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -10.0, "rewards/margins": 12.1875, "rewards/rejected": -22.125, "step": 5420 }, { "epoch": 2.8414442700156988, "grad_norm": 0.9514774703351614, "learning_rate": 4.2435710556906485e-07, "logits/chosen": -12.6875, "logits/rejected": -12.6875, "logps/chosen": -564.0, "logps/rejected": -768.0, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.9375, "rewards/margins": 12.5625, "rewards/rejected": -23.5, "step": 5430 }, { "epoch": 2.846677132391418, "grad_norm": 0.10096456261880521, "learning_rate": 3.968820825810432e-07, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -478.0, "logps/rejected": -736.0, "loss": 0.01, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.8125, "rewards/margins": 14.5625, "rewards/rejected": -23.375, "step": 5440 }, { "epoch": 2.8519099947671376, "grad_norm": 0.5066792514792358, "learning_rate": 3.7031940123053997e-07, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -498.0, "logps/rejected": -768.0, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -10.5625, "rewards/margins": 12.9375, "rewards/rejected": -23.5, "step": 5450 }, { "epoch": 2.857142857142857, "grad_norm": 0.9027735586319693, "learning_rate": 3.4467004652442847e-07, "logits/chosen": -12.75, "logits/rejected": -12.625, "logps/chosen": -532.0, "logps/rejected": -772.0, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -9.8125, "rewards/margins": 12.75, "rewards/rejected": -22.625, "step": 5460 }, { "epoch": 2.8623757195185764, "grad_norm": 2.98311443868495, "learning_rate": 3.1993496960127656e-07, "logits/chosen": -13.0625, "logits/rejected": -12.8125, "logps/chosen": -512.0, "logps/rejected": -752.0, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -9.4375, "rewards/margins": 14.3125, "rewards/rejected": -23.75, "step": 5470 }, { "epoch": 2.867608581894296, "grad_norm": 0.10108300143799398, "learning_rate": 2.961150876960667e-07, "logits/chosen": -12.8125, "logits/rejected": -12.4375, "logps/chosen": -540.0, "logps/rejected": -820.0, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.0, "rewards/margins": 13.5, "rewards/rejected": -24.5, "step": 5480 }, { "epoch": 2.8728414442700156, "grad_norm": 0.07157164222596546, "learning_rate": 2.732112841062034e-07, "logits/chosen": -12.75, "logits/rejected": -12.5625, "logps/chosen": -528.0, "logps/rejected": -732.0, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.5625, "rewards/margins": 11.9375, "rewards/rejected": -22.5, "step": 5490 }, { "epoch": 2.8780743066457353, "grad_norm": 0.29644122911545273, "learning_rate": 2.5122440815873725e-07, "logits/chosen": -12.625, "logits/rejected": -12.75, "logps/chosen": -528.0, "logps/rejected": -728.0, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.0625, "rewards/margins": 13.4375, "rewards/rejected": -23.5, "step": 5500 }, { "epoch": 2.883307169021455, "grad_norm": 0.2008809792059982, "learning_rate": 2.301552751788838e-07, "logits/chosen": -12.875, "logits/rejected": -12.3125, "logps/chosen": -540.0, "logps/rejected": -808.0, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -9.875, "rewards/margins": 14.3125, "rewards/rejected": -24.125, "step": 5510 }, { "epoch": 2.8885400313971745, "grad_norm": 0.7313163229009243, "learning_rate": 2.1000466645978435e-07, "logits/chosen": -13.1875, "logits/rejected": -12.625, "logps/chosen": -520.0, "logps/rejected": -712.0, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.875, "rewards/margins": 11.9375, "rewards/rejected": -22.75, "step": 5520 }, { "epoch": 2.8937728937728937, "grad_norm": 1.6364561725555695, "learning_rate": 1.907733292335373e-07, "logits/chosen": -12.125, "logits/rejected": -12.25, "logps/chosen": -572.0, "logps/rejected": -772.0, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -9.4375, "rewards/margins": 13.9375, "rewards/rejected": -23.375, "step": 5530 }, { "epoch": 2.8990057561486133, "grad_norm": 0.035311367474680014, "learning_rate": 1.7246197664347875e-07, "logits/chosen": -12.75, "logits/rejected": -12.25, "logps/chosen": -536.0, "logps/rejected": -772.0, "loss": 0.0278, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.375, "rewards/margins": 13.125, "rewards/rejected": -22.5, "step": 5540 }, { "epoch": 2.904238618524333, "grad_norm": 4.683425846518621, "learning_rate": 1.5507128771775347e-07, "logits/chosen": -12.3125, "logits/rejected": -12.375, "logps/chosen": -564.0, "logps/rejected": -768.0, "loss": 0.0213, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 12.875, "rewards/rejected": -23.25, "step": 5550 }, { "epoch": 2.909471480900052, "grad_norm": 0.9311464987583402, "learning_rate": 1.386019073441186e-07, "logits/chosen": -12.875, "logits/rejected": -12.5, "logps/chosen": -564.0, "logps/rejected": -784.0, "loss": 0.0266, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.3125, "rewards/margins": 12.3125, "rewards/rejected": -22.625, "step": 5560 }, { "epoch": 2.9147043432757718, "grad_norm": 1.8171086732245252, "learning_rate": 1.2305444624604035e-07, "logits/chosen": -12.8125, "logits/rejected": -12.875, "logps/chosen": -560.0, "logps/rejected": -772.0, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -9.75, "rewards/margins": 13.1875, "rewards/rejected": -23.0, "step": 5570 }, { "epoch": 2.9199372056514914, "grad_norm": 0.3308245242542842, "learning_rate": 1.0842948096004835e-07, "logits/chosen": -13.25, "logits/rejected": -12.625, "logps/chosen": -504.0, "logps/rejected": -764.0, "loss": 0.0353, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.6875, "rewards/margins": 13.3125, "rewards/rejected": -23.0, "step": 5580 }, { "epoch": 2.925170068027211, "grad_norm": 7.34462217102031, "learning_rate": 9.472755381434162e-08, "logits/chosen": -12.75, "logits/rejected": -12.6875, "logps/chosen": -528.0, "logps/rejected": -684.0, "loss": 0.0343, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -11.0625, "rewards/margins": 10.5625, "rewards/rejected": -21.625, "step": 5590 }, { "epoch": 2.9304029304029307, "grad_norm": 0.08522077716129985, "learning_rate": 8.194917290869908e-08, "logits/chosen": -13.0, "logits/rejected": -12.8125, "logps/chosen": -528.0, "logps/rejected": -712.0, "loss": 0.0182, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.4375, "rewards/margins": 12.125, "rewards/rejected": -22.625, "step": 5600 }, { "epoch": 2.93563579277865, "grad_norm": 0.49829930070504364, "learning_rate": 7.009481209561686e-08, "logits/chosen": -13.0, "logits/rejected": -12.4375, "logps/chosen": -528.0, "logps/rejected": -780.0, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -9.625, "rewards/margins": 15.125, "rewards/rejected": -24.75, "step": 5610 }, { "epoch": 2.9408686551543695, "grad_norm": 0.0896042557685845, "learning_rate": 5.9164910962758445e-08, "logits/chosen": -12.8125, "logits/rejected": -12.5, "logps/chosen": -544.0, "logps/rejected": -816.0, "loss": 0.0149, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.5, "rewards/margins": 14.125, "rewards/rejected": -24.625, "step": 5620 }, { "epoch": 2.946101517530089, "grad_norm": 0.07557114485550194, "learning_rate": 4.915987481662887e-08, "logits/chosen": -13.0, "logits/rejected": -12.4375, "logps/chosen": -466.0, "logps/rejected": -680.0, "loss": 0.0259, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.5625, "rewards/margins": 11.6875, "rewards/rejected": -21.25, "step": 5630 }, { "epoch": 2.9513343799058083, "grad_norm": 0.1172664083262538, "learning_rate": 4.008007466757002e-08, "logits/chosen": -12.75, "logits/rejected": -12.625, "logps/chosen": -572.0, "logps/rejected": -792.0, "loss": 0.0151, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.1875, "rewards/margins": 12.75, "rewards/rejected": -24.0, "step": 5640 }, { "epoch": 2.956567242281528, "grad_norm": 0.1298391903154204, "learning_rate": 3.192584721598002e-08, "logits/chosen": -12.5625, "logits/rejected": -12.3125, "logps/chosen": -506.0, "logps/rejected": -756.0, "loss": 0.015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.3125, "rewards/margins": 14.0, "rewards/rejected": -23.25, "step": 5650 }, { "epoch": 2.9618001046572475, "grad_norm": 0.7035633454767951, "learning_rate": 2.4697494839850953e-08, "logits/chosen": -12.625, "logits/rejected": -12.5625, "logps/chosen": -544.0, "logps/rejected": -756.0, "loss": 0.0165, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.375, "rewards/margins": 13.625, "rewards/rejected": -23.0, "step": 5660 }, { "epoch": 2.967032967032967, "grad_norm": 0.07862395199572014, "learning_rate": 1.8395285583530654e-08, "logits/chosen": -12.625, "logits/rejected": -12.4375, "logps/chosen": -572.0, "logps/rejected": -748.0, "loss": 0.0183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.4375, "rewards/margins": 12.0, "rewards/rejected": -22.375, "step": 5670 }, { "epoch": 2.9722658294086868, "grad_norm": 0.05368864657027849, "learning_rate": 1.3019453147805616e-08, "logits/chosen": -13.25, "logits/rejected": -12.5625, "logps/chosen": -520.0, "logps/rejected": -736.0, "loss": 0.0081, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.25, "rewards/margins": 13.0, "rewards/rejected": -23.25, "step": 5680 }, { "epoch": 2.977498691784406, "grad_norm": 0.11527478161506108, "learning_rate": 8.570196881216297e-09, "logits/chosen": -12.8125, "logits/rejected": -12.5625, "logps/chosen": -496.0, "logps/rejected": -772.0, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -10.0, "rewards/margins": 13.4375, "rewards/rejected": -23.5, "step": 5690 }, { "epoch": 2.9827315541601256, "grad_norm": 0.0555134584567345, "learning_rate": 5.04768177268522e-09, "logits/chosen": -12.875, "logits/rejected": -12.75, "logps/chosen": -556.0, "logps/rejected": -796.0, "loss": 0.0256, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.375, "rewards/margins": 14.125, "rewards/rejected": -24.375, "step": 5700 }, { "epoch": 2.987964416535845, "grad_norm": 0.9590070181791803, "learning_rate": 2.4520384453746716e-09, "logits/chosen": -12.875, "logits/rejected": -12.625, "logps/chosen": -560.0, "logps/rejected": -792.0, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -10.25, "rewards/margins": 13.3125, "rewards/rejected": -23.5, "step": 5710 }, { "epoch": 2.9931972789115644, "grad_norm": 0.05085650982391767, "learning_rate": 7.833631518627815e-10, "logits/chosen": -13.0, "logits/rejected": -12.4375, "logps/chosen": -512.0, "logps/rejected": -720.0, "loss": 0.02, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -10.5625, "rewards/margins": 11.5625, "rewards/rejected": -22.125, "step": 5720 }, { "epoch": 2.998430141287284, "grad_norm": 0.6365850172475589, "learning_rate": 4.171777056583004e-11, "logits/chosen": -12.9375, "logits/rejected": -12.875, "logps/chosen": -596.0, "logps/rejected": -744.0, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.875, "rewards/margins": 13.4375, "rewards/rejected": -23.25, "step": 5730 }, { "epoch": 3.0, "eval_logits/chosen": -13.0, "eval_logits/rejected": -13.0, "eval_logps/chosen": -612.0, "eval_logps/rejected": -632.0, "eval_loss": 1.0008906126022339, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -14.5625, "eval_rewards/margins": 2.609375, "eval_rewards/rejected": -17.125, "eval_runtime": 46.7518, "eval_samples_per_second": 42.779, "eval_steps_per_second": 0.684, "step": 5733 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.2962565040964742, "train_runtime": 10895.6468, "train_samples_per_second": 16.833, "train_steps_per_second": 0.526 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }