diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10442 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 26.674157303370787, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 242.1805623372396, + "epoch": 0.033707865168539325, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.166666666666666e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 1 + }, + { + "completion_length": 239.2916692097982, + "epoch": 0.06741573033707865, + "grad_norm": 0.03706183277731571, + "kl": 0.0, + "learning_rate": 8.333333333333333e-08, + "loss": 0.0, + "reward": 0.11388889700174332, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.013888889302810034, + "step": 2 + }, + { + "completion_length": 254.25000508626303, + "epoch": 0.10112359550561797, + "grad_norm": 0.03698120739141217, + "kl": 0.005870819091796875, + "learning_rate": 1.25e-07, + "loss": 0.0, + "reward": 0.13472223530213037, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 3 + }, + { + "completion_length": 185.70139439900717, + "epoch": 0.1348314606741573, + "grad_norm": 0.07241452627714502, + "kl": 0.0006707509358723959, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0, + "reward": 0.1833333522081375, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.08333333830038707, + "step": 4 + }, + { + "completion_length": 232.35417683919272, + "epoch": 0.16853932584269662, + "grad_norm": 0.19678275196611628, + "kl": 0.0008799235026041666, + "learning_rate": 2.0833333333333333e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 5 + }, + { + "completion_length": 160.31250508626303, + "epoch": 0.20224719101123595, + "grad_norm": 0.23527549094422046, + "kl": 0.000644683837890625, + "learning_rate": 2.5e-07, + "loss": 0.0, + "reward": 0.3291666880249977, + "reward_std": 0.09375367189447086, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2291666716337204, + "step": 6 + }, + { + "completion_length": 220.12500508626303, + "epoch": 0.23595505617977527, + "grad_norm": 0.09697341534112204, + "kl": 0.0005804697672526041, + "learning_rate": 2.916666666666667e-07, + "loss": 0.0, + "reward": 0.2458333522081375, + "reward_std": 0.04535908500353495, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 7 + }, + { + "completion_length": 259.1180597941081, + "epoch": 0.2696629213483146, + "grad_norm": 0.1480415352673439, + "kl": 0.0005292892456054688, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0, + "reward": 0.10694445421298344, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.006944444651405017, + "step": 8 + }, + { + "completion_length": 263.34722900390625, + "epoch": 0.30337078651685395, + "grad_norm": 0.021217727907200827, + "kl": 0.0008672078450520834, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.09652778630455335, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09652778506278992, + "rewards/solution_reward_func": 0.0, + "step": 9 + }, + { + "completion_length": 256.14584096272785, + "epoch": 0.33707865168539325, + "grad_norm": 1.0882733283926178e-05, + "kl": 0.0006086031595865885, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 10 + }, + { + "completion_length": 227.36111958821616, + "epoch": 0.3707865168539326, + "grad_norm": 1.3399977563660263e-05, + "kl": 0.0006875991821289062, + "learning_rate": 4.5833333333333327e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 11 + }, + { + "completion_length": 246.04861704508463, + "epoch": 0.4044943820224719, + "grad_norm": 0.12510659506032698, + "kl": 0.0007673899332682291, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 0.13472223530213037, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 12 + }, + { + "completion_length": 237.3680623372396, + "epoch": 0.43820224719101125, + "grad_norm": 0.016908865504043948, + "kl": 0.0014123916625976562, + "learning_rate": 5.416666666666666e-07, + "loss": 0.0, + "reward": 0.15347223480542502, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 13 + }, + { + "completion_length": 168.3402837117513, + "epoch": 0.47191011235955055, + "grad_norm": 0.13649178468519033, + "kl": 0.001056671142578125, + "learning_rate": 5.833333333333334e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 14 + }, + { + "completion_length": 217.0277862548828, + "epoch": 0.5056179775280899, + "grad_norm": 0.020972866074463, + "kl": 0.0007607142130533854, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0, + "reward": 0.20902779201666513, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 15 + }, + { + "completion_length": 274.3055597941081, + "epoch": 0.5393258426966292, + "grad_norm": 1.4745881640095153e-05, + "kl": 0.0006968180338541666, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 16 + }, + { + "completion_length": 174.4236157735189, + "epoch": 0.5730337078651685, + "grad_norm": 0.0521261603097119, + "kl": 0.0008719762166341146, + "learning_rate": 7.083333333333334e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 17 + }, + { + "completion_length": 222.31250508626303, + "epoch": 0.6067415730337079, + "grad_norm": 0.06337817110559164, + "kl": 0.0006812413533528646, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 0.23194446663061777, + "reward_std": 0.05446995794773102, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1319444477558136, + "step": 18 + }, + { + "completion_length": 184.50000127156576, + "epoch": 0.6404494382022472, + "grad_norm": 0.1318972028154634, + "kl": 0.0006866455078125, + "learning_rate": 7.916666666666666e-07, + "loss": 0.0, + "reward": 0.12083334227403005, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.02083333395421505, + "step": 19 + }, + { + "completion_length": 217.36111958821616, + "epoch": 0.6741573033707865, + "grad_norm": 0.08598991476443321, + "kl": 0.0014066696166992188, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0, + "reward": 0.16944445918003717, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06944444651405017, + "step": 20 + }, + { + "completion_length": 185.94445037841797, + "epoch": 0.7078651685393258, + "grad_norm": 0.31711439428288046, + "kl": 0.00119781494140625, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 0.18333334724108377, + "reward_std": 0.06500093638896942, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0833333358168602, + "step": 21 + }, + { + "completion_length": 191.38889439900717, + "epoch": 0.7415730337078652, + "grad_norm": 0.12504614704058498, + "kl": 0.0008424123128255209, + "learning_rate": 9.166666666666665e-07, + "loss": 0.0, + "reward": 0.16180556764205298, + "reward_std": 0.047323266665140785, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0625000037252903, + "step": 22 + }, + { + "completion_length": 214.19445037841797, + "epoch": 0.7752808988764045, + "grad_norm": 0.11543473546399327, + "kl": 0.0012032190958658855, + "learning_rate": 9.583333333333334e-07, + "loss": 0.0, + "reward": 0.2916666989525159, + "reward_std": 0.032665262619654335, + "rewards/format_reward_func": 0.09722223008672397, + "rewards/solution_reward_func": 0.19444445023934046, + "step": 23 + }, + { + "completion_length": 244.49305979410806, + "epoch": 0.8089887640449438, + "grad_norm": 0.051013579724238596, + "kl": 0.0009679794311523438, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.2500000062088172, + "reward_std": 0.028686795694132645, + "rewards/format_reward_func": 0.09722223008672397, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 24 + }, + { + "completion_length": 299.93750762939453, + "epoch": 0.8426966292134831, + "grad_norm": 0.14342801944833, + "kl": 0.0010420481363932292, + "learning_rate": 9.999959025306053e-07, + "loss": 0.0, + "reward": 0.11597223331530888, + "reward_std": 0.030716917167107265, + "rewards/format_reward_func": 0.09513889625668526, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 25 + }, + { + "completion_length": 215.1180623372396, + "epoch": 0.8764044943820225, + "grad_norm": 0.13590270991406894, + "kl": 0.0011971791585286458, + "learning_rate": 9.99983610189578e-07, + "loss": 0.0, + "reward": 0.1763889044523239, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889054457347, + "step": 26 + }, + { + "completion_length": 266.32639567057294, + "epoch": 0.9101123595505618, + "grad_norm": 2.7342564951368985e-05, + "kl": 0.0017363230387369792, + "learning_rate": 9.999631231783884e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 27 + }, + { + "completion_length": 198.70834096272787, + "epoch": 0.9438202247191011, + "grad_norm": 0.0745389173315549, + "kl": 0.0020586649576822915, + "learning_rate": 9.99934441832816e-07, + "loss": 0.0, + "reward": 0.14166667560736337, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666668529311814, + "step": 28 + }, + { + "completion_length": 213.43751017252603, + "epoch": 0.9775280898876404, + "grad_norm": 0.026431441256609643, + "kl": 0.0018463134765625, + "learning_rate": 9.998975666229445e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 29 + }, + { + "completion_length": 241.75, + "epoch": 1.0, + "grad_norm": 0.026431441256609643, + "kl": 0.0010938644409179688, + "learning_rate": 9.99852498153154e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 30 + }, + { + "completion_length": 220.86111958821616, + "epoch": 1.0337078651685394, + "grad_norm": 2.958592497318555e-05, + "kl": 0.002307891845703125, + "learning_rate": 9.997992371621111e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 31 + }, + { + "completion_length": 231.7916742960612, + "epoch": 1.0674157303370786, + "grad_norm": 4.086324991771372e-05, + "kl": 0.0018717447916666667, + "learning_rate": 9.997377845227574e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 32 + }, + { + "completion_length": 215.47917683919272, + "epoch": 1.101123595505618, + "grad_norm": 0.04772660458779815, + "kl": 0.0036188761393229165, + "learning_rate": 9.996681412422937e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 33 + }, + { + "completion_length": 182.62500381469727, + "epoch": 1.1348314606741572, + "grad_norm": 3.755955293684506e-05, + "kl": 0.0033086140950520835, + "learning_rate": 9.99590308462165e-07, + "loss": 0.0, + "reward": 0.2527777974804242, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 34 + }, + { + "completion_length": 215.61111450195312, + "epoch": 1.1685393258426966, + "grad_norm": 0.2575955029475059, + "kl": 0.00734710693359375, + "learning_rate": 9.99504287458041e-07, + "loss": 0.0, + "reward": 0.15555557111899057, + "reward_std": 0.05939138929049174, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 35 + }, + { + "completion_length": 268.2916742960612, + "epoch": 1.202247191011236, + "grad_norm": 0.07388856045908256, + "kl": 0.0036570231119791665, + "learning_rate": 9.994100796397953e-07, + "loss": 0.0, + "reward": 0.1548611248532931, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 36 + }, + { + "completion_length": 245.7430648803711, + "epoch": 1.2359550561797752, + "grad_norm": 3.2861422165846195e-05, + "kl": 0.004191080729166667, + "learning_rate": 9.993076865514827e-07, + "loss": 0.0, + "reward": 0.21805557111899057, + "reward_std": 0.05446995794773102, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11805555721124013, + "step": 37 + }, + { + "completion_length": 213.39583841959634, + "epoch": 1.2696629213483146, + "grad_norm": 0.1553321502823284, + "kl": 0.00687408447265625, + "learning_rate": 9.991971098713135e-07, + "loss": 0.0, + "reward": 0.21805557360251746, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11805556093653043, + "step": 38 + }, + { + "completion_length": 186.56945037841797, + "epoch": 1.303370786516854, + "grad_norm": 0.11728940209991921, + "kl": 0.03392791748046875, + "learning_rate": 9.990783514116256e-07, + "loss": 0.0, + "reward": 0.12083334475755692, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 39 + }, + { + "completion_length": 306.57639567057294, + "epoch": 1.3370786516853932, + "grad_norm": 0.01711459639970424, + "kl": 0.005391438802083333, + "learning_rate": 9.989514131188558e-07, + "loss": 0.0, + "reward": 0.09791667262713115, + "reward_std": 0.004535907879471779, + "rewards/format_reward_func": 0.097916675110658, + "rewards/solution_reward_func": 0.0, + "step": 40 + }, + { + "completion_length": 209.00000508626303, + "epoch": 1.3707865168539326, + "grad_norm": 0.08864370760788896, + "kl": 0.0128021240234375, + "learning_rate": 9.988162970735072e-07, + "loss": 0.0, + "reward": 0.2597222402691841, + "reward_std": 0.0801871841152509, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1597222238779068, + "step": 41 + }, + { + "completion_length": 234.0763905843099, + "epoch": 1.404494382022472, + "grad_norm": 0.13269251171868587, + "kl": 0.00672149658203125, + "learning_rate": 9.986730054901152e-07, + "loss": 0.0, + "reward": 0.12083334227403005, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 42 + }, + { + "completion_length": 260.3055674235026, + "epoch": 1.4382022471910112, + "grad_norm": 0.04020878196286899, + "kl": 0.00701141357421875, + "learning_rate": 9.985215407172114e-07, + "loss": 0.0, + "reward": 0.10694445421298344, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.006944444651405017, + "step": 43 + }, + { + "completion_length": 219.06250762939453, + "epoch": 1.4719101123595506, + "grad_norm": 3.483490189793289e-05, + "kl": 0.008382161458333334, + "learning_rate": 9.983619052372847e-07, + "loss": 0.0, + "reward": 0.2736111308137576, + "reward_std": 0.04535908500353495, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.17361111442248026, + "step": 44 + }, + { + "completion_length": 193.68055979410806, + "epoch": 1.50561797752809, + "grad_norm": 0.17669570750215183, + "kl": 0.010396321614583334, + "learning_rate": 9.981941016667413e-07, + "loss": 0.0, + "reward": 0.14166667809089026, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666666666666664, + "step": 45 + }, + { + "completion_length": 229.98611704508463, + "epoch": 1.5393258426966292, + "grad_norm": 0.14124271408222322, + "kl": 0.009211222330729166, + "learning_rate": 9.980181327558608e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 46 + }, + { + "completion_length": 178.11111704508463, + "epoch": 1.5730337078651684, + "grad_norm": 6.416250108314609e-05, + "kl": 0.009124755859375, + "learning_rate": 9.978340013887525e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 47 + }, + { + "completion_length": 180.9166692097982, + "epoch": 1.606741573033708, + "grad_norm": 0.09616362735907492, + "kl": 0.017262776692708332, + "learning_rate": 9.97641710583307e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 48 + }, + { + "completion_length": 253.0625089009603, + "epoch": 1.6404494382022472, + "grad_norm": 0.17187544410809805, + "kl": 0.011540730794270834, + "learning_rate": 9.974412634911466e-07, + "loss": 0.0, + "reward": 0.13819445793827376, + "reward_std": 0.0512698603173097, + "rewards/format_reward_func": 0.09652778506278992, + "rewards/solution_reward_func": 0.041666668529311814, + "step": 49 + }, + { + "completion_length": 250.73611704508463, + "epoch": 1.6741573033707864, + "grad_norm": 0.14786367303175033, + "kl": 0.006711324055989583, + "learning_rate": 9.972326633975752e-07, + "loss": 0.0, + "reward": 0.09861111889282863, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.0, + "step": 50 + }, + { + "completion_length": 193.90972264607748, + "epoch": 1.7078651685393258, + "grad_norm": 5.4551083470792385e-05, + "kl": 0.0163421630859375, + "learning_rate": 9.970159137215223e-07, + "loss": 0.0, + "reward": 0.16250001390775046, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 51 + }, + { + "completion_length": 230.14584096272787, + "epoch": 1.7415730337078652, + "grad_norm": 0.15311939396518548, + "kl": 0.01141357421875, + "learning_rate": 9.967910180154888e-07, + "loss": 0.0, + "reward": 0.14097223182519278, + "reward_std": 0.050358771036068596, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.041666668529311814, + "step": 52 + }, + { + "completion_length": 191.00000508626303, + "epoch": 1.7752808988764044, + "grad_norm": 0.16006040802844965, + "kl": 0.014923095703125, + "learning_rate": 9.965579799654878e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 53 + }, + { + "completion_length": 211.41667683919272, + "epoch": 1.8089887640449438, + "grad_norm": 0.05676407950910771, + "kl": 0.014668782552083334, + "learning_rate": 9.96316803390984e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 54 + }, + { + "completion_length": 202.59028116861978, + "epoch": 1.8426966292134832, + "grad_norm": 0.10163370353112902, + "kl": 0.018564860026041668, + "learning_rate": 9.960674922448327e-07, + "loss": 0.0, + "reward": 0.29305558154980343, + "reward_std": 0.07155112735927105, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.1944444521019856, + "step": 55 + }, + { + "completion_length": 270.63195037841797, + "epoch": 1.8764044943820224, + "grad_norm": 0.09333244468307338, + "kl": 0.0123443603515625, + "learning_rate": 9.958100506132126e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 56 + }, + { + "completion_length": 211.5625025431315, + "epoch": 1.9101123595505618, + "grad_norm": 6.920433223672096e-05, + "kl": 0.018463134765625, + "learning_rate": 9.955444827155603e-07, + "loss": 0.0, + "reward": 0.1270833487311999, + "reward_std": 0.031659880032142006, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.027777778605620067, + "step": 57 + }, + { + "completion_length": 268.9652837117513, + "epoch": 1.9438202247191012, + "grad_norm": 0.14121522033901673, + "kl": 0.011784871419270834, + "learning_rate": 9.952707929045018e-07, + "loss": 0.0, + "reward": 0.195833350221316, + "reward_std": 0.022611424637337525, + "rewards/format_reward_func": 0.09166667362054189, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 58 + }, + { + "completion_length": 217.5555648803711, + "epoch": 1.9775280898876404, + "grad_norm": 0.07014704788546897, + "kl": 0.022349039713541668, + "learning_rate": 9.949889856657787e-07, + "loss": 0.0, + "reward": 0.15555557111899057, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555559073885284, + "step": 59 + }, + { + "completion_length": 387.75, + "epoch": 2.0, + "grad_norm": 0.06558149295401529, + "kl": 0.009552001953125, + "learning_rate": 9.946990656181779e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0883883461356163, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 60 + }, + { + "completion_length": 266.50695037841797, + "epoch": 2.033707865168539, + "grad_norm": 0.13494905010941088, + "kl": 0.018880208333333332, + "learning_rate": 9.944010375134532e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 61 + }, + { + "completion_length": 201.0902837117513, + "epoch": 2.067415730337079, + "grad_norm": 0.11338354916019651, + "kl": 0.026885986328125, + "learning_rate": 9.94094906236249e-07, + "loss": 0.0, + "reward": 0.2666666929920514, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666685293118, + "step": 62 + }, + { + "completion_length": 214.2430623372396, + "epoch": 2.101123595505618, + "grad_norm": 0.07775160444532367, + "kl": 0.03680419921875, + "learning_rate": 9.937806768040188e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 63 + }, + { + "completion_length": 171.39583841959634, + "epoch": 2.134831460674157, + "grad_norm": 0.07576083004046388, + "kl": 0.026468912760416668, + "learning_rate": 9.934583543669453e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 64 + }, + { + "completion_length": 177.53472773234049, + "epoch": 2.168539325842697, + "grad_norm": 0.00010044228644990604, + "kl": 0.0325927734375, + "learning_rate": 9.931279442078532e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 65 + }, + { + "completion_length": 237.50695037841797, + "epoch": 2.202247191011236, + "grad_norm": 0.05021473552933482, + "kl": 0.023905436197916668, + "learning_rate": 9.927894517421252e-07, + "loss": 0.0, + "reward": 0.13472223281860352, + "reward_std": 0.05446995794773102, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.034722222636143364, + "step": 66 + }, + { + "completion_length": 167.95834096272787, + "epoch": 2.235955056179775, + "grad_norm": 0.029548078341797564, + "kl": 0.036376953125, + "learning_rate": 9.924428825176105e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 67 + }, + { + "completion_length": 257.06251017252606, + "epoch": 2.2696629213483144, + "grad_norm": 0.00016274719298492977, + "kl": 0.024668375651041668, + "learning_rate": 9.92088242214537e-07, + "loss": 0.0, + "reward": 0.09791667386889458, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.0, + "step": 68 + }, + { + "completion_length": 221.5277837117513, + "epoch": 2.303370786516854, + "grad_norm": 0.06538120823459802, + "kl": 0.026377360026041668, + "learning_rate": 9.917255366454157e-07, + "loss": 0.0, + "reward": 0.29375001043081284, + "reward_std": 0.050358773209154606, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.19444445272286734, + "step": 69 + }, + { + "completion_length": 251.48611958821616, + "epoch": 2.337078651685393, + "grad_norm": 0.15219119711312137, + "kl": 0.024119059244791668, + "learning_rate": 9.913547717549462e-07, + "loss": 0.0, + "reward": 0.13472223530213037, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 70 + }, + { + "completion_length": 249.81945292154947, + "epoch": 2.370786516853933, + "grad_norm": 0.23435731005009558, + "kl": 0.029368082682291668, + "learning_rate": 9.909759536199197e-07, + "loss": 0.0, + "reward": 0.12777779251337051, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.027777778605620067, + "step": 71 + }, + { + "completion_length": 179.8055623372396, + "epoch": 2.404494382022472, + "grad_norm": 0.12136026320973395, + "kl": 0.058186848958333336, + "learning_rate": 9.905890884491194e-07, + "loss": 0.0, + "reward": 0.3361111332972844, + "reward_std": 0.055412920812765755, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.23611111442248026, + "step": 72 + }, + { + "completion_length": 241.2013956705729, + "epoch": 2.438202247191011, + "grad_norm": 0.00010787190273206431, + "kl": 0.037282307942708336, + "learning_rate": 9.90194182583218e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 73 + }, + { + "completion_length": 202.40278116861978, + "epoch": 2.4719101123595504, + "grad_norm": 0.09618269582914715, + "kl": 0.047037760416666664, + "learning_rate": 9.897912424946738e-07, + "loss": 0.0, + "reward": 0.20208334922790527, + "reward_std": 0.022517128537098568, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 74 + }, + { + "completion_length": 277.4583435058594, + "epoch": 2.50561797752809, + "grad_norm": 0.01693996810034334, + "kl": 0.048319498697916664, + "learning_rate": 9.893802747876263e-07, + "loss": 0.0, + "reward": 0.15416668355464935, + "reward_std": 0.06331975882252057, + "rewards/format_reward_func": 0.09861112137635548, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 75 + }, + { + "completion_length": 221.0277862548828, + "epoch": 2.539325842696629, + "grad_norm": 0.12037997611692709, + "kl": 0.044026692708333336, + "learning_rate": 9.889612861977853e-07, + "loss": 0.0, + "reward": 0.15694445744156837, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.09444445247451465, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 76 + }, + { + "completion_length": 302.4305648803711, + "epoch": 2.5730337078651684, + "grad_norm": 0.10666719312964867, + "kl": 0.023284912109375, + "learning_rate": 9.885342835923226e-07, + "loss": 0.0, + "reward": 0.14027778804302216, + "reward_std": 0.02828894866009553, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.041666666666666664, + "step": 77 + }, + { + "completion_length": 226.6875089009603, + "epoch": 2.606741573033708, + "grad_norm": 0.09049396666448892, + "kl": 0.043670654296875, + "learning_rate": 9.880992739697588e-07, + "loss": 0.0, + "reward": 0.2944444591800372, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.19444445272286734, + "step": 78 + }, + { + "completion_length": 238.90972391764322, + "epoch": 2.640449438202247, + "grad_norm": 0.028368696702127428, + "kl": 0.034271240234375, + "learning_rate": 9.876562644598485e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 79 + }, + { + "completion_length": 263.28472900390625, + "epoch": 2.6741573033707864, + "grad_norm": 0.00010690347907733638, + "kl": 0.0318603515625, + "learning_rate": 9.872052623234631e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 80 + }, + { + "completion_length": 244.7430623372396, + "epoch": 2.7078651685393256, + "grad_norm": 0.00011778267558553269, + "kl": 0.033935546875, + "learning_rate": 9.867462749524722e-07, + "loss": 0.0, + "reward": 0.09444445247451465, + "reward_std": 0.0, + "rewards/format_reward_func": 0.09444445247451465, + "rewards/solution_reward_func": 0.0, + "step": 81 + }, + { + "completion_length": 198.5138956705729, + "epoch": 2.741573033707865, + "grad_norm": 0.0001275712539021957, + "kl": 0.047831217447916664, + "learning_rate": 9.862793098696229e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 82 + }, + { + "completion_length": 185.56945164998373, + "epoch": 2.7752808988764044, + "grad_norm": 0.000253900095380479, + "kl": 0.041544596354166664, + "learning_rate": 9.858043747284157e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 83 + }, + { + "completion_length": 223.0138956705729, + "epoch": 2.808988764044944, + "grad_norm": 0.047216635984270006, + "kl": 0.041158040364583336, + "learning_rate": 9.853214773129795e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 84 + }, + { + "completion_length": 220.78473027547201, + "epoch": 2.842696629213483, + "grad_norm": 0.1776805285836067, + "kl": 0.038981119791666664, + "learning_rate": 9.848306255379437e-07, + "loss": 0.0, + "reward": 0.19722223778565726, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0972222238779068, + "step": 85 + }, + { + "completion_length": 191.31945292154947, + "epoch": 2.8764044943820224, + "grad_norm": 0.2787813496358527, + "kl": 0.051188151041666664, + "learning_rate": 9.843318274483087e-07, + "loss": 0.0, + "reward": 0.1972222402691841, + "reward_std": 0.06500093638896942, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09722222574055195, + "step": 86 + }, + { + "completion_length": 172.2777837117513, + "epoch": 2.9101123595505616, + "grad_norm": 0.00019800203126626523, + "kl": 0.053792317708333336, + "learning_rate": 9.838250912193145e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 87 + }, + { + "completion_length": 233.4513956705729, + "epoch": 2.943820224719101, + "grad_norm": 0.09957485190796092, + "kl": 0.043009440104166664, + "learning_rate": 9.833104251563055e-07, + "loss": 0.0, + "reward": 0.19027779251337051, + "reward_std": 0.07107630744576454, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027777798473835, + "step": 88 + }, + { + "completion_length": 180.3541717529297, + "epoch": 2.9775280898876404, + "grad_norm": 0.12280386436457202, + "kl": 0.057718912760416664, + "learning_rate": 9.827878376945958e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 89 + }, + { + "completion_length": 251.5, + "epoch": 3.0, + "grad_norm": 0.14338243971365205, + "kl": 0.032012939453125, + "learning_rate": 9.822573373993293e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 90 + }, + { + "completion_length": 201.59722391764322, + "epoch": 3.033707865168539, + "grad_norm": 0.014709720839822223, + "kl": 0.07157389322916667, + "learning_rate": 9.817189329653416e-07, + "loss": 0.0, + "reward": 0.3215278113881747, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 91 + }, + { + "completion_length": 235.28472900390625, + "epoch": 3.067415730337079, + "grad_norm": 0.00013796096139160368, + "kl": 0.043070475260416664, + "learning_rate": 9.81172633217015e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 92 + }, + { + "completion_length": 230.97223154703775, + "epoch": 3.101123595505618, + "grad_norm": 0.0002011791856491358, + "kl": 0.055033365885416664, + "learning_rate": 9.806184471081357e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 93 + }, + { + "completion_length": 218.06250508626303, + "epoch": 3.134831460674157, + "grad_norm": 0.06404118560370763, + "kl": 0.051005045572916664, + "learning_rate": 9.800563837217464e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 94 + }, + { + "completion_length": 201.57639821370444, + "epoch": 3.168539325842697, + "grad_norm": 0.00020865180710594378, + "kl": 0.061767578125, + "learning_rate": 9.794864522699965e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 95 + }, + { + "completion_length": 198.97222900390625, + "epoch": 3.202247191011236, + "grad_norm": 0.00020653196924200095, + "kl": 0.055094401041666664, + "learning_rate": 9.789086620939935e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 96 + }, + { + "completion_length": 197.62500508626303, + "epoch": 3.235955056179775, + "grad_norm": 0.2017658519088373, + "kl": 0.056233723958333336, + "learning_rate": 9.783230226636472e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 97 + }, + { + "completion_length": 187.84028244018555, + "epoch": 3.2696629213483144, + "grad_norm": 0.16626034072804743, + "kl": 0.08353678385416667, + "learning_rate": 9.777295435775163e-07, + "loss": 0.0, + "reward": 0.12083334475755692, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 98 + }, + { + "completion_length": 200.7013931274414, + "epoch": 3.303370786516854, + "grad_norm": 0.0013681873848743736, + "kl": 0.10001627604166667, + "learning_rate": 9.771282345626504e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 99 + }, + { + "completion_length": 242.1388956705729, + "epoch": 3.337078651685393, + "grad_norm": 0.00022282721979562337, + "kl": 0.0657958984375, + "learning_rate": 9.765191054744304e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 100 + }, + { + "completion_length": 264.72222900390625, + "epoch": 3.370786516853933, + "grad_norm": 0.10744150708513822, + "kl": 0.06624348958333333, + "learning_rate": 9.759021662964078e-07, + "loss": 0.0, + "reward": 0.2388889119029045, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1388888955116272, + "step": 101 + }, + { + "completion_length": 254.67362467447916, + "epoch": 3.404494382022472, + "grad_norm": 0.24335354352345753, + "kl": 0.07767740885416667, + "learning_rate": 9.7527742714014e-07, + "loss": 0.0, + "reward": 0.14791667958100638, + "reward_std": 0.0216060404976209, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 102 + }, + { + "completion_length": 193.09722391764322, + "epoch": 3.438202247191011, + "grad_norm": 0.17100766164600184, + "kl": 0.10367838541666667, + "learning_rate": 9.746448982450254e-07, + "loss": 0.0, + "reward": 0.2805555835366249, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.18055556217829385, + "step": 103 + }, + { + "completion_length": 211.04861450195312, + "epoch": 3.4719101123595504, + "grad_norm": 0.000278803736734657, + "kl": 0.09562174479166667, + "learning_rate": 9.740045899781352e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 104 + }, + { + "completion_length": 204.84722646077475, + "epoch": 3.50561797752809, + "grad_norm": 0.23313609085933693, + "kl": 0.11629231770833333, + "learning_rate": 9.733565128340434e-07, + "loss": 0.0, + "reward": 0.1833333522081375, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.08333333830038707, + "step": 105 + }, + { + "completion_length": 175.8055623372396, + "epoch": 3.539325842696629, + "grad_norm": 0.004782321605843764, + "kl": 0.17049153645833334, + "learning_rate": 9.72700677434655e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 106 + }, + { + "completion_length": 249.0277837117513, + "epoch": 3.5730337078651684, + "grad_norm": 0.041941865975148804, + "kl": 0.084716796875, + "learning_rate": 9.72037094529032e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 107 + }, + { + "completion_length": 193.6111157735189, + "epoch": 3.606741573033708, + "grad_norm": 0.18704647717273898, + "kl": 0.10758463541666667, + "learning_rate": 9.713657749932171e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 108 + }, + { + "completion_length": 233.9444465637207, + "epoch": 3.640449438202247, + "grad_norm": 0.00015307778209937512, + "kl": 0.11319986979166667, + "learning_rate": 9.706867298300551e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 109 + }, + { + "completion_length": 266.32639567057294, + "epoch": 3.6741573033707864, + "grad_norm": 0.04014478645060059, + "kl": 0.09733072916666667, + "learning_rate": 9.69999970169013e-07, + "loss": 0.0, + "reward": 0.23958334575096765, + "reward_std": 0.03328863965968291, + "rewards/format_reward_func": 0.0937500074505806, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 110 + }, + { + "completion_length": 211.11806106567383, + "epoch": 3.7078651685393256, + "grad_norm": 0.12298179559308783, + "kl": 0.1365966796875, + "learning_rate": 9.69305507265998e-07, + "loss": 0.0, + "reward": 0.23888890693585077, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.13888889302810034, + "step": 111 + }, + { + "completion_length": 208.5833422342936, + "epoch": 3.741573033707865, + "grad_norm": 0.0002692241293662926, + "kl": 0.12841796875, + "learning_rate": 9.686033525031719e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 112 + }, + { + "completion_length": 208.28472900390625, + "epoch": 3.7752808988764044, + "grad_norm": 0.01509652182577804, + "kl": 0.119384765625, + "learning_rate": 9.67893517388765e-07, + "loss": 0.0, + "reward": 0.09791667386889458, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.0, + "step": 113 + }, + { + "completion_length": 235.4791717529297, + "epoch": 3.808988764044944, + "grad_norm": 0.019579174671086906, + "kl": 0.1146240234375, + "learning_rate": 9.671760135568881e-07, + "loss": 0.0, + "reward": 0.1541666785875956, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 114 + }, + { + "completion_length": 195.80555979410806, + "epoch": 3.842696629213483, + "grad_norm": 0.0002531309101509351, + "kl": 0.16162109375, + "learning_rate": 9.664508527673413e-07, + "loss": 0.0, + "reward": 0.3777778223156929, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 115 + }, + { + "completion_length": 180.3333396911621, + "epoch": 3.8764044943820224, + "grad_norm": 0.057927772735377935, + "kl": 0.14375813802083334, + "learning_rate": 9.657180469054212e-07, + "loss": 0.0, + "reward": 0.21805557111899057, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11805555721124013, + "step": 116 + }, + { + "completion_length": 138.5902837117513, + "epoch": 3.9101123595505616, + "grad_norm": 0.24189723785234268, + "kl": 0.19287109375, + "learning_rate": 9.649776079817259e-07, + "loss": 0.0, + "reward": 0.3152778049310048, + "reward_std": 0.07107630744576454, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2152777798473835, + "step": 117 + }, + { + "completion_length": 192.15972900390625, + "epoch": 3.943820224719101, + "grad_norm": 0.0001995768225525734, + "kl": 0.141845703125, + "learning_rate": 9.642295481319587e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 118 + }, + { + "completion_length": 218.4305674235026, + "epoch": 3.9775280898876404, + "grad_norm": 0.00040888295429870506, + "kl": 0.14925130208333334, + "learning_rate": 9.634738796167295e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 119 + }, + { + "completion_length": 310.5, + "epoch": 4.0, + "grad_norm": 0.00040888295429870506, + "kl": 0.06365966796875, + "learning_rate": 9.62710614821352e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 120 + }, + { + "completion_length": 241.63195037841797, + "epoch": 4.033707865168539, + "grad_norm": 0.0001456954366989617, + "kl": 0.14762369791666666, + "learning_rate": 9.619397662556433e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 121 + }, + { + "completion_length": 164.05555979410806, + "epoch": 4.067415730337078, + "grad_norm": 0.1252265197504326, + "kl": 0.19270833333333334, + "learning_rate": 9.611613465537168e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 122 + }, + { + "completion_length": 162.61111450195312, + "epoch": 4.101123595505618, + "grad_norm": 0.08228984656904792, + "kl": 0.18977864583333334, + "learning_rate": 9.603753684737764e-07, + "loss": 0.0, + "reward": 0.21736112982034683, + "reward_std": 0.0604126105705897, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.118055559694767, + "step": 123 + }, + { + "completion_length": 203.56250127156576, + "epoch": 4.134831460674158, + "grad_norm": 0.27153713868114754, + "kl": 0.12361653645833333, + "learning_rate": 9.59581844897906e-07, + "loss": 0.0, + "reward": 0.19722223778565726, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0972222238779068, + "step": 124 + }, + { + "completion_length": 199.56945164998373, + "epoch": 4.168539325842697, + "grad_norm": 0.00033787586634761004, + "kl": 0.1533203125, + "learning_rate": 9.587807888318605e-07, + "loss": 0.0, + "reward": 0.21111113081375757, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111628512542, + "step": 125 + }, + { + "completion_length": 176.59722900390625, + "epoch": 4.202247191011236, + "grad_norm": 0.2597221365107078, + "kl": 0.15885416666666666, + "learning_rate": 9.579722134048505e-07, + "loss": 0.0, + "reward": 0.20416668305794397, + "reward_std": 0.07107630744576454, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.10416667039195697, + "step": 126 + }, + { + "completion_length": 127.36111450195312, + "epoch": 4.235955056179775, + "grad_norm": 0.24963870644554398, + "kl": 0.20279947916666666, + "learning_rate": 9.571561318693283e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 127 + }, + { + "completion_length": 187.31250762939453, + "epoch": 4.269662921348314, + "grad_norm": 0.02739209214915287, + "kl": 0.18961588541666666, + "learning_rate": 9.5633255760077e-07, + "loss": 0.0, + "reward": 0.26527779797712964, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 128 + }, + { + "completion_length": 175.40278116861978, + "epoch": 4.303370786516854, + "grad_norm": 0.00028731175072391977, + "kl": 0.18115234375, + "learning_rate": 9.555015040974577e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 129 + }, + { + "completion_length": 246.54167683919272, + "epoch": 4.337078651685394, + "grad_norm": 0.000346914571861134, + "kl": 0.14461263020833334, + "learning_rate": 9.546629849802561e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 130 + }, + { + "completion_length": 234.53472900390625, + "epoch": 4.370786516853933, + "grad_norm": 0.036764460783073716, + "kl": 0.21565755208333334, + "learning_rate": 9.538170139923909e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 131 + }, + { + "completion_length": 217.4652862548828, + "epoch": 4.404494382022472, + "grad_norm": 0.14904165841279143, + "kl": 0.20947265625, + "learning_rate": 9.529636049992233e-07, + "loss": 0.0, + "reward": 0.17638890941937765, + "reward_std": 0.049337549755970635, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889302810033, + "step": 132 + }, + { + "completion_length": 216.59722900390625, + "epoch": 4.438202247191011, + "grad_norm": 0.13024382600501613, + "kl": 0.2532552083333333, + "learning_rate": 9.521027719880222e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 133 + }, + { + "completion_length": 213.2083396911621, + "epoch": 4.47191011235955, + "grad_norm": 0.00032395357966553327, + "kl": 0.22835286458333334, + "learning_rate": 9.512345290677349e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 134 + }, + { + "completion_length": 177.13195037841797, + "epoch": 4.50561797752809, + "grad_norm": 0.0003766339875477684, + "kl": 0.24503580729166666, + "learning_rate": 9.503588904687569e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 135 + }, + { + "completion_length": 203.2430648803711, + "epoch": 4.539325842696629, + "grad_norm": 0.03295030655767121, + "kl": 0.3694661458333333, + "learning_rate": 9.494758705426976e-07, + "loss": 0.0, + "reward": 0.26527780542771023, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 136 + }, + { + "completion_length": 266.88889567057294, + "epoch": 4.573033707865169, + "grad_norm": 0.022213064895931858, + "kl": 0.150146484375, + "learning_rate": 9.485854837621454e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 137 + }, + { + "completion_length": 160.25694783528647, + "epoch": 4.606741573033708, + "grad_norm": 0.0005588132091927237, + "kl": 0.2586263020833333, + "learning_rate": 9.476877447204308e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 138 + }, + { + "completion_length": 222.2638931274414, + "epoch": 4.640449438202247, + "grad_norm": 0.7039155412741445, + "kl": 2336.2388509114585, + "learning_rate": 9.467826681313865e-07, + "loss": 0.2337, + "reward": 0.2597222502032916, + "reward_std": 0.07714731867114703, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 139 + }, + { + "completion_length": 135.27778244018555, + "epoch": 4.674157303370786, + "grad_norm": 308.3545025470755, + "kl": 0.2586263020833333, + "learning_rate": 9.458702688291071e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 140 + }, + { + "completion_length": 206.41667048136392, + "epoch": 4.707865168539326, + "grad_norm": 0.09237966655849307, + "kl": 0.22493489583333334, + "learning_rate": 9.449505617677057e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 141 + }, + { + "completion_length": 226.6527862548828, + "epoch": 4.741573033707866, + "grad_norm": 0.02009681785356261, + "kl": 0.18098958333333334, + "learning_rate": 9.440235620210682e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 142 + }, + { + "completion_length": 198.64583841959634, + "epoch": 4.775280898876405, + "grad_norm": 0.22738468685304128, + "kl": 0.23282877604166666, + "learning_rate": 9.430892847826071e-07, + "loss": 0.0, + "reward": 0.1833333522081375, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0833333358168602, + "step": 143 + }, + { + "completion_length": 196.8263956705729, + "epoch": 4.808988764044944, + "grad_norm": 0.2961657685439418, + "kl": 0.23909505208333334, + "learning_rate": 9.421477453650117e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 144 + }, + { + "completion_length": 235.47222900390625, + "epoch": 4.842696629213483, + "grad_norm": 0.00041434419997458336, + "kl": 0.172607421875, + "learning_rate": 9.411989591999982e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 145 + }, + { + "completion_length": 208.03472518920898, + "epoch": 4.876404494382022, + "grad_norm": 0.00020879556198166483, + "kl": 0.191650390625, + "learning_rate": 9.402429418380553e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 146 + }, + { + "completion_length": 162.6388905843099, + "epoch": 4.910112359550562, + "grad_norm": 0.08522641777242758, + "kl": 0.2947591145833333, + "learning_rate": 9.392797089481908e-07, + "loss": 0.0, + "reward": 0.16250001390775046, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 147 + }, + { + "completion_length": 205.2638956705729, + "epoch": 4.943820224719101, + "grad_norm": 0.2066694345630605, + "kl": 0.232421875, + "learning_rate": 9.383092763176738e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 148 + }, + { + "completion_length": 181.20833841959634, + "epoch": 4.97752808988764, + "grad_norm": 0.00037175360482054684, + "kl": 0.28759765625, + "learning_rate": 9.373316598517762e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 149 + }, + { + "completion_length": 148.5, + "epoch": 5.0, + "grad_norm": 0.060460306788254634, + "kl": 0.3077392578125, + "learning_rate": 9.363468755735122e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 150 + }, + { + "completion_length": 206.1666717529297, + "epoch": 5.033707865168539, + "grad_norm": 0.000794347409907963, + "kl": 0.2976888020833333, + "learning_rate": 9.353549396233758e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 151 + }, + { + "completion_length": 171.43056360880533, + "epoch": 5.067415730337078, + "grad_norm": 0.0003038893271057909, + "kl": 0.2910970052083333, + "learning_rate": 9.343558682590755e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 152 + }, + { + "completion_length": 230.3402837117513, + "epoch": 5.101123595505618, + "grad_norm": 0.0040508013033015555, + "kl": 0.1630859375, + "learning_rate": 9.333496778552688e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 153 + }, + { + "completion_length": 175.28472518920898, + "epoch": 5.134831460674158, + "grad_norm": 0.34605833615760045, + "kl": 0.3408203125, + "learning_rate": 9.323363849032931e-07, + "loss": 0.0, + "reward": 0.2597222477197647, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1597222238779068, + "step": 154 + }, + { + "completion_length": 167.9513956705729, + "epoch": 5.168539325842697, + "grad_norm": 0.001301600680856017, + "kl": 0.3631184895833333, + "learning_rate": 9.31316006010896e-07, + "loss": 0.0, + "reward": 0.23888889948527017, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.13888889302810034, + "step": 155 + }, + { + "completion_length": 205.1875025431315, + "epoch": 5.202247191011236, + "grad_norm": 0.20752073381421987, + "kl": 0.2676595052083333, + "learning_rate": 9.302885579019626e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 156 + }, + { + "completion_length": 181.08333587646484, + "epoch": 5.235955056179775, + "grad_norm": 0.10876389088755385, + "kl": 0.3362630208333333, + "learning_rate": 9.292540574162416e-07, + "loss": 0.0, + "reward": 0.2736111357808113, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.173611119389534, + "step": 157 + }, + { + "completion_length": 226.5208422342936, + "epoch": 5.269662921348314, + "grad_norm": 0.20273924828563913, + "kl": 0.239990234375, + "learning_rate": 9.282125215090693e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 158 + }, + { + "completion_length": 166.7361157735189, + "epoch": 5.303370786516854, + "grad_norm": 0.0006707969337256801, + "kl": 0.3128255208333333, + "learning_rate": 9.271639672510916e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 159 + }, + { + "completion_length": 143.4305623372396, + "epoch": 5.337078651685394, + "grad_norm": 0.04387290401383505, + "kl": 0.33642578125, + "learning_rate": 9.261084118279846e-07, + "loss": 0.0, + "reward": 0.16250001390775046, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 160 + }, + { + "completion_length": 213.2638956705729, + "epoch": 5.370786516853933, + "grad_norm": 0.0004734125541091039, + "kl": 0.20035807291666666, + "learning_rate": 9.250458725401724e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 161 + }, + { + "completion_length": 192.3888956705729, + "epoch": 5.404494382022472, + "grad_norm": 0.004020462189990586, + "kl": 0.4783528645833333, + "learning_rate": 9.239763668025439e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 162 + }, + { + "completion_length": 177.59027989705405, + "epoch": 5.438202247191011, + "grad_norm": 0.03052962356501358, + "kl": 0.3419596354166667, + "learning_rate": 9.228999121441672e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 163 + }, + { + "completion_length": 181.23611195882162, + "epoch": 5.47191011235955, + "grad_norm": 0.000676866776546439, + "kl": 0.2516276041666667, + "learning_rate": 9.218165262080022e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 164 + }, + { + "completion_length": 213.2291717529297, + "epoch": 5.50561797752809, + "grad_norm": 0.00044212486293047085, + "kl": 0.236572265625, + "learning_rate": 9.207262267506121e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 165 + }, + { + "completion_length": 170.65972900390625, + "epoch": 5.539325842696629, + "grad_norm": 0.00035606054092954985, + "kl": 0.298828125, + "learning_rate": 9.196290316418711e-07, + "loss": 0.0, + "reward": 0.3777778148651123, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 166 + }, + { + "completion_length": 177.84028498331705, + "epoch": 5.573033707865169, + "grad_norm": 0.04607000551350566, + "kl": 0.2532552083333333, + "learning_rate": 9.18524958864673e-07, + "loss": 0.0, + "reward": 0.1763889044523239, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889054457347, + "step": 167 + }, + { + "completion_length": 194.1527837117513, + "epoch": 5.606741573033708, + "grad_norm": 0.20808121936182614, + "kl": 0.267578125, + "learning_rate": 9.174140265146355e-07, + "loss": 0.0, + "reward": 0.1625000163912773, + "reward_std": 0.0589255653321743, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0625000037252903, + "step": 168 + }, + { + "completion_length": 218.64584096272787, + "epoch": 5.640449438202247, + "grad_norm": 0.05286339425480217, + "kl": 0.19921875, + "learning_rate": 9.162962527998037e-07, + "loss": 0.0, + "reward": 0.15000001092751822, + "reward_std": 0.003928370773792267, + "rewards/format_reward_func": 0.09444445247451465, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 169 + }, + { + "completion_length": 191.7152837117513, + "epoch": 5.674157303370786, + "grad_norm": 0.020660720733936037, + "kl": 0.22037760416666666, + "learning_rate": 9.151716560403519e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 170 + }, + { + "completion_length": 166.56250508626303, + "epoch": 5.707865168539326, + "grad_norm": 0.022941040890898625, + "kl": 0.2823893229166667, + "learning_rate": 9.140402546682834e-07, + "loss": 0.0, + "reward": 0.2041666880249977, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.10416666666666667, + "step": 171 + }, + { + "completion_length": 199.54861450195312, + "epoch": 5.741573033707866, + "grad_norm": 0.0516847916750005, + "kl": 0.28076171875, + "learning_rate": 9.129020672271281e-07, + "loss": 0.0, + "reward": 0.2527778049310048, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 172 + }, + { + "completion_length": 217.98611958821616, + "epoch": 5.775280898876405, + "grad_norm": 0.3192851660671531, + "kl": 0.17618815104166666, + "learning_rate": 9.11757112371639e-07, + "loss": 0.0, + "reward": 0.3083333695928256, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2083333432674408, + "step": 173 + }, + { + "completion_length": 218.20833841959634, + "epoch": 5.808988764044944, + "grad_norm": 0.0002236331198074834, + "kl": 0.1943359375, + "learning_rate": 9.10605408867486e-07, + "loss": 0.0, + "reward": 0.12083334227403005, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 174 + }, + { + "completion_length": 160.8819497426351, + "epoch": 5.842696629213483, + "grad_norm": 0.6363090100232928, + "kl": 0.28369140625, + "learning_rate": 9.094469755909482e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 175 + }, + { + "completion_length": 180.36806106567383, + "epoch": 5.876404494382022, + "grad_norm": 0.17228473209014805, + "kl": 0.3553873697916667, + "learning_rate": 9.082818315286054e-07, + "loss": 0.0, + "reward": 0.14861112336317697, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.048611111318071686, + "step": 176 + }, + { + "completion_length": 160.80556106567383, + "epoch": 5.910112359550562, + "grad_norm": 0.26204832610248274, + "kl": 0.24576822916666666, + "learning_rate": 9.071099957770263e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 177 + }, + { + "completion_length": 159.0138931274414, + "epoch": 5.943820224719101, + "grad_norm": 0.3760477740103961, + "kl": 0.3387044270833333, + "learning_rate": 9.059314875424552e-07, + "loss": 0.0, + "reward": 0.3430555760860443, + "reward_std": 0.09375367189447086, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.24305556217829385, + "step": 178 + }, + { + "completion_length": 185.4583396911621, + "epoch": 5.97752808988764, + "grad_norm": 0.20131920510232387, + "kl": 0.253173828125, + "learning_rate": 9.047463261404978e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 179 + }, + { + "completion_length": 71.0, + "epoch": 6.0, + "grad_norm": 0.00024713865826551305, + "kl": 0.369140625, + "learning_rate": 9.035545309958046e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 180 + }, + { + "completion_length": 180.29861704508463, + "epoch": 6.033707865168539, + "grad_norm": 0.08206716249575104, + "kl": 0.22086588541666666, + "learning_rate": 9.023561216417519e-07, + "loss": 0.0, + "reward": 0.22500002135833105, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.12500000496705374, + "step": 181 + }, + { + "completion_length": 163.3263905843099, + "epoch": 6.067415730337078, + "grad_norm": 0.0002197484412041211, + "kl": 0.26318359375, + "learning_rate": 9.011511177201224e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 182 + }, + { + "completion_length": 139.90278244018555, + "epoch": 6.101123595505618, + "grad_norm": 0.3543602549829108, + "kl": 0.4309895833333333, + "learning_rate": 8.999395389807829e-07, + "loss": 0.0, + "reward": 0.3361111481984456, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.23611111318071684, + "step": 183 + }, + { + "completion_length": 219.1041742960612, + "epoch": 6.134831460674158, + "grad_norm": 0.0005190749745494502, + "kl": 0.19498697916666666, + "learning_rate": 8.987214052813603e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 184 + }, + { + "completion_length": 208.36805979410806, + "epoch": 6.168539325842697, + "grad_norm": 0.19615702437440735, + "kl": 0.4274088541666667, + "learning_rate": 8.974967365869173e-07, + "loss": 0.0, + "reward": 0.13402778406937918, + "reward_std": 0.030716917167107265, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 185 + }, + { + "completion_length": 167.9166692097982, + "epoch": 6.202247191011236, + "grad_norm": 0.0003164021561801834, + "kl": 0.24886067708333334, + "learning_rate": 8.962655529696235e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 186 + }, + { + "completion_length": 234.4305623372396, + "epoch": 6.235955056179775, + "grad_norm": 0.05168938277747663, + "kl": 0.23063151041666666, + "learning_rate": 8.950278746084279e-07, + "loss": 0.0, + "reward": 0.10694445421298344, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.006944444651405017, + "step": 187 + }, + { + "completion_length": 162.4583371480306, + "epoch": 6.269662921348314, + "grad_norm": 0.0002023239645274476, + "kl": 0.2801106770833333, + "learning_rate": 8.937837217887272e-07, + "loss": 0.0, + "reward": 0.21111111591259638, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 188 + }, + { + "completion_length": 150.44445037841797, + "epoch": 6.303370786516854, + "grad_norm": 0.0003562133681414341, + "kl": 0.2545572916666667, + "learning_rate": 8.925331149020337e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 189 + }, + { + "completion_length": 209.24305979410806, + "epoch": 6.337078651685394, + "grad_norm": 0.36604296557184446, + "kl": 0.24519856770833334, + "learning_rate": 8.912760744456415e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 190 + }, + { + "completion_length": 207.37500508626303, + "epoch": 6.370786516853933, + "grad_norm": 0.0004352909840355996, + "kl": 0.2854817708333333, + "learning_rate": 8.900126210222893e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 191 + }, + { + "completion_length": 207.30555979410806, + "epoch": 6.404494382022472, + "grad_norm": 0.0002532856528219797, + "kl": 0.19596354166666666, + "learning_rate": 8.887427753398247e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 192 + }, + { + "completion_length": 194.8333371480306, + "epoch": 6.438202247191011, + "grad_norm": 0.04547724870864244, + "kl": 0.22916666666666666, + "learning_rate": 8.874665582108624e-07, + "loss": 0.0, + "reward": 0.10694445421298344, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.006944444651405017, + "step": 193 + }, + { + "completion_length": 199.3055648803711, + "epoch": 6.47191011235955, + "grad_norm": 0.07322371441721755, + "kl": 0.2294921875, + "learning_rate": 8.861839905524451e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 194 + }, + { + "completion_length": 192.9652862548828, + "epoch": 6.50561797752809, + "grad_norm": 0.0005388255772049631, + "kl": 0.23331705729166666, + "learning_rate": 8.848950933856997e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 195 + }, + { + "completion_length": 176.9513905843099, + "epoch": 6.539325842696629, + "grad_norm": 0.0003153221960758335, + "kl": 0.24983723958333334, + "learning_rate": 8.83599887835493e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 196 + }, + { + "completion_length": 177.02084096272787, + "epoch": 6.573033707865169, + "grad_norm": 0.11811051043287392, + "kl": 0.2789713541666667, + "learning_rate": 8.822983951300854e-07, + "loss": 0.0, + "reward": 0.19027779002984366, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 197 + }, + { + "completion_length": 239.79861958821616, + "epoch": 6.606741573033708, + "grad_norm": 0.05400355549485076, + "kl": 0.1689453125, + "learning_rate": 8.809906366007831e-07, + "loss": 0.0, + "reward": 0.2388889119029045, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.13888889302810034, + "step": 198 + }, + { + "completion_length": 158.7152837117513, + "epoch": 6.640449438202247, + "grad_norm": 0.0005236301772094877, + "kl": 0.3155924479166667, + "learning_rate": 8.796766336815882e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 199 + }, + { + "completion_length": 220.93055979410806, + "epoch": 6.674157303370786, + "grad_norm": 0.07404539059728213, + "kl": 0.21695963541666666, + "learning_rate": 8.783564079088476e-07, + "loss": 0.0, + "reward": 0.19027779499689737, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 200 + }, + { + "completion_length": 205.39584096272787, + "epoch": 6.707865168539326, + "grad_norm": 0.0005631441688288627, + "kl": 0.21695963541666666, + "learning_rate": 8.770299809209003e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 201 + }, + { + "completion_length": 143.1388905843099, + "epoch": 6.741573033707866, + "grad_norm": 0.00027517441307582907, + "kl": 0.28564453125, + "learning_rate": 8.75697374457722e-07, + "loss": 0.0, + "reward": 0.3777777949968974, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 202 + }, + { + "completion_length": 162.7291742960612, + "epoch": 6.775280898876405, + "grad_norm": 0.30060312709467973, + "kl": 0.3131510416666667, + "learning_rate": 8.743586103605697e-07, + "loss": 0.0, + "reward": 0.14166667809089026, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666666666666664, + "step": 203 + }, + { + "completion_length": 141.17361704508463, + "epoch": 6.808988764044944, + "grad_norm": 0.17881444073136876, + "kl": 0.25634765625, + "learning_rate": 8.73013710571623e-07, + "loss": 0.0, + "reward": 0.3569444566965103, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2569444527228673, + "step": 204 + }, + { + "completion_length": 172.7500089009603, + "epoch": 6.842696629213483, + "grad_norm": 0.32498611478058176, + "kl": 0.2928059895833333, + "learning_rate": 8.716626971336247e-07, + "loss": 0.0, + "reward": 0.18333334475755692, + "reward_std": 0.05143445233503977, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0833333358168602, + "step": 205 + }, + { + "completion_length": 243.64583587646484, + "epoch": 6.876404494382022, + "grad_norm": 0.1057262560369027, + "kl": 0.19791666666666666, + "learning_rate": 8.703055921895199e-07, + "loss": 0.0, + "reward": 0.16944445669651031, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06944444651405017, + "step": 206 + }, + { + "completion_length": 205.63195037841797, + "epoch": 6.910112359550562, + "grad_norm": 0.21916204378698378, + "kl": 0.2490234375, + "learning_rate": 8.689424179820922e-07, + "loss": 0.0, + "reward": 0.41458337257305783, + "reward_std": 0.041247895608345665, + "rewards/format_reward_func": 0.09513889625668526, + "rewards/solution_reward_func": 0.3194444527228673, + "step": 207 + }, + { + "completion_length": 226.7638956705729, + "epoch": 6.943820224719101, + "grad_norm": 0.00042143576798072427, + "kl": 0.2542317708333333, + "learning_rate": 8.675731968536002e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 208 + }, + { + "completion_length": 157.97222773234049, + "epoch": 6.97752808988764, + "grad_norm": 0.17012672263950396, + "kl": 0.3225911458333333, + "learning_rate": 8.661979512454104e-07, + "loss": 0.0, + "reward": 0.3708333646257718, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333432674408, + "step": 209 + }, + { + "completion_length": 230.0, + "epoch": 7.0, + "grad_norm": 0.17012672263950396, + "kl": 0.1776123046875, + "learning_rate": 8.648167036976302e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 210 + }, + { + "completion_length": 208.6041742960612, + "epoch": 7.033707865168539, + "grad_norm": 0.3317139962221015, + "kl": 0.24576822916666666, + "learning_rate": 8.634294768487374e-07, + "loss": 0.0, + "reward": 0.14166667560736337, + "reward_std": 0.0483945868909359, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666668529311814, + "step": 211 + }, + { + "completion_length": 164.69444783528647, + "epoch": 7.067415730337078, + "grad_norm": 0.0008387288617045326, + "kl": 0.2560221354166667, + "learning_rate": 8.620362934352108e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 212 + }, + { + "completion_length": 183.25694783528647, + "epoch": 7.101123595505618, + "grad_norm": 0.00023557532696121805, + "kl": 0.21728515625, + "learning_rate": 8.606371762911555e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 213 + }, + { + "completion_length": 161.52084096272787, + "epoch": 7.134831460674158, + "grad_norm": 0.048977062347091135, + "kl": 0.3642578125, + "learning_rate": 8.592321483479303e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 214 + }, + { + "completion_length": 192.1180648803711, + "epoch": 7.168539325842697, + "grad_norm": 0.00017611911065657207, + "kl": 0.2897135416666667, + "learning_rate": 8.578212326337714e-07, + "loss": 0.0, + "reward": 0.12083334475755692, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.020833333333333332, + "step": 215 + }, + { + "completion_length": 229.6388931274414, + "epoch": 7.202247191011236, + "grad_norm": 0.12507225670346744, + "kl": 0.1572265625, + "learning_rate": 8.564044522734146e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 216 + }, + { + "completion_length": 176.19445037841797, + "epoch": 7.235955056179775, + "grad_norm": 0.03559034369746066, + "kl": 0.23421223958333334, + "learning_rate": 8.549818304877163e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 217 + }, + { + "completion_length": 171.13194529215494, + "epoch": 7.269662921348314, + "grad_norm": 0.00017182336570387106, + "kl": 0.19759114583333334, + "learning_rate": 8.535533905932737e-07, + "loss": 0.0, + "reward": 0.13472223033507666, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 218 + }, + { + "completion_length": 201.3402837117513, + "epoch": 7.303370786516854, + "grad_norm": 0.049462007516130634, + "kl": 0.22412109375, + "learning_rate": 8.521191560020417e-07, + "loss": 0.0, + "reward": 0.3708333646257718, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333432674408, + "step": 219 + }, + { + "completion_length": 183.0694491068522, + "epoch": 7.337078651685394, + "grad_norm": 0.19395499494209628, + "kl": 0.212890625, + "learning_rate": 8.506791502209496e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 220 + }, + { + "completion_length": 207.61805979410806, + "epoch": 7.370786516853933, + "grad_norm": 0.00026939239925366705, + "kl": 0.183837890625, + "learning_rate": 8.492333968515158e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 221 + }, + { + "completion_length": 181.2361157735189, + "epoch": 7.404494382022472, + "grad_norm": 0.00016424525366185846, + "kl": 0.21110026041666666, + "learning_rate": 8.477819195894614e-07, + "loss": 0.0, + "reward": 0.2319444641470909, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1319444477558136, + "step": 222 + }, + { + "completion_length": 186.52778116861978, + "epoch": 7.438202247191011, + "grad_norm": 0.25062121098686574, + "kl": 0.20939127604166666, + "learning_rate": 8.463247422243205e-07, + "loss": 0.0, + "reward": 0.1763889044523239, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889054457347, + "step": 223 + }, + { + "completion_length": 158.52778244018555, + "epoch": 7.47191011235955, + "grad_norm": 0.049706078845733456, + "kl": 0.241455078125, + "learning_rate": 8.448618886390521e-07, + "loss": 0.0, + "reward": 0.1541666785875956, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 224 + }, + { + "completion_length": 202.50000762939453, + "epoch": 7.50561797752809, + "grad_norm": 0.00034885466425547024, + "kl": 0.2732747395833333, + "learning_rate": 8.433933828096472e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 225 + }, + { + "completion_length": 188.61806360880533, + "epoch": 7.539325842696629, + "grad_norm": 0.0006087230988606237, + "kl": 0.20906575520833334, + "learning_rate": 8.419192488047369e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 226 + }, + { + "completion_length": 177.0763956705729, + "epoch": 7.573033707865169, + "grad_norm": 0.3079793799664563, + "kl": 0.20548502604166666, + "learning_rate": 8.404395107851966e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 227 + }, + { + "completion_length": 152.19445164998373, + "epoch": 7.606741573033708, + "grad_norm": 0.0002703341260393453, + "kl": 0.2534993489583333, + "learning_rate": 8.389541930037516e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 228 + }, + { + "completion_length": 181.15972773234049, + "epoch": 7.640449438202247, + "grad_norm": 0.000319301036115163, + "kl": 0.191162109375, + "learning_rate": 8.374633198045784e-07, + "loss": 0.0, + "reward": 0.21805557360251746, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11805555907388528, + "step": 229 + }, + { + "completion_length": 218.43055979410806, + "epoch": 7.674157303370786, + "grad_norm": 0.130298487527294, + "kl": 0.290771484375, + "learning_rate": 8.359669156229061e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 230 + }, + { + "completion_length": 210.86111450195312, + "epoch": 7.707865168539326, + "grad_norm": 0.00015648868990740126, + "kl": 0.19596354166666666, + "learning_rate": 8.344650049846164e-07, + "loss": 0.0, + "reward": 0.14166667809089026, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666666666666664, + "step": 231 + }, + { + "completion_length": 204.4861183166504, + "epoch": 7.741573033707866, + "grad_norm": 0.0614349615980157, + "kl": 0.3232421875, + "learning_rate": 8.329576125058405e-07, + "loss": 0.0, + "reward": 0.14166667809089026, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.041666666666666664, + "step": 232 + }, + { + "completion_length": 196.45833841959634, + "epoch": 7.775280898876405, + "grad_norm": 0.05285516640060137, + "kl": 0.20735677083333334, + "learning_rate": 8.314447628925567e-07, + "loss": 0.0, + "reward": 0.3222222502032916, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2222222307076057, + "step": 233 + }, + { + "completion_length": 163.19445037841797, + "epoch": 7.808988764044944, + "grad_norm": 0.0866450492488482, + "kl": 0.20817057291666666, + "learning_rate": 8.299264809401849e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 234 + }, + { + "completion_length": 165.0138956705729, + "epoch": 7.842696629213483, + "grad_norm": 0.02856230145049242, + "kl": 0.3448893229166667, + "learning_rate": 8.284027915331805e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 235 + }, + { + "completion_length": 199.34722900390625, + "epoch": 7.876404494382022, + "grad_norm": 0.0007307723400553277, + "kl": 0.2740885416666667, + "learning_rate": 8.268737196446263e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 236 + }, + { + "completion_length": 190.95834096272787, + "epoch": 7.910112359550562, + "grad_norm": 0.0001991437977729803, + "kl": 0.5262044270833334, + "learning_rate": 8.253392903358231e-07, + "loss": 0.0001, + "reward": 0.1833333522081375, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0833333358168602, + "step": 237 + }, + { + "completion_length": 214.2152837117513, + "epoch": 7.943820224719101, + "grad_norm": 0.057150694404137586, + "kl": 0.19197591145833334, + "learning_rate": 8.237995287558801e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 238 + }, + { + "completion_length": 194.34028244018555, + "epoch": 7.97752808988764, + "grad_norm": 0.00019928756967534184, + "kl": 0.21809895833333334, + "learning_rate": 8.222544601413004e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 239 + }, + { + "completion_length": 218.75, + "epoch": 8.0, + "grad_norm": 0.07678030652150342, + "kl": 0.2730712890625, + "learning_rate": 8.207041098155699e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 240 + }, + { + "completion_length": 221.54167683919272, + "epoch": 8.03370786516854, + "grad_norm": 0.00024167027287265238, + "kl": 0.21525065104166666, + "learning_rate": 8.191485031887404e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 241 + }, + { + "completion_length": 186.81945037841797, + "epoch": 8.067415730337078, + "grad_norm": 0.00022654805321569352, + "kl": 0.2779134114583333, + "learning_rate": 8.175876657570143e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 242 + }, + { + "completion_length": 231.1875114440918, + "epoch": 8.101123595505618, + "grad_norm": 0.00026861541661596264, + "kl": 0.227783203125, + "learning_rate": 8.160216231023257e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 243 + }, + { + "completion_length": 217.84028816223145, + "epoch": 8.134831460674157, + "grad_norm": 0.08926755229091893, + "kl": 0.22469075520833334, + "learning_rate": 8.144504008919222e-07, + "loss": 0.0, + "reward": 0.252777802447478, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778605620065, + "step": 244 + }, + { + "completion_length": 136.2777837117513, + "epoch": 8.168539325842696, + "grad_norm": 0.17137115615213414, + "kl": 0.3009440104166667, + "learning_rate": 8.128740248779435e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 245 + }, + { + "completion_length": 189.50695037841797, + "epoch": 8.202247191011235, + "grad_norm": 0.0002118541249508326, + "kl": 0.2831217447916667, + "learning_rate": 8.112925208969994e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 246 + }, + { + "completion_length": 140.9444491068522, + "epoch": 8.235955056179776, + "grad_norm": 0.0033735013369370205, + "kl": 0.3704427083333333, + "learning_rate": 8.097059148697467e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 247 + }, + { + "completion_length": 207.68055979410806, + "epoch": 8.269662921348315, + "grad_norm": 0.0004275483339120829, + "kl": 0.2508138020833333, + "learning_rate": 8.081142328004636e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 248 + }, + { + "completion_length": 180.4236157735189, + "epoch": 8.303370786516854, + "grad_norm": 0.1482240896052723, + "kl": 0.24837239583333334, + "learning_rate": 8.065175007766247e-07, + "loss": 0.0, + "reward": 0.2597222526868184, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 249 + }, + { + "completion_length": 172.2638931274414, + "epoch": 8.337078651685394, + "grad_norm": 0.13794200971106416, + "kl": 0.23258463541666666, + "learning_rate": 8.049157449684722e-07, + "loss": 0.0, + "reward": 0.3708333522081375, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333432674408, + "step": 250 + }, + { + "completion_length": 175.2013931274414, + "epoch": 8.370786516853933, + "grad_norm": 0.00032142804398902695, + "kl": 0.23974609375, + "learning_rate": 8.03308991628588e-07, + "loss": 0.0, + "reward": 0.16250001390775046, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 251 + }, + { + "completion_length": 198.30555979410806, + "epoch": 8.404494382022472, + "grad_norm": 0.20697734730837425, + "kl": 0.2720540364583333, + "learning_rate": 8.016972670914623e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 252 + }, + { + "completion_length": 183.52083841959634, + "epoch": 8.438202247191011, + "grad_norm": 0.05454961820929964, + "kl": 0.239501953125, + "learning_rate": 8.000805977730631e-07, + "loss": 0.0, + "reward": 0.19722223778565726, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0972222238779068, + "step": 253 + }, + { + "completion_length": 164.64583587646484, + "epoch": 8.47191011235955, + "grad_norm": 0.00016798374467291786, + "kl": 0.24951171875, + "learning_rate": 7.984590101704025e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 254 + }, + { + "completion_length": 175.21528116861978, + "epoch": 8.50561797752809, + "grad_norm": 0.0008589186632415992, + "kl": 0.2859700520833333, + "learning_rate": 7.96832530861103e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 255 + }, + { + "completion_length": 190.94445037841797, + "epoch": 8.539325842696629, + "grad_norm": 0.0009915163884001233, + "kl": 0.2609049479166667, + "learning_rate": 7.952011865029613e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 256 + }, + { + "completion_length": 205.9652837117513, + "epoch": 8.573033707865168, + "grad_norm": 0.0010250230631537972, + "kl": 0.276611328125, + "learning_rate": 7.935650038335117e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 257 + }, + { + "completion_length": 162.49305979410806, + "epoch": 8.606741573033707, + "grad_norm": 0.000693181587383422, + "kl": 0.22981770833333334, + "learning_rate": 7.919240096695876e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 258 + }, + { + "completion_length": 196.1319491068522, + "epoch": 8.640449438202246, + "grad_norm": 0.0003241887822054065, + "kl": 0.3126627604166667, + "learning_rate": 7.902782309068829e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 259 + }, + { + "completion_length": 193.06250508626303, + "epoch": 8.674157303370787, + "grad_norm": 0.0008168413550598498, + "kl": 0.22224934895833334, + "learning_rate": 7.886276945195097e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 260 + }, + { + "completion_length": 160.02778244018555, + "epoch": 8.707865168539326, + "grad_norm": 0.027372591721328443, + "kl": 0.2552083333333333, + "learning_rate": 7.869724275595575e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 261 + }, + { + "completion_length": 214.47222391764322, + "epoch": 8.741573033707866, + "grad_norm": 0.0003468074976912508, + "kl": 0.22184244791666666, + "learning_rate": 7.853124571566491e-07, + "loss": 0.0, + "reward": 0.3152778074145317, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778108914694, + "step": 262 + }, + { + "completion_length": 157.0416717529297, + "epoch": 8.775280898876405, + "grad_norm": 0.08514453031222587, + "kl": 0.3087565104166667, + "learning_rate": 7.83647810517496e-07, + "loss": 0.0, + "reward": 0.23194446663061777, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1319444477558136, + "step": 263 + }, + { + "completion_length": 203.34723027547201, + "epoch": 8.808988764044944, + "grad_norm": 0.00028914506798194975, + "kl": 0.19441731770833334, + "learning_rate": 7.819785149254532e-07, + "loss": 0.0, + "reward": 0.26597224920988083, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 264 + }, + { + "completion_length": 215.6041717529297, + "epoch": 8.842696629213483, + "grad_norm": 0.03283764607423548, + "kl": 0.22737630208333334, + "learning_rate": 7.803045977400708e-07, + "loss": 0.0, + "reward": 0.16250001142422357, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0625, + "step": 265 + }, + { + "completion_length": 229.3125089009603, + "epoch": 8.876404494382022, + "grad_norm": 0.001029929532802994, + "kl": 0.22892252604166666, + "learning_rate": 7.786260863966467e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 266 + }, + { + "completion_length": 223.4652837117513, + "epoch": 8.910112359550562, + "grad_norm": 0.3205202192459917, + "kl": 0.21378580729166666, + "learning_rate": 7.769430084057763e-07, + "loss": 0.0, + "reward": 0.12777779251337051, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.027777778605620067, + "step": 267 + }, + { + "completion_length": 168.00000635782877, + "epoch": 8.9438202247191, + "grad_norm": 0.11628693186020611, + "kl": 0.263671875, + "learning_rate": 7.752553913529018e-07, + "loss": 0.0, + "reward": 0.2250000163912773, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1250000037252903, + "step": 268 + }, + { + "completion_length": 152.41667048136392, + "epoch": 8.97752808988764, + "grad_norm": 0.15146115390520254, + "kl": 0.3439127604166667, + "learning_rate": 7.7356326289786e-07, + "loss": 0.0, + "reward": 0.3708333522081375, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333432674408, + "step": 269 + }, + { + "completion_length": 106.5, + "epoch": 9.0, + "grad_norm": 0.0003309017022495485, + "kl": 0.727294921875, + "learning_rate": 7.718666507744292e-07, + "loss": 0.0, + "reward": 0.600000012665987, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.5, + "step": 270 + }, + { + "completion_length": 214.97222646077475, + "epoch": 9.03370786516854, + "grad_norm": 0.00019305821444591339, + "kl": 0.2587076822916667, + "learning_rate": 7.701655827898746e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 271 + }, + { + "completion_length": 217.1388905843099, + "epoch": 9.067415730337078, + "grad_norm": 0.0003009233811166216, + "kl": 0.251953125, + "learning_rate": 7.684600868244919e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 272 + }, + { + "completion_length": 187.6805623372396, + "epoch": 9.101123595505618, + "grad_norm": 0.0002515290133301996, + "kl": 0.3336588541666667, + "learning_rate": 7.667501908311514e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 273 + }, + { + "completion_length": 182.5763931274414, + "epoch": 9.134831460674157, + "grad_norm": 0.0006250246822807841, + "kl": 0.29150390625, + "learning_rate": 7.650359228348389e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 274 + }, + { + "completion_length": 202.16667302449545, + "epoch": 9.168539325842696, + "grad_norm": 0.15883579287900926, + "kl": 0.2608235677083333, + "learning_rate": 7.633173109321973e-07, + "loss": 0.0, + "reward": 0.19722223778565726, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0972222238779068, + "step": 275 + }, + { + "completion_length": 227.27083841959634, + "epoch": 9.202247191011235, + "grad_norm": 0.003732477372645033, + "kl": 0.1943359375, + "learning_rate": 7.61594383291065e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 276 + }, + { + "completion_length": 176.84722518920898, + "epoch": 9.235955056179776, + "grad_norm": 0.00022363495127753627, + "kl": 0.278564453125, + "learning_rate": 7.598671681500153e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 277 + }, + { + "completion_length": 233.0000025431315, + "epoch": 9.269662921348315, + "grad_norm": 0.0005376187767677384, + "kl": 0.23844401041666666, + "learning_rate": 7.581356938178929e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 278 + }, + { + "completion_length": 223.97222900390625, + "epoch": 9.303370786516854, + "grad_norm": 0.03440186400251449, + "kl": 0.2809244791666667, + "learning_rate": 7.563999886733499e-07, + "loss": 0.0, + "reward": 0.15486111988623938, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 279 + }, + { + "completion_length": 153.57639185587564, + "epoch": 9.337078651685394, + "grad_norm": 0.1996672974939433, + "kl": 0.3518880208333333, + "learning_rate": 7.546600811643816e-07, + "loss": 0.0, + "reward": 0.2527777974804242, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 280 + }, + { + "completion_length": 193.23611450195312, + "epoch": 9.370786516853933, + "grad_norm": 0.1118588421931556, + "kl": 0.25, + "learning_rate": 7.529159998078585e-07, + "loss": 0.0, + "reward": 0.1833333522081375, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0833333358168602, + "step": 281 + }, + { + "completion_length": 160.61111450195312, + "epoch": 9.404494382022472, + "grad_norm": 0.07685147824751054, + "kl": 0.3004557291666667, + "learning_rate": 7.51167773189061e-07, + "loss": 0.0, + "reward": 0.19027779002984366, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09027778108914693, + "step": 282 + }, + { + "completion_length": 230.34723409016928, + "epoch": 9.438202247191011, + "grad_norm": 0.030619903495342302, + "kl": 0.23453776041666666, + "learning_rate": 7.49415429961209e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 283 + }, + { + "completion_length": 182.0694491068522, + "epoch": 9.47191011235955, + "grad_norm": 0.05428301379216778, + "kl": 0.3636067708333333, + "learning_rate": 7.476589988449938e-07, + "loss": 0.0, + "reward": 0.09861111889282863, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.0, + "step": 284 + }, + { + "completion_length": 160.13195037841797, + "epoch": 9.50561797752809, + "grad_norm": 0.06402596699127762, + "kl": 0.2997233072916667, + "learning_rate": 7.45898508628106e-07, + "loss": 0.0, + "reward": 0.4750000536441803, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3750000149011612, + "step": 285 + }, + { + "completion_length": 198.4027837117513, + "epoch": 9.539325842696629, + "grad_norm": 0.00046257900496985105, + "kl": 0.3580729166666667, + "learning_rate": 7.441339881647652e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 286 + }, + { + "completion_length": 161.2083371480306, + "epoch": 9.573033707865168, + "grad_norm": 0.00024986955738330905, + "kl": 0.2903645833333333, + "learning_rate": 7.423654663752454e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 287 + }, + { + "completion_length": 203.44445037841797, + "epoch": 9.606741573033707, + "grad_norm": 0.10317623994263957, + "kl": 0.26513671875, + "learning_rate": 7.405929722454025e-07, + "loss": 0.0, + "reward": 0.280555581053098, + "reward_std": 0.055412920812765755, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.180555559694767, + "step": 288 + }, + { + "completion_length": 173.83333587646484, + "epoch": 9.640449438202246, + "grad_norm": 0.059810479233788266, + "kl": 0.3313802083333333, + "learning_rate": 7.388165348261981e-07, + "loss": 0.0, + "reward": 0.3152778123815854, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778605620065, + "step": 289 + }, + { + "completion_length": 198.1041717529297, + "epoch": 9.674157303370787, + "grad_norm": 0.018668904801476878, + "kl": 0.25, + "learning_rate": 7.37036183233224e-07, + "loss": 0.0, + "reward": 0.3215278039375941, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 290 + }, + { + "completion_length": 202.40278116861978, + "epoch": 9.707865168539326, + "grad_norm": 0.22603773921898057, + "kl": 0.2625325520833333, + "learning_rate": 7.35251946646225e-07, + "loss": 0.0, + "reward": 0.19722224523623785, + "reward_std": 0.07809028153618176, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09722222636143367, + "step": 291 + }, + { + "completion_length": 173.43056360880533, + "epoch": 9.741573033707866, + "grad_norm": 0.2537084810535681, + "kl": 0.3806966145833333, + "learning_rate": 7.334638543086203e-07, + "loss": 0.0, + "reward": 0.2875000238418579, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.18750000496705374, + "step": 292 + }, + { + "completion_length": 178.2500025431315, + "epoch": 9.775280898876405, + "grad_norm": 0.00020754615104715004, + "kl": 0.29736328125, + "learning_rate": 7.316719355270243e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 293 + }, + { + "completion_length": 202.9930648803711, + "epoch": 9.808988764044944, + "grad_norm": 0.05716436006560752, + "kl": 0.21427408854166666, + "learning_rate": 7.298762196707668e-07, + "loss": 0.0, + "reward": 0.1548611248532931, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 294 + }, + { + "completion_length": 163.88194783528647, + "epoch": 9.842696629213483, + "grad_norm": 0.06242834589205222, + "kl": 0.3356119791666667, + "learning_rate": 7.280767361714102e-07, + "loss": 0.0, + "reward": 0.13472223033507666, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 295 + }, + { + "completion_length": 263.92362213134766, + "epoch": 9.876404494382022, + "grad_norm": 0.0001561019934670225, + "kl": 0.23771158854166666, + "learning_rate": 7.262735145222695e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 296 + }, + { + "completion_length": 165.6041692097982, + "epoch": 9.910112359550562, + "grad_norm": 0.00025071736674542154, + "kl": 0.3185221354166667, + "learning_rate": 7.244665842779261e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 297 + }, + { + "completion_length": 160.10417111714682, + "epoch": 9.9438202247191, + "grad_norm": 0.0004267321607742576, + "kl": 0.38671875, + "learning_rate": 7.226559750537461e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 298 + }, + { + "completion_length": 175.75694783528647, + "epoch": 9.97752808988764, + "grad_norm": 0.0003379033997547813, + "kl": 0.2975260416666667, + "learning_rate": 7.208417165253927e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 299 + }, + { + "completion_length": 150.5, + "epoch": 10.0, + "grad_norm": 0.0003379033997547813, + "kl": 0.31103515625, + "learning_rate": 7.190238384283412e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 300 + }, + { + "completion_length": 180.3125025431315, + "epoch": 10.03370786516854, + "grad_norm": 0.0016189387942331542, + "kl": 0.2853190104166667, + "learning_rate": 7.172023705573909e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 301 + }, + { + "completion_length": 174.12500699361166, + "epoch": 10.067415730337078, + "grad_norm": 0.0005250389124759635, + "kl": 0.3736979166666667, + "learning_rate": 7.153773427661773e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 302 + }, + { + "completion_length": 189.65972900390625, + "epoch": 10.101123595505618, + "grad_norm": 0.0007006562513204762, + "kl": 0.32421875, + "learning_rate": 7.135487849666826e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 303 + }, + { + "completion_length": 217.2986183166504, + "epoch": 10.134831460674157, + "grad_norm": 0.0003028785166100476, + "kl": 0.279296875, + "learning_rate": 7.117167271287452e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 304 + }, + { + "completion_length": 242.63195037841797, + "epoch": 10.168539325842696, + "grad_norm": 0.0007931948131115146, + "kl": 0.227294921875, + "learning_rate": 7.098811992795686e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 305 + }, + { + "completion_length": 191.58333587646484, + "epoch": 10.202247191011235, + "grad_norm": 0.00027105955882152435, + "kl": 0.2613932291666667, + "learning_rate": 7.080422315032297e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 306 + }, + { + "completion_length": 153.79861450195312, + "epoch": 10.235955056179776, + "grad_norm": 0.0011095326537554626, + "kl": 0.3759765625, + "learning_rate": 7.061998539401853e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 307 + }, + { + "completion_length": 109.19444910685222, + "epoch": 10.269662921348315, + "grad_norm": 0.000787854226760077, + "kl": 0.5485026041666666, + "learning_rate": 7.043540967867781e-07, + "loss": 0.0001, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 308 + }, + { + "completion_length": 146.34722773234049, + "epoch": 10.303370786516854, + "grad_norm": 0.33537569611972995, + "kl": 0.39013671875, + "learning_rate": 7.025049902947418e-07, + "loss": 0.0, + "reward": 0.3708333646257718, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333432674408, + "step": 309 + }, + { + "completion_length": 187.21528116861978, + "epoch": 10.337078651685394, + "grad_norm": 0.0004193171668726079, + "kl": 0.3624674479166667, + "learning_rate": 7.006525647707053e-07, + "loss": 0.0, + "reward": 0.2527777974804242, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 310 + }, + { + "completion_length": 206.9166717529297, + "epoch": 10.370786516853933, + "grad_norm": 0.2554434536716341, + "kl": 0.2158203125, + "learning_rate": 6.987968505756965e-07, + "loss": 0.0, + "reward": 0.22500002135833105, + "reward_std": 0.0689794048666954, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1250000074505806, + "step": 311 + }, + { + "completion_length": 160.18750508626303, + "epoch": 10.404494382022472, + "grad_norm": 0.03090523836420807, + "kl": 0.3173828125, + "learning_rate": 6.969378781246436e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 312 + }, + { + "completion_length": 233.04167683919272, + "epoch": 10.438202247191011, + "grad_norm": 0.001003307225434264, + "kl": 0.24641927083333334, + "learning_rate": 6.950756778858777e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 313 + }, + { + "completion_length": 178.34028498331705, + "epoch": 10.47191011235955, + "grad_norm": 0.000714097789198713, + "kl": 0.3426106770833333, + "learning_rate": 6.932102803806324e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 314 + }, + { + "completion_length": 141.40277989705405, + "epoch": 10.50561797752809, + "grad_norm": 0.0003119191393154227, + "kl": 0.4718424479166667, + "learning_rate": 6.913417161825449e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 315 + }, + { + "completion_length": 219.12500762939453, + "epoch": 10.539325842696629, + "grad_norm": 0.020248614491029743, + "kl": 0.2689615885416667, + "learning_rate": 6.894700159171534e-07, + "loss": 0.0, + "reward": 0.13263890395561853, + "reward_std": 0.031628007690111794, + "rewards/format_reward_func": 0.09791667386889458, + "rewards/solution_reward_func": 0.0347222238779068, + "step": 316 + }, + { + "completion_length": 170.86806106567383, + "epoch": 10.573033707865168, + "grad_norm": 0.3485360389015691, + "kl": 0.3232421875, + "learning_rate": 6.87595210261396e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 317 + }, + { + "completion_length": 177.3541742960612, + "epoch": 10.606741573033707, + "grad_norm": 0.00031775385864263686, + "kl": 0.2933756510416667, + "learning_rate": 6.857173299431083e-07, + "loss": 0.0, + "reward": 0.16250001390775046, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06250000186264515, + "step": 318 + }, + { + "completion_length": 184.98611958821616, + "epoch": 10.640449438202246, + "grad_norm": 0.07606007580915988, + "kl": 0.4265950520833333, + "learning_rate": 6.838364057405183e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 319 + }, + { + "completion_length": 169.7708371480306, + "epoch": 10.674157303370787, + "grad_norm": 0.001793799039232961, + "kl": 0.2845052083333333, + "learning_rate": 6.819524684817438e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 320 + }, + { + "completion_length": 156.34722646077475, + "epoch": 10.707865168539326, + "grad_norm": 0.00021374300291719347, + "kl": 0.3289388020833333, + "learning_rate": 6.800655490442856e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 321 + }, + { + "completion_length": 157.9305623372396, + "epoch": 10.741573033707866, + "grad_norm": 0.0002561478992467003, + "kl": 0.3733723958333333, + "learning_rate": 6.781756783545224e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 322 + }, + { + "completion_length": 219.3888931274414, + "epoch": 10.775280898876405, + "grad_norm": 0.00030228911195122224, + "kl": 0.2770182291666667, + "learning_rate": 6.762828873872035e-07, + "loss": 0.0, + "reward": 0.2097222457329432, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 323 + }, + { + "completion_length": 216.47222900390625, + "epoch": 10.808988764044944, + "grad_norm": 0.02845898506769314, + "kl": 0.23014322916666666, + "learning_rate": 6.743872071649411e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 324 + }, + { + "completion_length": 162.88889439900717, + "epoch": 10.842696629213483, + "grad_norm": 0.10338962215221022, + "kl": 0.3343098958333333, + "learning_rate": 6.72488668757702e-07, + "loss": 0.0, + "reward": 0.24305557583769163, + "reward_std": 0.03172230130682389, + "rewards/format_reward_func": 0.09722223008672397, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 325 + }, + { + "completion_length": 182.50694529215494, + "epoch": 10.876404494382022, + "grad_norm": 0.06574508171916076, + "kl": 0.3323567708333333, + "learning_rate": 6.70587303282298e-07, + "loss": 0.0, + "reward": 0.26527778555949527, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 326 + }, + { + "completion_length": 164.2013931274414, + "epoch": 10.910112359550562, + "grad_norm": 0.0005689084346328859, + "kl": 0.3707682291666667, + "learning_rate": 6.686831419018768e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 327 + }, + { + "completion_length": 262.82640075683594, + "epoch": 10.9438202247191, + "grad_norm": 0.0002872818587467845, + "kl": 0.2627766927083333, + "learning_rate": 6.667762158254103e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 328 + }, + { + "completion_length": 205.06250762939453, + "epoch": 10.97752808988764, + "grad_norm": 0.0002750206564001014, + "kl": 0.30615234375, + "learning_rate": 6.648665563071833e-07, + "loss": 0.0, + "reward": 0.3777778223156929, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 329 + }, + { + "completion_length": 103.25, + "epoch": 11.0, + "grad_norm": 0.001409618382547904, + "kl": 0.609130859375, + "learning_rate": 6.629541946462816e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 330 + }, + { + "completion_length": 210.44445037841797, + "epoch": 11.03370786516854, + "grad_norm": 0.0003136007528342489, + "kl": 0.2693684895833333, + "learning_rate": 6.610391621860785e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 331 + }, + { + "completion_length": 128.63194783528647, + "epoch": 11.067415730337078, + "grad_norm": 0.0002543068100767582, + "kl": 0.3797200520833333, + "learning_rate": 6.59121490313722e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 332 + }, + { + "completion_length": 197.47222900390625, + "epoch": 11.101123595505618, + "grad_norm": 0.00026242740543890773, + "kl": 0.3154296875, + "learning_rate": 6.572012104596189e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 333 + }, + { + "completion_length": 189.09722900390625, + "epoch": 11.134831460674157, + "grad_norm": 0.054020887122492936, + "kl": 0.2888997395833333, + "learning_rate": 6.552783540969211e-07, + "loss": 0.0, + "reward": 0.2013888992369175, + "reward_std": 0.022611424637337525, + "rewards/format_reward_func": 0.09722223008672397, + "rewards/solution_reward_func": 0.10416666666666667, + "step": 334 + }, + { + "completion_length": 184.1666742960612, + "epoch": 11.168539325842696, + "grad_norm": 0.0012428623378884169, + "kl": 0.3077799479166667, + "learning_rate": 6.533529527410091e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 335 + }, + { + "completion_length": 135.5486157735189, + "epoch": 11.202247191011235, + "grad_norm": 0.034949986552955906, + "kl": 0.4228515625, + "learning_rate": 6.514250379489753e-07, + "loss": 0.0, + "reward": 0.2527777974804242, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 336 + }, + { + "completion_length": 164.56250762939453, + "epoch": 11.235955056179776, + "grad_norm": 0.01953164885507744, + "kl": 0.3321940104166667, + "learning_rate": 6.494946413191071e-07, + "loss": 0.0, + "reward": 0.15486111988623938, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 337 + }, + { + "completion_length": 206.36805979410806, + "epoch": 11.269662921348315, + "grad_norm": 0.00021287037641538507, + "kl": 0.2664388020833333, + "learning_rate": 6.475617944903691e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 338 + }, + { + "completion_length": 198.1388905843099, + "epoch": 11.303370786516854, + "grad_norm": 0.058144014428164435, + "kl": 0.2600911458333333, + "learning_rate": 6.456265291418841e-07, + "loss": 0.0, + "reward": 0.23194446166356406, + "reward_std": 0.05446995794773102, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.13194445272286734, + "step": 339 + }, + { + "completion_length": 198.59722900390625, + "epoch": 11.337078651685394, + "grad_norm": 0.1713625494543494, + "kl": 0.2859700520833333, + "learning_rate": 6.436888769924141e-07, + "loss": 0.0, + "reward": 0.1548611248532931, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 340 + }, + { + "completion_length": 129.09027989705405, + "epoch": 11.370786516853933, + "grad_norm": 0.0004273841969510614, + "kl": 0.4593098958333333, + "learning_rate": 6.417488697998408e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 341 + }, + { + "completion_length": 179.3611157735189, + "epoch": 11.404494382022472, + "grad_norm": 0.043928870917432215, + "kl": 0.3538411458333333, + "learning_rate": 6.398065393606444e-07, + "loss": 0.0, + "reward": 0.1548611248532931, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 342 + }, + { + "completion_length": 170.64583841959634, + "epoch": 11.438202247191011, + "grad_norm": 0.0002525016450016241, + "kl": 0.4108072916666667, + "learning_rate": 6.378619175093834e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 343 + }, + { + "completion_length": 186.02083841959634, + "epoch": 11.47191011235955, + "grad_norm": 0.05915691457559526, + "kl": 0.3147786458333333, + "learning_rate": 6.359150361181714e-07, + "loss": 0.0, + "reward": 0.16944445669651031, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.06944444651405017, + "step": 344 + }, + { + "completion_length": 131.21528116861978, + "epoch": 11.50561797752809, + "grad_norm": 0.0004463548678461368, + "kl": 0.5196940104166666, + "learning_rate": 6.339659270961565e-07, + "loss": 0.0001, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 345 + }, + { + "completion_length": 207.62500127156576, + "epoch": 11.539325842696629, + "grad_norm": 0.0005304094093283811, + "kl": 0.2530924479166667, + "learning_rate": 6.320146223889965e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 346 + }, + { + "completion_length": 173.11805852254233, + "epoch": 11.573033707865168, + "grad_norm": 0.14991673693265567, + "kl": 0.3543294270833333, + "learning_rate": 6.300611539783372e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 347 + }, + { + "completion_length": 132.13888931274414, + "epoch": 11.606741573033707, + "grad_norm": 0.0005385348710833941, + "kl": 0.3854166666666667, + "learning_rate": 6.281055538812861e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 348 + }, + { + "completion_length": 183.73611704508463, + "epoch": 11.640449438202246, + "grad_norm": 0.0012610209756596285, + "kl": 0.3517252604166667, + "learning_rate": 6.261478541498899e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 349 + }, + { + "completion_length": 208.39583841959634, + "epoch": 11.674157303370787, + "grad_norm": 0.047741972814235506, + "kl": 0.24983723958333334, + "learning_rate": 6.241880868706074e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 350 + }, + { + "completion_length": 197.56250762939453, + "epoch": 11.707865168539326, + "grad_norm": 0.00013577663203044108, + "kl": 0.21354166666666666, + "learning_rate": 6.222262841637843e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 351 + }, + { + "completion_length": 177.14584096272787, + "epoch": 11.741573033707866, + "grad_norm": 0.0008694906490758783, + "kl": 0.3067220052083333, + "learning_rate": 6.202624781831268e-07, + "loss": 0.0, + "reward": 0.2527777999639511, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778605620065, + "step": 352 + }, + { + "completion_length": 183.9583396911621, + "epoch": 11.775280898876405, + "grad_norm": 0.10608509472791557, + "kl": 0.357421875, + "learning_rate": 6.182967011151745e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 353 + }, + { + "completion_length": 184.88195292154947, + "epoch": 11.808988764044944, + "grad_norm": 0.0014986183716396595, + "kl": 0.3017578125, + "learning_rate": 6.163289851787731e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 354 + }, + { + "completion_length": 189.22222646077475, + "epoch": 11.842696629213483, + "grad_norm": 0.0005517772248671651, + "kl": 0.22713216145833334, + "learning_rate": 6.143593626245455e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 355 + }, + { + "completion_length": 216.89584096272787, + "epoch": 11.876404494382022, + "grad_norm": 0.0004964556739884272, + "kl": 0.2972819010416667, + "learning_rate": 6.123878657343647e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 356 + }, + { + "completion_length": 122.3819465637207, + "epoch": 11.910112359550562, + "grad_norm": 0.0009579948324521336, + "kl": 0.3916015625, + "learning_rate": 6.104145268208232e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 357 + }, + { + "completion_length": 167.1041717529297, + "epoch": 11.9438202247191, + "grad_norm": 0.0005563995057600177, + "kl": 0.3909505208333333, + "learning_rate": 6.084393782267039e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 358 + }, + { + "completion_length": 165.38195037841797, + "epoch": 11.97752808988764, + "grad_norm": 0.0008336815063265414, + "kl": 0.3528645833333333, + "learning_rate": 6.064624523244509e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 359 + }, + { + "completion_length": 144.5, + "epoch": 12.0, + "grad_norm": 0.0003010049080704611, + "kl": 0.404541015625, + "learning_rate": 6.044837815156376e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 360 + }, + { + "completion_length": 167.2916692097982, + "epoch": 12.03370786516854, + "grad_norm": 0.0003820551423681328, + "kl": 0.2529296875, + "learning_rate": 6.025033982304362e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 361 + }, + { + "completion_length": 204.15972773234049, + "epoch": 12.067415730337078, + "grad_norm": 0.16183028859414833, + "kl": 0.255859375, + "learning_rate": 6.005213349270864e-07, + "loss": 0.0, + "reward": 0.3152778049310048, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778605620065, + "step": 362 + }, + { + "completion_length": 169.59028498331705, + "epoch": 12.101123595505618, + "grad_norm": 0.0003504711153107916, + "kl": 0.3050944010416667, + "learning_rate": 5.98537624091363e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 363 + }, + { + "completion_length": 176.46528498331705, + "epoch": 12.134831460674157, + "grad_norm": 0.00043722908360364806, + "kl": 0.2932942708333333, + "learning_rate": 5.96552298236044e-07, + "loss": 0.0, + "reward": 0.4333333770434062, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3333333383003871, + "step": 364 + }, + { + "completion_length": 235.55555852254233, + "epoch": 12.168539325842696, + "grad_norm": 1.5878187441811469, + "kl": 2.86767578125, + "learning_rate": 5.945653899003768e-07, + "loss": 0.0003, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 365 + }, + { + "completion_length": 214.61111704508463, + "epoch": 12.202247191011235, + "grad_norm": 0.0010650119110943777, + "kl": 0.323974609375, + "learning_rate": 5.925769316495461e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 366 + }, + { + "completion_length": 157.31944783528647, + "epoch": 12.235955056179776, + "grad_norm": 0.059812278676286815, + "kl": 0.2989908854166667, + "learning_rate": 5.905869560741388e-07, + "loss": 0.0, + "reward": 0.4194444740811984, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3194444527228673, + "step": 367 + }, + { + "completion_length": 162.45139439900717, + "epoch": 12.269662921348315, + "grad_norm": 0.07123413489312279, + "kl": 0.271484375, + "learning_rate": 5.885954957896115e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.028752731780211132, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 368 + }, + { + "completion_length": 151.49305979410806, + "epoch": 12.303370786516854, + "grad_norm": 0.05210555922826469, + "kl": 0.4189453125, + "learning_rate": 5.86602583435754e-07, + "loss": 0.0, + "reward": 0.3152778123815854, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778605620065, + "step": 369 + }, + { + "completion_length": 129.15278244018555, + "epoch": 12.337078651685394, + "grad_norm": 0.00031598493630194624, + "kl": 0.3564453125, + "learning_rate": 5.846082516761557e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 370 + }, + { + "completion_length": 205.19445164998373, + "epoch": 12.370786516853933, + "grad_norm": 0.12491315821957581, + "kl": 0.19327799479166666, + "learning_rate": 5.826125331976707e-07, + "loss": 0.0, + "reward": 0.1972222402691841, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09722222884496053, + "step": 371 + }, + { + "completion_length": 169.81250508626303, + "epoch": 12.404494382022472, + "grad_norm": 0.0001668630376936716, + "kl": 0.3038736979166667, + "learning_rate": 5.806154607098799e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 372 + }, + { + "completion_length": 163.11111895243326, + "epoch": 12.438202247191011, + "grad_norm": 0.023647595004305768, + "kl": 0.3009440104166667, + "learning_rate": 5.786170669445572e-07, + "loss": 0.0, + "reward": 0.3083333447575569, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.20833333830038706, + "step": 373 + }, + { + "completion_length": 251.25695037841797, + "epoch": 12.47191011235955, + "grad_norm": 0.0006332197928937429, + "kl": 0.1669921875, + "learning_rate": 5.766173846551316e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 374 + }, + { + "completion_length": 159.04167048136392, + "epoch": 12.50561797752809, + "grad_norm": 0.0002703778763178631, + "kl": 0.3225911458333333, + "learning_rate": 5.746164466161511e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 375 + }, + { + "completion_length": 207.15973027547201, + "epoch": 12.539325842696629, + "grad_norm": 0.0010034252057769405, + "kl": 0.2862955729166667, + "learning_rate": 5.726142856227452e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 376 + }, + { + "completion_length": 193.02778116861978, + "epoch": 12.573033707865168, + "grad_norm": 0.00021634249958789418, + "kl": 0.18269856770833334, + "learning_rate": 5.706109344900874e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 377 + }, + { + "completion_length": 234.78472518920898, + "epoch": 12.606741573033707, + "grad_norm": 0.23589228490309555, + "kl": 0.17146809895833334, + "learning_rate": 5.686064260528577e-07, + "loss": 0.0, + "reward": 0.25138890991608304, + "reward_std": 0.02828894866009553, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 378 + }, + { + "completion_length": 197.90279006958008, + "epoch": 12.640449438202246, + "grad_norm": 0.00020646880932808422, + "kl": 0.279541015625, + "learning_rate": 5.666007931647038e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 379 + }, + { + "completion_length": 206.31945292154947, + "epoch": 12.674157303370787, + "grad_norm": 0.00023121936429887455, + "kl": 0.2808430989583333, + "learning_rate": 5.645940686977032e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 380 + }, + { + "completion_length": 214.84722900390625, + "epoch": 12.707865168539326, + "grad_norm": 0.0007008980643213043, + "kl": 0.217041015625, + "learning_rate": 5.625862855418245e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 381 + }, + { + "completion_length": 165.94445037841797, + "epoch": 12.741573033707866, + "grad_norm": 0.00024405733730170179, + "kl": 0.23885091145833334, + "learning_rate": 5.605774766043873e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 382 + }, + { + "completion_length": 193.60417556762695, + "epoch": 12.775280898876405, + "grad_norm": 0.026077748042351946, + "kl": 0.2490234375, + "learning_rate": 5.585676748095248e-07, + "loss": 0.0, + "reward": 0.09930556515852611, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0, + "step": 383 + }, + { + "completion_length": 199.88195037841797, + "epoch": 12.808988764044944, + "grad_norm": 0.0015764900618587335, + "kl": 0.195556640625, + "learning_rate": 5.565569130976422e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 384 + }, + { + "completion_length": 190.86111958821616, + "epoch": 12.842696629213483, + "grad_norm": 0.0006664177114999611, + "kl": 0.243896484375, + "learning_rate": 5.545452244248774e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 385 + }, + { + "completion_length": 220.31945292154947, + "epoch": 12.876404494382022, + "grad_norm": 0.1998384017913336, + "kl": 0.21940104166666666, + "learning_rate": 5.52532641762562e-07, + "loss": 0.0, + "reward": 0.24583335469166437, + "reward_std": 0.04535908127824465, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 386 + }, + { + "completion_length": 188.4305623372396, + "epoch": 12.910112359550562, + "grad_norm": 0.00031986136651065007, + "kl": 0.250244140625, + "learning_rate": 5.50519198096679e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 387 + }, + { + "completion_length": 204.5763931274414, + "epoch": 12.9438202247191, + "grad_norm": 0.0005878538805575539, + "kl": 0.21875, + "learning_rate": 5.485049264273241e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 388 + }, + { + "completion_length": 185.63194783528647, + "epoch": 12.97752808988764, + "grad_norm": 0.07660154975533078, + "kl": 0.2594401041666667, + "learning_rate": 5.464898597681629e-07, + "loss": 0.0, + "reward": 0.3180555800596873, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09583334128061931, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 389 + }, + { + "completion_length": 178.5, + "epoch": 13.0, + "grad_norm": 0.07660154975533078, + "kl": 0.2437744140625, + "learning_rate": 5.444740311458914e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 390 + }, + { + "completion_length": 173.43056106567383, + "epoch": 13.03370786516854, + "grad_norm": 0.08943733591463003, + "kl": 0.2587890625, + "learning_rate": 5.42457473599694e-07, + "loss": 0.0, + "reward": 0.2041666880249977, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.10416666666666667, + "step": 391 + }, + { + "completion_length": 189.65972900390625, + "epoch": 13.067415730337078, + "grad_norm": 0.06642566712861118, + "kl": 0.19498697916666666, + "learning_rate": 5.404402201807021e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 392 + }, + { + "completion_length": 137.9444491068522, + "epoch": 13.101123595505618, + "grad_norm": 0.0005772899793395543, + "kl": 0.38671875, + "learning_rate": 5.384223039514521e-07, + "loss": 0.0, + "reward": 0.3638889243205388, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2638888955116272, + "step": 393 + }, + { + "completion_length": 174.15972646077475, + "epoch": 13.134831460674157, + "grad_norm": 0.23548617007723474, + "kl": 0.4768880208333333, + "learning_rate": 5.364037579853439e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 394 + }, + { + "completion_length": 217.65972773234049, + "epoch": 13.168539325842696, + "grad_norm": 0.0038889022855124423, + "kl": 0.19620768229166666, + "learning_rate": 5.343846153660991e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 395 + }, + { + "completion_length": 165.89584096272787, + "epoch": 13.202247191011235, + "grad_norm": 0.0012739658159539797, + "kl": 0.3113606770833333, + "learning_rate": 5.323649091872178e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 396 + }, + { + "completion_length": 228.5138905843099, + "epoch": 13.235955056179776, + "grad_norm": 0.00024036705147910484, + "kl": 0.17220052083333334, + "learning_rate": 5.303446725514371e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 397 + }, + { + "completion_length": 184.52083841959634, + "epoch": 13.269662921348315, + "grad_norm": 0.0003824892371598811, + "kl": 0.3045247395833333, + "learning_rate": 5.283239385701881e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 398 + }, + { + "completion_length": 219.70834096272787, + "epoch": 13.303370786516854, + "grad_norm": 0.00014916228668265585, + "kl": 0.21382649739583334, + "learning_rate": 5.263027403630533e-07, + "loss": 0.0, + "reward": 0.17638890693585077, + "reward_std": 0.049337546030680336, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889054457347, + "step": 399 + }, + { + "completion_length": 179.02083587646484, + "epoch": 13.337078651685394, + "grad_norm": 0.10532863364621428, + "kl": 0.2506510416666667, + "learning_rate": 5.242811110572242e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 400 + }, + { + "completion_length": 188.70139821370444, + "epoch": 13.370786516853933, + "grad_norm": 0.0002357174535753633, + "kl": 0.3167317708333333, + "learning_rate": 5.222590837869571e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 401 + }, + { + "completion_length": 159.29167048136392, + "epoch": 13.404494382022472, + "grad_norm": 0.0016623225923695986, + "kl": 0.2757161458333333, + "learning_rate": 5.202366916930319e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 402 + }, + { + "completion_length": 208.22222391764322, + "epoch": 13.438202247191011, + "grad_norm": 0.0005433316887850818, + "kl": 0.5068359375, + "learning_rate": 5.182139679222071e-07, + "loss": 0.0001, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 403 + }, + { + "completion_length": 167.14583841959634, + "epoch": 13.47191011235955, + "grad_norm": 0.008640657915404278, + "kl": 0.2814127604166667, + "learning_rate": 5.16190945626678e-07, + "loss": 0.0, + "reward": 0.2458333522081375, + "reward_std": 0.04535908500353495, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.14583333830038706, + "step": 404 + }, + { + "completion_length": 218.2638931274414, + "epoch": 13.50561797752809, + "grad_norm": 0.32151324015002974, + "kl": 0.2609049479166667, + "learning_rate": 5.141676579635321e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 405 + }, + { + "completion_length": 235.06250762939453, + "epoch": 13.539325842696629, + "grad_norm": 0.00031033750331374154, + "kl": 0.2364501953125, + "learning_rate": 5.121441380942065e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 406 + }, + { + "completion_length": 268.9861195882161, + "epoch": 13.573033707865168, + "grad_norm": 0.07905844933373425, + "kl": 0.19075520833333334, + "learning_rate": 5.101204191839444e-07, + "loss": 0.0, + "reward": 0.2069444681207339, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09583334128061931, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 407 + }, + { + "completion_length": 208.20834096272787, + "epoch": 13.606741573033707, + "grad_norm": 0.09608491103279254, + "kl": 0.18416341145833334, + "learning_rate": 5.080965344012508e-07, + "loss": 0.0, + "reward": 0.3083333596587181, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2083333432674408, + "step": 408 + }, + { + "completion_length": 167.96528244018555, + "epoch": 13.640449438202246, + "grad_norm": 0.048948115660668264, + "kl": 0.302734375, + "learning_rate": 5.060725169173494e-07, + "loss": 0.0, + "reward": 0.097916675110658, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.097916675110658, + "rewards/solution_reward_func": 0.0, + "step": 409 + }, + { + "completion_length": 206.86805852254233, + "epoch": 13.674157303370787, + "grad_norm": 0.036440037046322135, + "kl": 0.24812825520833334, + "learning_rate": 5.040483999056393e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 410 + }, + { + "completion_length": 162.15972773234049, + "epoch": 13.707865168539326, + "grad_norm": 0.00032977342187953565, + "kl": 0.2552083333333333, + "learning_rate": 5.020242165411503e-07, + "loss": 0.0, + "reward": 0.2250000163912773, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1250000037252903, + "step": 411 + }, + { + "completion_length": 230.4513931274414, + "epoch": 13.741573033707866, + "grad_norm": 0.20764036295872904, + "kl": 0.19319661458333334, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 412 + }, + { + "completion_length": 164.1666717529297, + "epoch": 13.775280898876405, + "grad_norm": 0.0005905781369925677, + "kl": 0.2431640625, + "learning_rate": 4.979757834588498e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 413 + }, + { + "completion_length": 235.1805623372396, + "epoch": 13.808988764044944, + "grad_norm": 0.00037054793613402624, + "kl": 0.18155924479166666, + "learning_rate": 4.959516000943607e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 414 + }, + { + "completion_length": 185.38889821370444, + "epoch": 13.842696629213483, + "grad_norm": 0.00044972995539110055, + "kl": 0.2794596354166667, + "learning_rate": 4.939274830826506e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 415 + }, + { + "completion_length": 188.93750508626303, + "epoch": 13.876404494382022, + "grad_norm": 0.00041969638777271157, + "kl": 0.2520345052083333, + "learning_rate": 4.919034655987492e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 416 + }, + { + "completion_length": 195.31250762939453, + "epoch": 13.910112359550562, + "grad_norm": 0.00040950039159462167, + "kl": 0.22908528645833334, + "learning_rate": 4.898795808160557e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 417 + }, + { + "completion_length": 181.3125025431315, + "epoch": 13.9438202247191, + "grad_norm": 0.050108152183806716, + "kl": 0.223388671875, + "learning_rate": 4.878558619057935e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 418 + }, + { + "completion_length": 212.9166692097982, + "epoch": 13.97752808988764, + "grad_norm": 0.0004219227421004525, + "kl": 0.24300130208333334, + "learning_rate": 4.85832342036468e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 419 + }, + { + "completion_length": 214.0, + "epoch": 14.0, + "grad_norm": 0.04997924855970522, + "kl": 0.183837890625, + "learning_rate": 4.838090543733221e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 420 + }, + { + "completion_length": 184.95139439900717, + "epoch": 14.03370786516854, + "grad_norm": 0.0004201660556234122, + "kl": 0.23982747395833334, + "learning_rate": 4.817860320777929e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 421 + }, + { + "completion_length": 175.45833587646484, + "epoch": 14.067415730337078, + "grad_norm": 0.07509150094936456, + "kl": 0.3214518229166667, + "learning_rate": 4.797633083069683e-07, + "loss": 0.0, + "reward": 0.37638891239960987, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 422 + }, + { + "completion_length": 217.94444783528647, + "epoch": 14.101123595505618, + "grad_norm": 0.0001941914222500721, + "kl": 0.21940104166666666, + "learning_rate": 4.77740916213043e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 423 + }, + { + "completion_length": 217.18750762939453, + "epoch": 14.134831460674157, + "grad_norm": 0.0002025350091661306, + "kl": 0.22802734375, + "learning_rate": 4.75718888942776e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 424 + }, + { + "completion_length": 169.12500381469727, + "epoch": 14.168539325842696, + "grad_norm": 0.0005197782718127106, + "kl": 0.3312174479166667, + "learning_rate": 4.7369725963694656e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 425 + }, + { + "completion_length": 154.90972646077475, + "epoch": 14.202247191011235, + "grad_norm": 0.09277940829064768, + "kl": 0.3219401041666667, + "learning_rate": 4.7167606142981173e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 426 + }, + { + "completion_length": 148.22222773234049, + "epoch": 14.235955056179776, + "grad_norm": 0.00028063361317496476, + "kl": 0.297607421875, + "learning_rate": 4.696553274485628e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 427 + }, + { + "completion_length": 184.79167683919272, + "epoch": 14.269662921348315, + "grad_norm": 0.0012698411579828127, + "kl": 0.2862955729166667, + "learning_rate": 4.676350908127821e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 428 + }, + { + "completion_length": 207.8541742960612, + "epoch": 14.303370786516854, + "grad_norm": 0.00020973277504669202, + "kl": 0.180908203125, + "learning_rate": 4.6561538463390076e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 429 + }, + { + "completion_length": 127.5277811686198, + "epoch": 14.337078651685394, + "grad_norm": 0.055367245657142926, + "kl": 0.6676432291666666, + "learning_rate": 4.6359624201465597e-07, + "loss": 0.0001, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 430 + }, + { + "completion_length": 164.1805623372396, + "epoch": 14.370786516853933, + "grad_norm": 0.0008108279145827242, + "kl": 0.28662109375, + "learning_rate": 4.6157769604854784e-07, + "loss": 0.0, + "reward": 0.26597223927577335, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 431 + }, + { + "completion_length": 195.0208396911621, + "epoch": 14.404494382022472, + "grad_norm": 0.035046849749853205, + "kl": 0.35693359375, + "learning_rate": 4.595597798192979e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 432 + }, + { + "completion_length": 149.1458371480306, + "epoch": 14.438202247191011, + "grad_norm": 0.0004132029027644461, + "kl": 0.3048502604166667, + "learning_rate": 4.575425264003059e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 433 + }, + { + "completion_length": 227.8055623372396, + "epoch": 14.47191011235955, + "grad_norm": 0.0003549320396508729, + "kl": 0.16324869791666666, + "learning_rate": 4.555259688541086e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 434 + }, + { + "completion_length": 179.4305623372396, + "epoch": 14.50561797752809, + "grad_norm": 0.04981780203348592, + "kl": 0.244140625, + "learning_rate": 4.535101402318372e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 435 + }, + { + "completion_length": 171.81944783528647, + "epoch": 14.539325842696629, + "grad_norm": 0.03680238339161683, + "kl": 0.2950846354166667, + "learning_rate": 4.5149507357267597e-07, + "loss": 0.0, + "reward": 0.2097222382823626, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 436 + }, + { + "completion_length": 138.3541692097982, + "epoch": 14.573033707865168, + "grad_norm": 0.09183677574711158, + "kl": 0.2802734375, + "learning_rate": 4.4948080190332095e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 437 + }, + { + "completion_length": 259.81251017252606, + "epoch": 14.606741573033707, + "grad_norm": 0.00047948910395449724, + "kl": 0.21866861979166666, + "learning_rate": 4.47467358237438e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 438 + }, + { + "completion_length": 205.5555623372396, + "epoch": 14.640449438202246, + "grad_norm": 0.0002665695617108525, + "kl": 0.2931315104166667, + "learning_rate": 4.454547755751226e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 439 + }, + { + "completion_length": 171.31944783528647, + "epoch": 14.674157303370787, + "grad_norm": 0.13710385404345207, + "kl": 0.2937825520833333, + "learning_rate": 4.434430869023579e-07, + "loss": 0.0, + "reward": 0.17638890941937765, + "reward_std": 0.049337549755970635, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.07638889302810033, + "step": 440 + }, + { + "completion_length": 168.5763931274414, + "epoch": 14.707865168539326, + "grad_norm": 0.11486991086566431, + "kl": 0.2942708333333333, + "learning_rate": 4.414323251904752e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 441 + }, + { + "completion_length": 179.40972900390625, + "epoch": 14.741573033707866, + "grad_norm": 0.11245966115421278, + "kl": 0.258056640625, + "learning_rate": 4.394225233956127e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 442 + }, + { + "completion_length": 156.7777837117513, + "epoch": 14.775280898876405, + "grad_norm": 0.0004859560435108553, + "kl": 0.2766927083333333, + "learning_rate": 4.3741371445817566e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 443 + }, + { + "completion_length": 157.47222900390625, + "epoch": 14.808988764044944, + "grad_norm": 0.0005497833039302257, + "kl": 0.3638509114583333, + "learning_rate": 4.354059313022969e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 444 + }, + { + "completion_length": 188.6388956705729, + "epoch": 14.842696629213483, + "grad_norm": 0.0006967351900209226, + "kl": 0.23819986979166666, + "learning_rate": 4.3339920683529633e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 445 + }, + { + "completion_length": 151.25694783528647, + "epoch": 14.876404494382022, + "grad_norm": 0.0002478218894817169, + "kl": 0.3118489583333333, + "learning_rate": 4.313935739471425e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 446 + }, + { + "completion_length": 187.72223154703775, + "epoch": 14.910112359550562, + "grad_norm": 0.0005062909773275928, + "kl": 0.3043619791666667, + "learning_rate": 4.293890655099127e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 447 + }, + { + "completion_length": 220.4027837117513, + "epoch": 14.9438202247191, + "grad_norm": 0.00018911751024986066, + "kl": 0.291015625, + "learning_rate": 4.2738571437725496e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 448 + }, + { + "completion_length": 229.17361450195312, + "epoch": 14.97752808988764, + "grad_norm": 0.00034558388239139606, + "kl": 0.23486328125, + "learning_rate": 4.253835533838489e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 449 + }, + { + "completion_length": 156.0, + "epoch": 15.0, + "grad_norm": 0.0007660661793562322, + "kl": 0.234375, + "learning_rate": 4.233826153448684e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 450 + }, + { + "completion_length": 181.04167556762695, + "epoch": 15.03370786516854, + "grad_norm": 0.007094099382061799, + "kl": 0.274658203125, + "learning_rate": 4.2138293305544284e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 451 + }, + { + "completion_length": 172.22222900390625, + "epoch": 15.067415730337078, + "grad_norm": 0.06544339578312929, + "kl": 0.2692057291666667, + "learning_rate": 4.193845392901201e-07, + "loss": 0.0, + "reward": 0.25763891140619916, + "reward_std": 0.022517128537098568, + "rewards/format_reward_func": 0.097916675110658, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 452 + }, + { + "completion_length": 192.02084096272787, + "epoch": 15.101123595505618, + "grad_norm": 0.026983113078097677, + "kl": 0.26171875, + "learning_rate": 4.173874668023293e-07, + "loss": 0.0, + "reward": 0.3152778074145317, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778108914694, + "step": 453 + }, + { + "completion_length": 142.51389185587564, + "epoch": 15.134831460674157, + "grad_norm": 0.00042277783975308683, + "kl": 0.3512369791666667, + "learning_rate": 4.1539174832384415e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 454 + }, + { + "completion_length": 139.8611157735189, + "epoch": 15.168539325842696, + "grad_norm": 0.0006725431507052571, + "kl": 0.32080078125, + "learning_rate": 4.133974165642461e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 455 + }, + { + "completion_length": 194.61111450195312, + "epoch": 15.202247191011235, + "grad_norm": 0.007799154572751969, + "kl": 0.3564453125, + "learning_rate": 4.1140450421038866e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 456 + }, + { + "completion_length": 160.0138931274414, + "epoch": 15.235955056179776, + "grad_norm": 0.0009772230071249098, + "kl": 0.2732747395833333, + "learning_rate": 4.0941304392586115e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 457 + }, + { + "completion_length": 170.6875025431315, + "epoch": 15.269662921348315, + "grad_norm": 0.000784111519363425, + "kl": 0.3238118489583333, + "learning_rate": 4.07423068350454e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 458 + }, + { + "completion_length": 171.02778244018555, + "epoch": 15.303370786516854, + "grad_norm": 0.0008429394623382894, + "kl": 0.271728515625, + "learning_rate": 4.054346100996232e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 459 + }, + { + "completion_length": 225.98611958821616, + "epoch": 15.337078651685394, + "grad_norm": 0.1573173925023226, + "kl": 0.251953125, + "learning_rate": 4.0344770176395606e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 460 + }, + { + "completion_length": 161.2916717529297, + "epoch": 15.370786516853933, + "grad_norm": 0.07141887970989558, + "kl": 0.23063151041666666, + "learning_rate": 4.0146237590863695e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 461 + }, + { + "completion_length": 180.7777837117513, + "epoch": 15.404494382022472, + "grad_norm": 0.0002797271143654389, + "kl": 0.2529296875, + "learning_rate": 3.994786650729136e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 462 + }, + { + "completion_length": 195.98611704508463, + "epoch": 15.438202247191011, + "grad_norm": 0.0006422159625303369, + "kl": 0.2661946614583333, + "learning_rate": 3.974966017695639e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 463 + }, + { + "completion_length": 186.00000508626303, + "epoch": 15.47191011235955, + "grad_norm": 0.00028473530088646146, + "kl": 0.2718098958333333, + "learning_rate": 3.955162184843624e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 464 + }, + { + "completion_length": 252.5763931274414, + "epoch": 15.50561797752809, + "grad_norm": 0.00025081713741414963, + "kl": 0.20402018229166666, + "learning_rate": 3.935375476755491e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 465 + }, + { + "completion_length": 191.27084096272787, + "epoch": 15.539325842696629, + "grad_norm": 0.02890150894357404, + "kl": 0.2975260416666667, + "learning_rate": 3.915606217732962e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 466 + }, + { + "completion_length": 159.63194783528647, + "epoch": 15.573033707865168, + "grad_norm": 0.0007826623119459179, + "kl": 0.29736328125, + "learning_rate": 3.89585473179177e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 467 + }, + { + "completion_length": 142.22917048136392, + "epoch": 15.606741573033707, + "grad_norm": 0.0002039721211095107, + "kl": 0.2711588541666667, + "learning_rate": 3.8761213426563543e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 468 + }, + { + "completion_length": 187.55556360880533, + "epoch": 15.640449438202246, + "grad_norm": 0.0004174276881004873, + "kl": 0.23673502604166666, + "learning_rate": 3.856406373754545e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 469 + }, + { + "completion_length": 195.6041717529297, + "epoch": 15.674157303370787, + "grad_norm": 0.0001730058012489683, + "kl": 0.2503255208333333, + "learning_rate": 3.8367101482122705e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 470 + }, + { + "completion_length": 180.73611450195312, + "epoch": 15.707865168539326, + "grad_norm": 0.0004095745406866655, + "kl": 0.2691243489583333, + "learning_rate": 3.817032988848255e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 471 + }, + { + "completion_length": 146.02778116861978, + "epoch": 15.741573033707866, + "grad_norm": 0.00045396703551767767, + "kl": 0.34619140625, + "learning_rate": 3.7973752181687327e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 472 + }, + { + "completion_length": 152.84028244018555, + "epoch": 15.775280898876405, + "grad_norm": 0.0009086729113282224, + "kl": 0.3414713541666667, + "learning_rate": 3.7777371583621565e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 473 + }, + { + "completion_length": 180.48611704508463, + "epoch": 15.808988764044944, + "grad_norm": 0.0002678250865837051, + "kl": 0.22884114583333334, + "learning_rate": 3.758119131293925e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 474 + }, + { + "completion_length": 159.9166717529297, + "epoch": 15.842696629213483, + "grad_norm": 0.04183647208889375, + "kl": 0.2705078125, + "learning_rate": 3.7385214585010993e-07, + "loss": 0.0, + "reward": 0.2597222477197647, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1597222238779068, + "step": 475 + }, + { + "completion_length": 207.4930623372396, + "epoch": 15.876404494382022, + "grad_norm": 0.0026321247620772464, + "kl": 0.2845052083333333, + "learning_rate": 3.718944461187138e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 476 + }, + { + "completion_length": 181.15278244018555, + "epoch": 15.910112359550562, + "grad_norm": 0.0643462502527668, + "kl": 0.3087565104166667, + "learning_rate": 3.699388460216628e-07, + "loss": 0.0, + "reward": 0.23888890693585077, + "reward_std": 0.02969569464524587, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.13888889302810034, + "step": 477 + }, + { + "completion_length": 226.1388905843099, + "epoch": 15.9438202247191, + "grad_norm": 0.0006608658101970019, + "kl": 0.18668619791666666, + "learning_rate": 3.6798537761100347e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 478 + }, + { + "completion_length": 158.53472773234049, + "epoch": 15.97752808988764, + "grad_norm": 0.0007673988615900748, + "kl": 0.3330078125, + "learning_rate": 3.660340729038435e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 479 + }, + { + "completion_length": 145.0, + "epoch": 16.0, + "grad_norm": 0.0007673988615900748, + "kl": 0.357177734375, + "learning_rate": 3.640849638818285e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 480 + }, + { + "completion_length": 213.62500762939453, + "epoch": 16.03370786516854, + "grad_norm": 0.0005566545756712285, + "kl": 0.2451171875, + "learning_rate": 3.6213808249061675e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 481 + }, + { + "completion_length": 172.7708396911621, + "epoch": 16.06741573033708, + "grad_norm": 0.0010665464074822223, + "kl": 0.23811848958333334, + "learning_rate": 3.601934606393555e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 482 + }, + { + "completion_length": 214.5555623372396, + "epoch": 16.10112359550562, + "grad_norm": 0.0004358483909927699, + "kl": 0.25244140625, + "learning_rate": 3.582511302001593e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 483 + }, + { + "completion_length": 183.3263905843099, + "epoch": 16.134831460674157, + "grad_norm": 0.00016145382024288697, + "kl": 0.3138020833333333, + "learning_rate": 3.563111230075859e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 484 + }, + { + "completion_length": 243.87500381469727, + "epoch": 16.168539325842698, + "grad_norm": 0.0041061854858934765, + "kl": 0.23567708333333334, + "learning_rate": 3.5437347085811595e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 485 + }, + { + "completion_length": 178.05555979410806, + "epoch": 16.202247191011235, + "grad_norm": 0.0003720172699577454, + "kl": 0.24169921875, + "learning_rate": 3.524382055096308e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 486 + }, + { + "completion_length": 221.9791717529297, + "epoch": 16.235955056179776, + "grad_norm": 0.0018606485020804804, + "kl": 0.24593098958333334, + "learning_rate": 3.505053586808928e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 487 + }, + { + "completion_length": 181.3680623372396, + "epoch": 16.269662921348313, + "grad_norm": 0.0003023417125708598, + "kl": 0.2874348958333333, + "learning_rate": 3.485749620510247e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 488 + }, + { + "completion_length": 136.18750508626303, + "epoch": 16.303370786516854, + "grad_norm": 0.0007412092525650524, + "kl": 0.3489583333333333, + "learning_rate": 3.4664704725899084e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 489 + }, + { + "completion_length": 231.3263931274414, + "epoch": 16.337078651685392, + "grad_norm": 0.0023119731750364607, + "kl": 0.22477213541666666, + "learning_rate": 3.447216459030789e-07, + "loss": 0.0, + "reward": 0.09652778630455335, + "reward_std": 0.0028752734263738, + "rewards/format_reward_func": 0.09652778630455335, + "rewards/solution_reward_func": 0.0, + "step": 490 + }, + { + "completion_length": 146.0416717529297, + "epoch": 16.370786516853933, + "grad_norm": 0.07934830006529212, + "kl": 0.38330078125, + "learning_rate": 3.427987895403811e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 491 + }, + { + "completion_length": 194.65972900390625, + "epoch": 16.40449438202247, + "grad_norm": 0.0024616964768175435, + "kl": 0.255615234375, + "learning_rate": 3.408785096862782e-07, + "loss": 0.0, + "reward": 0.3708333745598793, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2708333383003871, + "step": 492 + }, + { + "completion_length": 157.9166717529297, + "epoch": 16.43820224719101, + "grad_norm": 0.05090342779756699, + "kl": 0.279296875, + "learning_rate": 3.389608378139215e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 493 + }, + { + "completion_length": 187.18750508626303, + "epoch": 16.471910112359552, + "grad_norm": 0.0005266055465295907, + "kl": 0.2958984375, + "learning_rate": 3.3704580535371857e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 494 + }, + { + "completion_length": 139.47917048136392, + "epoch": 16.50561797752809, + "grad_norm": 0.00033242377384275375, + "kl": 0.2864583333333333, + "learning_rate": 3.351334436928169e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 495 + }, + { + "completion_length": 171.61111450195312, + "epoch": 16.53932584269663, + "grad_norm": 0.0003433146219479661, + "kl": 0.3076171875, + "learning_rate": 3.3322378417458977e-07, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 496 + }, + { + "completion_length": 201.29861323038736, + "epoch": 16.573033707865168, + "grad_norm": 0.0005523469896541151, + "kl": 0.2691243489583333, + "learning_rate": 3.3131685809812307e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 497 + }, + { + "completion_length": 165.68750508626303, + "epoch": 16.60674157303371, + "grad_norm": 0.001259766625478532, + "kl": 0.2861328125, + "learning_rate": 3.294126967177019e-07, + "loss": 0.0, + "reward": 0.21041667958100638, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 498 + }, + { + "completion_length": 142.47917048136392, + "epoch": 16.640449438202246, + "grad_norm": 0.029801353652665036, + "kl": 0.2766927083333333, + "learning_rate": 3.27511331242298e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 499 + }, + { + "completion_length": 187.46528116861978, + "epoch": 16.674157303370787, + "grad_norm": 0.00023592876790693798, + "kl": 0.21061197916666666, + "learning_rate": 3.2561279283505884e-07, + "loss": 0.0, + "reward": 0.2527777974804242, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15277778108914694, + "step": 500 + }, + { + "completion_length": 232.31250762939453, + "epoch": 16.707865168539325, + "grad_norm": 0.039691578143193414, + "kl": 0.19384765625, + "learning_rate": 3.237171126127963e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 501 + }, + { + "completion_length": 220.0277862548828, + "epoch": 16.741573033707866, + "grad_norm": 0.0002919158109261208, + "kl": 0.252197265625, + "learning_rate": 3.2182432164547744e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 502 + }, + { + "completion_length": 148.4791717529297, + "epoch": 16.775280898876403, + "grad_norm": 0.0006969325031468606, + "kl": 0.2596028645833333, + "learning_rate": 3.199344509557144e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 503 + }, + { + "completion_length": 199.78473027547201, + "epoch": 16.808988764044944, + "grad_norm": 0.0029776921604326156, + "kl": 0.21036783854166666, + "learning_rate": 3.1804753151825627e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 504 + }, + { + "completion_length": 211.5763931274414, + "epoch": 16.84269662921348, + "grad_norm": 0.000362935900616192, + "kl": 0.26025390625, + "learning_rate": 3.161635942594817e-07, + "loss": 0.0, + "reward": 0.3777778223156929, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 505 + }, + { + "completion_length": 174.34722391764322, + "epoch": 16.876404494382022, + "grad_norm": 0.0005819455013136116, + "kl": 0.3260091145833333, + "learning_rate": 3.142826700568918e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 506 + }, + { + "completion_length": 146.27083587646484, + "epoch": 16.910112359550563, + "grad_norm": 0.0003226275246467629, + "kl": 0.2959798177083333, + "learning_rate": 3.1240478973860397e-07, + "loss": 0.0, + "reward": 0.4888889367381732, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3888888955116272, + "step": 507 + }, + { + "completion_length": 153.27083778381348, + "epoch": 16.9438202247191, + "grad_norm": 0.0002953886776196681, + "kl": 0.2813313802083333, + "learning_rate": 3.105299840828466e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 508 + }, + { + "completion_length": 190.10417556762695, + "epoch": 16.97752808988764, + "grad_norm": 0.00023914591194317672, + "kl": 0.18937174479166666, + "learning_rate": 3.086582838174551e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 509 + }, + { + "completion_length": 221.75, + "epoch": 17.0, + "grad_norm": 0.0002365166305588903, + "kl": 0.1475830078125, + "learning_rate": 3.0678971961936764e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 510 + }, + { + "completion_length": 199.2986157735189, + "epoch": 17.03370786516854, + "grad_norm": 0.0001592642382625475, + "kl": 0.19205729166666666, + "learning_rate": 3.0492432211412246e-07, + "loss": 0.0, + "reward": 0.21041667958100638, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 511 + }, + { + "completion_length": 184.52084096272787, + "epoch": 17.06741573033708, + "grad_norm": 0.026191303709783974, + "kl": 0.3232421875, + "learning_rate": 3.030621218753565e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 512 + }, + { + "completion_length": 217.34028116861978, + "epoch": 17.10112359550562, + "grad_norm": 0.0005213211536442613, + "kl": 0.19645182291666666, + "learning_rate": 3.012031494243036e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 513 + }, + { + "completion_length": 203.84722646077475, + "epoch": 17.134831460674157, + "grad_norm": 0.00017776950087120656, + "kl": 0.23974609375, + "learning_rate": 2.9934743522929473e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 514 + }, + { + "completion_length": 213.1041717529297, + "epoch": 17.168539325842698, + "grad_norm": 0.00020671902226922128, + "kl": 0.19140625, + "learning_rate": 2.9749500970525833e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 515 + }, + { + "completion_length": 172.56250762939453, + "epoch": 17.202247191011235, + "grad_norm": 0.00040316237402194784, + "kl": 0.31884765625, + "learning_rate": 2.95645903213222e-07, + "loss": 0.0, + "reward": 0.21111111591259638, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 516 + }, + { + "completion_length": 219.9791742960612, + "epoch": 17.235955056179776, + "grad_norm": 0.0003038942050579442, + "kl": 0.22672526041666666, + "learning_rate": 2.938001460598147e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 517 + }, + { + "completion_length": 163.3541717529297, + "epoch": 17.269662921348313, + "grad_norm": 0.000256374475494251, + "kl": 0.29345703125, + "learning_rate": 2.9195776849677035e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 518 + }, + { + "completion_length": 200.3263956705729, + "epoch": 17.303370786516854, + "grad_norm": 0.21010452644547833, + "kl": 0.2587890625, + "learning_rate": 2.9011880072043153e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 519 + }, + { + "completion_length": 143.42361323038736, + "epoch": 17.337078651685392, + "grad_norm": 0.049163029216386225, + "kl": 0.2815755208333333, + "learning_rate": 2.8828327287125507e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 520 + }, + { + "completion_length": 176.43055979410806, + "epoch": 17.370786516853933, + "grad_norm": 0.0004144706327603887, + "kl": 0.25146484375, + "learning_rate": 2.8645121503331745e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 521 + }, + { + "completion_length": 145.35416793823242, + "epoch": 17.40449438202247, + "grad_norm": 0.00011116813702915777, + "kl": 0.32861328125, + "learning_rate": 2.846226572338225e-07, + "loss": 0.0, + "reward": 0.3777778148651123, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 522 + }, + { + "completion_length": 209.6041717529297, + "epoch": 17.43820224719101, + "grad_norm": 0.0006482477088515298, + "kl": 0.21346028645833334, + "learning_rate": 2.8279762944260907e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 523 + }, + { + "completion_length": 217.17361704508463, + "epoch": 17.471910112359552, + "grad_norm": 0.0002547376273377751, + "kl": 0.22298177083333334, + "learning_rate": 2.8097616157165885e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 524 + }, + { + "completion_length": 172.06945037841797, + "epoch": 17.50561797752809, + "grad_norm": 0.0002926450792546274, + "kl": 0.2517903645833333, + "learning_rate": 2.791582834746073e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 525 + }, + { + "completion_length": 145.8541742960612, + "epoch": 17.53932584269663, + "grad_norm": 0.0009642904327038002, + "kl": 0.3297526041666667, + "learning_rate": 2.773440249462539e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 526 + }, + { + "completion_length": 190.89584096272787, + "epoch": 17.573033707865168, + "grad_norm": 0.0011017436943071563, + "kl": 0.230712890625, + "learning_rate": 2.7553341572207365e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 527 + }, + { + "completion_length": 201.82639694213867, + "epoch": 17.60674157303371, + "grad_norm": 0.00032329113091717046, + "kl": 0.24650065104166666, + "learning_rate": 2.7372648547773056e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 528 + }, + { + "completion_length": 221.59028244018555, + "epoch": 17.640449438202246, + "grad_norm": 0.025196081610186986, + "kl": 0.22233072916666666, + "learning_rate": 2.7192326382858975e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 529 + }, + { + "completion_length": 188.31250635782877, + "epoch": 17.674157303370787, + "grad_norm": 0.00048174302825108346, + "kl": 0.22721354166666666, + "learning_rate": 2.7012378032923343e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 530 + }, + { + "completion_length": 159.56250762939453, + "epoch": 17.707865168539325, + "grad_norm": 0.00023391908498085804, + "kl": 0.3929036458333333, + "learning_rate": 2.6832806447297556e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 531 + }, + { + "completion_length": 178.4305623372396, + "epoch": 17.741573033707866, + "grad_norm": 0.008927285579615134, + "kl": 0.23388671875, + "learning_rate": 2.665361456913797e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 532 + }, + { + "completion_length": 197.43750508626303, + "epoch": 17.775280898876403, + "grad_norm": 0.0007373960660768022, + "kl": 0.22493489583333334, + "learning_rate": 2.6474805335377497e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 533 + }, + { + "completion_length": 170.97222391764322, + "epoch": 17.808988764044944, + "grad_norm": 0.09895245114478557, + "kl": 0.2718098958333333, + "learning_rate": 2.6296381676677604e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 534 + }, + { + "completion_length": 155.2916692097982, + "epoch": 17.84269662921348, + "grad_norm": 0.00019845446555343958, + "kl": 0.2713216145833333, + "learning_rate": 2.6118346517380204e-07, + "loss": 0.0, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 535 + }, + { + "completion_length": 163.6041742960612, + "epoch": 17.876404494382022, + "grad_norm": 0.35059106378117666, + "kl": 0.2736002604166667, + "learning_rate": 2.5940702775459744e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 536 + }, + { + "completion_length": 213.95139821370444, + "epoch": 17.910112359550563, + "grad_norm": 0.00032919963866057854, + "kl": 0.20572916666666666, + "learning_rate": 2.576345336247545e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 537 + }, + { + "completion_length": 200.06944783528647, + "epoch": 17.9438202247191, + "grad_norm": 0.0002946586980019382, + "kl": 0.22517903645833334, + "learning_rate": 2.558660118352348e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 538 + }, + { + "completion_length": 218.2152837117513, + "epoch": 17.97752808988764, + "grad_norm": 0.0007617386477090666, + "kl": 0.2649739583333333, + "learning_rate": 2.54101491371894e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 539 + }, + { + "completion_length": 143.25, + "epoch": 18.0, + "grad_norm": 0.0013684971481281655, + "kl": 0.3729248046875, + "learning_rate": 2.523410011550064e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 540 + }, + { + "completion_length": 147.11805852254233, + "epoch": 18.03370786516854, + "grad_norm": 0.00026044187261099505, + "kl": 0.27197265625, + "learning_rate": 2.5058457003879095e-07, + "loss": 0.0, + "reward": 0.433333362142245, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3333333432674408, + "step": 541 + }, + { + "completion_length": 196.44445292154947, + "epoch": 18.06741573033708, + "grad_norm": 0.0001726259095882516, + "kl": 0.2848307291666667, + "learning_rate": 2.4883222681093914e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 542 + }, + { + "completion_length": 215.82639185587564, + "epoch": 18.10112359550562, + "grad_norm": 0.027896195514329737, + "kl": 0.25146484375, + "learning_rate": 2.470840001921416e-07, + "loss": 0.0, + "reward": 0.32083336760600406, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.2222222238779068, + "step": 543 + }, + { + "completion_length": 158.09028244018555, + "epoch": 18.134831460674157, + "grad_norm": 0.005923091097568209, + "kl": 0.3663736979166667, + "learning_rate": 2.4533991883561867e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 544 + }, + { + "completion_length": 195.6805623372396, + "epoch": 18.168539325842698, + "grad_norm": 0.0027432563151921527, + "kl": 0.2667643229166667, + "learning_rate": 2.4360001132665e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 545 + }, + { + "completion_length": 170.83334096272787, + "epoch": 18.202247191011235, + "grad_norm": 0.000246278258278897, + "kl": 0.3001302083333333, + "learning_rate": 2.4186430618210703e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 546 + }, + { + "completion_length": 167.8680591583252, + "epoch": 18.235955056179776, + "grad_norm": 0.000783715655676822, + "kl": 0.3297526041666667, + "learning_rate": 2.401328318499846e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 547 + }, + { + "completion_length": 161.98611704508463, + "epoch": 18.269662921348313, + "grad_norm": 0.18241735833336076, + "kl": 0.3321940104166667, + "learning_rate": 2.3840561670893495e-07, + "loss": 0.0, + "reward": 0.18333334724108377, + "reward_std": 0.05143445233503977, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.08333333333333333, + "step": 548 + }, + { + "completion_length": 155.9652862548828, + "epoch": 18.303370786516854, + "grad_norm": 0.0003336559068269552, + "kl": 0.3075358072916667, + "learning_rate": 2.366826890678027e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 549 + }, + { + "completion_length": 175.50695164998373, + "epoch": 18.337078651685392, + "grad_norm": 0.0004679263573041468, + "kl": 0.3214518229166667, + "learning_rate": 2.349640771651611e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 550 + }, + { + "completion_length": 229.22222773234049, + "epoch": 18.370786516853933, + "grad_norm": 0.0004848884442788, + "kl": 0.232666015625, + "learning_rate": 2.3324980916884858e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 551 + }, + { + "completion_length": 158.61805979410806, + "epoch": 18.40449438202247, + "grad_norm": 0.0009329956005683197, + "kl": 0.3297526041666667, + "learning_rate": 2.3153991317550808e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 552 + }, + { + "completion_length": 163.10416793823242, + "epoch": 18.43820224719101, + "grad_norm": 0.0005516296077125221, + "kl": 0.2710774739583333, + "learning_rate": 2.2983441721012542e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 553 + }, + { + "completion_length": 153.09028180440268, + "epoch": 18.471910112359552, + "grad_norm": 0.000298872459206572, + "kl": 0.2736002604166667, + "learning_rate": 2.2813334922557077e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 554 + }, + { + "completion_length": 192.59722646077475, + "epoch": 18.50561797752809, + "grad_norm": 0.0009054638856262661, + "kl": 0.169677734375, + "learning_rate": 2.264367371021401e-07, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 555 + }, + { + "completion_length": 174.73611704508463, + "epoch": 18.53932584269663, + "grad_norm": 0.0002770136997766701, + "kl": 0.21346028645833334, + "learning_rate": 2.247446086470982e-07, + "loss": 0.0, + "reward": 0.3777778148651123, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 556 + }, + { + "completion_length": 207.28473027547201, + "epoch": 18.573033707865168, + "grad_norm": 0.05825502088348805, + "kl": 0.294921875, + "learning_rate": 2.2305699159422369e-07, + "loss": 0.0, + "reward": 0.15277778978149095, + "reward_std": 0.0029695695266127586, + "rewards/format_reward_func": 0.09722223008672397, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 557 + }, + { + "completion_length": 188.03472646077475, + "epoch": 18.60674157303371, + "grad_norm": 0.00015557238876316614, + "kl": 0.289794921875, + "learning_rate": 2.2137391360335328e-07, + "loss": 0.0, + "reward": 0.3777778148651123, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 558 + }, + { + "completion_length": 238.39584096272787, + "epoch": 18.640449438202246, + "grad_norm": 0.035713562760989116, + "kl": 0.24300130208333334, + "learning_rate": 2.1969540225992922e-07, + "loss": 0.0, + "reward": 0.21041668206453323, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 559 + }, + { + "completion_length": 228.77084350585938, + "epoch": 18.674157303370787, + "grad_norm": 0.0006005145822756564, + "kl": 0.21875, + "learning_rate": 2.180214850745467e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 560 + }, + { + "completion_length": 179.33333841959634, + "epoch": 18.707865168539325, + "grad_norm": 0.0004564146216875389, + "kl": 0.2501627604166667, + "learning_rate": 2.1635218948250388e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 561 + }, + { + "completion_length": 174.78472391764322, + "epoch": 18.741573033707866, + "grad_norm": 0.0003128496093798455, + "kl": 0.2858072916666667, + "learning_rate": 2.1468754284335095e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 562 + }, + { + "completion_length": 187.93750508626303, + "epoch": 18.775280898876403, + "grad_norm": 0.00018512921721307578, + "kl": 0.2775065104166667, + "learning_rate": 2.1302757244044255e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 563 + }, + { + "completion_length": 192.46527989705405, + "epoch": 18.808988764044944, + "grad_norm": 0.000266200751305559, + "kl": 0.23868815104166666, + "learning_rate": 2.113723054804904e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 564 + }, + { + "completion_length": 215.0555623372396, + "epoch": 18.84269662921348, + "grad_norm": 0.0017516227688565205, + "kl": 0.23738606770833334, + "learning_rate": 2.0972176909311712e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 565 + }, + { + "completion_length": 174.13195037841797, + "epoch": 18.876404494382022, + "grad_norm": 0.00018264524840000274, + "kl": 0.2679850260416667, + "learning_rate": 2.0807599033041234e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 566 + }, + { + "completion_length": 167.75000508626303, + "epoch": 18.910112359550563, + "grad_norm": 0.0002559989868456028, + "kl": 0.257080078125, + "learning_rate": 2.0643499616648845e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 567 + }, + { + "completion_length": 153.86111704508463, + "epoch": 18.9438202247191, + "grad_norm": 0.0004999551391215485, + "kl": 0.2953287760416667, + "learning_rate": 2.0479881349703882e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 568 + }, + { + "completion_length": 136.27778244018555, + "epoch": 18.97752808988764, + "grad_norm": 0.04190039881867179, + "kl": 0.2545572916666667, + "learning_rate": 2.031674691388971e-07, + "loss": 0.0, + "reward": 0.4263889143864314, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3263888955116272, + "step": 569 + }, + { + "completion_length": 215.0, + "epoch": 19.0, + "grad_norm": 0.04190039881867179, + "kl": 0.1629638671875, + "learning_rate": 2.0154098982959744e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 570 + }, + { + "completion_length": 168.04167048136392, + "epoch": 19.03370786516854, + "grad_norm": 0.0001341953629555816, + "kl": 0.2789713541666667, + "learning_rate": 1.999194022269368e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 571 + }, + { + "completion_length": 195.00694783528647, + "epoch": 19.06741573033708, + "grad_norm": 0.00020498046490480363, + "kl": 0.30078125, + "learning_rate": 1.9830273290853766e-07, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 572 + }, + { + "completion_length": 239.819455464681, + "epoch": 19.10112359550562, + "grad_norm": 0.0005943589747518772, + "kl": 0.18318684895833334, + "learning_rate": 1.96691008371412e-07, + "loss": 0.0, + "reward": 0.1541666785875956, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 573 + }, + { + "completion_length": 158.29167048136392, + "epoch": 19.134831460674157, + "grad_norm": 0.047560030725533456, + "kl": 0.2509765625, + "learning_rate": 1.950842550315277e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 574 + }, + { + "completion_length": 185.8125025431315, + "epoch": 19.168539325842698, + "grad_norm": 0.0007172726668013988, + "kl": 0.22526041666666666, + "learning_rate": 1.9348249922337518e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 575 + }, + { + "completion_length": 176.61806360880533, + "epoch": 19.202247191011235, + "grad_norm": 0.0008811528127459397, + "kl": 0.2940266927083333, + "learning_rate": 1.918857671995363e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 576 + }, + { + "completion_length": 170.8263931274414, + "epoch": 19.235955056179776, + "grad_norm": 0.001179973268080948, + "kl": 0.3216145833333333, + "learning_rate": 1.9029408513025335e-07, + "loss": 0.0, + "reward": 0.4333333745598793, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3333333432674408, + "step": 577 + }, + { + "completion_length": 211.79861704508463, + "epoch": 19.269662921348313, + "grad_norm": 0.0006627643540678583, + "kl": 0.2586263020833333, + "learning_rate": 1.8870747910300062e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 578 + }, + { + "completion_length": 155.1875025431315, + "epoch": 19.303370786516854, + "grad_norm": 0.00048531562442700196, + "kl": 0.3245442708333333, + "learning_rate": 1.8712597512205657e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 579 + }, + { + "completion_length": 176.35417048136392, + "epoch": 19.337078651685392, + "grad_norm": 0.00028395786449119596, + "kl": 0.2833658854166667, + "learning_rate": 1.8554959910807772e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 580 + }, + { + "completion_length": 196.03472900390625, + "epoch": 19.370786516853933, + "grad_norm": 0.0001654614858044179, + "kl": 0.18741861979166666, + "learning_rate": 1.8397837689767426e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 581 + }, + { + "completion_length": 160.04861704508463, + "epoch": 19.40449438202247, + "grad_norm": 0.00018889052175852746, + "kl": 0.24820963541666666, + "learning_rate": 1.824123342429858e-07, + "loss": 0.0, + "reward": 0.2104166845480601, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 582 + }, + { + "completion_length": 203.73611450195312, + "epoch": 19.43820224719101, + "grad_norm": 0.021960395357737222, + "kl": 0.19148763020833334, + "learning_rate": 1.808514968112596e-07, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 583 + }, + { + "completion_length": 154.79861704508463, + "epoch": 19.471910112359552, + "grad_norm": 0.0002616209821961607, + "kl": 0.3020833333333333, + "learning_rate": 1.7929589018443014e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 584 + }, + { + "completion_length": 148.24305725097656, + "epoch": 19.50561797752809, + "grad_norm": 0.002710281727650938, + "kl": 0.3792317708333333, + "learning_rate": 1.777455398586995e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 585 + }, + { + "completion_length": 209.23611704508463, + "epoch": 19.53932584269663, + "grad_norm": 0.00044279739478836946, + "kl": 0.215576171875, + "learning_rate": 1.7620047124411997e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 586 + }, + { + "completion_length": 203.9166692097982, + "epoch": 19.573033707865168, + "grad_norm": 0.00018720890391204392, + "kl": 0.207763671875, + "learning_rate": 1.7466070966417678e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 587 + }, + { + "completion_length": 222.73611704508463, + "epoch": 19.60674157303371, + "grad_norm": 0.00027659453774651666, + "kl": 0.21435546875, + "learning_rate": 1.7312628035537386e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 588 + }, + { + "completion_length": 216.28472773234049, + "epoch": 19.640449438202246, + "grad_norm": 0.0006651186379978518, + "kl": 0.2509765625, + "learning_rate": 1.715972084668195e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 589 + }, + { + "completion_length": 214.03472900390625, + "epoch": 19.674157303370787, + "grad_norm": 0.005093142644139778, + "kl": 0.21647135416666666, + "learning_rate": 1.700735190598151e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 590 + }, + { + "completion_length": 180.29167048136392, + "epoch": 19.707865168539325, + "grad_norm": 0.00021382503730474258, + "kl": 0.2607421875, + "learning_rate": 1.6855523710744335e-07, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 591 + }, + { + "completion_length": 179.47222646077475, + "epoch": 19.741573033707866, + "grad_norm": 0.0004881405300365822, + "kl": 0.2679036458333333, + "learning_rate": 1.6704238749415955e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 592 + }, + { + "completion_length": 174.68750635782877, + "epoch": 19.775280898876403, + "grad_norm": 0.00041905274059340306, + "kl": 0.3966471354166667, + "learning_rate": 1.655349950153837e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 593 + }, + { + "completion_length": 200.13889694213867, + "epoch": 19.808988764044944, + "grad_norm": 0.0028517577771855593, + "kl": 0.2874348958333333, + "learning_rate": 1.6403308437709378e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 594 + }, + { + "completion_length": 195.22223154703775, + "epoch": 19.84269662921348, + "grad_norm": 0.0011276110234709326, + "kl": 0.2775065104166667, + "learning_rate": 1.6253668019542154e-07, + "loss": 0.0, + "reward": 0.09930556267499924, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0, + "step": 595 + }, + { + "completion_length": 219.68750508626303, + "epoch": 19.876404494382022, + "grad_norm": 0.03488538133583427, + "kl": 0.20206705729166666, + "learning_rate": 1.6104580699624837e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 596 + }, + { + "completion_length": 148.3541717529297, + "epoch": 19.910112359550563, + "grad_norm": 0.05702381605498913, + "kl": 0.5608723958333334, + "learning_rate": 1.5956048921480335e-07, + "loss": 0.0001, + "reward": 0.2597222452362378, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.15972222884496054, + "step": 597 + }, + { + "completion_length": 166.65972646077475, + "epoch": 19.9438202247191, + "grad_norm": 0.037675327156333635, + "kl": 0.31396484375, + "learning_rate": 1.5808075119526322e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 598 + }, + { + "completion_length": 255.83334096272787, + "epoch": 19.97752808988764, + "grad_norm": 0.0009637475867221613, + "kl": 0.21443684895833334, + "learning_rate": 1.566066171903528e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 599 + }, + { + "completion_length": 178.75, + "epoch": 20.0, + "grad_norm": 0.00015996814481130495, + "kl": 0.2569580078125, + "learning_rate": 1.5513811136094785e-07, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 600 + }, + { + "completion_length": 153.1458371480306, + "epoch": 20.03370786516854, + "grad_norm": 0.00035732953358737354, + "kl": 0.26904296875, + "learning_rate": 1.536752577756795e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 601 + }, + { + "completion_length": 162.93750635782877, + "epoch": 20.06741573033708, + "grad_norm": 0.0002183734845821849, + "kl": 0.2861328125, + "learning_rate": 1.5221808041053873e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 602 + }, + { + "completion_length": 208.79862213134766, + "epoch": 20.10112359550562, + "grad_norm": 0.0007000026892602064, + "kl": 0.2601725260416667, + "learning_rate": 1.5076660314848422e-07, + "loss": 0.0, + "reward": 0.21111111591259638, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 603 + }, + { + "completion_length": 163.2291692097982, + "epoch": 20.134831460674157, + "grad_norm": 0.0011135002082513414, + "kl": 0.27001953125, + "learning_rate": 1.493208497790504e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 604 + }, + { + "completion_length": 188.23611704508463, + "epoch": 20.168539325842698, + "grad_norm": 0.0002996725820640498, + "kl": 0.2762044270833333, + "learning_rate": 1.478808439979583e-07, + "loss": 0.0, + "reward": 0.3222222576538722, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2222222238779068, + "step": 605 + }, + { + "completion_length": 187.9513956705729, + "epoch": 20.202247191011235, + "grad_norm": 0.0024256150363441953, + "kl": 0.25927734375, + "learning_rate": 1.4644660940672627e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 606 + }, + { + "completion_length": 138.69444529215494, + "epoch": 20.235955056179776, + "grad_norm": 0.00035899558165193525, + "kl": 0.2972005208333333, + "learning_rate": 1.4501816951228369e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 607 + }, + { + "completion_length": 197.69445037841797, + "epoch": 20.269662921348313, + "grad_norm": 0.03175607934367312, + "kl": 0.23307291666666666, + "learning_rate": 1.435955477265855e-07, + "loss": 0.0, + "reward": 0.26597224920988083, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 608 + }, + { + "completion_length": 189.9583422342936, + "epoch": 20.303370786516854, + "grad_norm": 0.0005107194044044579, + "kl": 0.293701171875, + "learning_rate": 1.4217876736622848e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 609 + }, + { + "completion_length": 229.42361704508463, + "epoch": 20.337078651685392, + "grad_norm": 0.0006320520954696784, + "kl": 0.2431640625, + "learning_rate": 1.4076785165206962e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 610 + }, + { + "completion_length": 124.14583841959636, + "epoch": 20.370786516853933, + "grad_norm": 0.00031276872475435034, + "kl": 0.2845052083333333, + "learning_rate": 1.3936282370884455e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 611 + }, + { + "completion_length": 181.7638931274414, + "epoch": 20.40449438202247, + "grad_norm": 0.00017349092419559616, + "kl": 0.24886067708333334, + "learning_rate": 1.3796370656478934e-07, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 612 + }, + { + "completion_length": 215.47222900390625, + "epoch": 20.43820224719101, + "grad_norm": 0.0001880996687355057, + "kl": 0.19132486979166666, + "learning_rate": 1.3657052315126248e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 613 + }, + { + "completion_length": 170.79861704508463, + "epoch": 20.471910112359552, + "grad_norm": 0.0002810709280405986, + "kl": 0.3165690104166667, + "learning_rate": 1.3518329630236987e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 614 + }, + { + "completion_length": 260.0972264607747, + "epoch": 20.50561797752809, + "grad_norm": 0.042187188050853905, + "kl": 0.20711263020833334, + "learning_rate": 1.338020487545896e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 615 + }, + { + "completion_length": 213.7777837117513, + "epoch": 20.53932584269663, + "grad_norm": 0.0001851019003729682, + "kl": 0.2527669270833333, + "learning_rate": 1.3242680314639993e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 616 + }, + { + "completion_length": 226.40972773234049, + "epoch": 20.573033707865168, + "grad_norm": 0.023844437105060456, + "kl": 0.19376627604166666, + "learning_rate": 1.310575820179079e-07, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 617 + }, + { + "completion_length": 171.54167366027832, + "epoch": 20.60674157303371, + "grad_norm": 0.00021010349928677328, + "kl": 0.2820638020833333, + "learning_rate": 1.2969440781048013e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 618 + }, + { + "completion_length": 134.11111450195312, + "epoch": 20.640449438202246, + "grad_norm": 0.0010084558357691066, + "kl": 0.3831380208333333, + "learning_rate": 1.283373028663751e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 619 + }, + { + "completion_length": 180.94444783528647, + "epoch": 20.674157303370787, + "grad_norm": 0.0002054378275055212, + "kl": 0.2688802083333333, + "learning_rate": 1.2698628942837697e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 620 + }, + { + "completion_length": 182.97222900390625, + "epoch": 20.707865168539325, + "grad_norm": 0.0007744267283834212, + "kl": 0.22395833333333334, + "learning_rate": 1.2564138963943027e-07, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 621 + }, + { + "completion_length": 164.11805852254233, + "epoch": 20.741573033707866, + "grad_norm": 0.00017960774710246342, + "kl": 0.447265625, + "learning_rate": 1.24302625542278e-07, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 622 + }, + { + "completion_length": 259.31250254313153, + "epoch": 20.775280898876403, + "grad_norm": 0.16308362100618984, + "kl": 0.207763671875, + "learning_rate": 1.229700190790998e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 623 + }, + { + "completion_length": 196.23611195882162, + "epoch": 20.808988764044944, + "grad_norm": 0.0005471761264440179, + "kl": 0.232421875, + "learning_rate": 1.2164359209115232e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 624 + }, + { + "completion_length": 198.34028116861978, + "epoch": 20.84269662921348, + "grad_norm": 0.0005723121383704211, + "kl": 0.3268229166666667, + "learning_rate": 1.2032336631841182e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 625 + }, + { + "completion_length": 178.5138931274414, + "epoch": 20.876404494382022, + "grad_norm": 0.0004743536846415594, + "kl": 0.2896321614583333, + "learning_rate": 1.1900936339921691e-07, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 626 + }, + { + "completion_length": 197.2291742960612, + "epoch": 20.910112359550563, + "grad_norm": 0.000358683834699279, + "kl": 0.23649088541666666, + "learning_rate": 1.177016048699146e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 627 + }, + { + "completion_length": 204.22222900390625, + "epoch": 20.9438202247191, + "grad_norm": 0.00020560337442893765, + "kl": 0.24951171875, + "learning_rate": 1.1640011216450691e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 628 + }, + { + "completion_length": 167.7986183166504, + "epoch": 20.97752808988764, + "grad_norm": 0.0005777950758507777, + "kl": 0.2783203125, + "learning_rate": 1.1510490661430028e-07, + "loss": 0.0, + "reward": 0.3222222576538722, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2222222238779068, + "step": 629 + }, + { + "completion_length": 285.0, + "epoch": 21.0, + "grad_norm": 0.00028896356973310124, + "kl": 0.2581787109375, + "learning_rate": 1.1381600944755492e-07, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 630 + }, + { + "completion_length": 175.84028244018555, + "epoch": 21.03370786516854, + "grad_norm": 0.0011333379282095048, + "kl": 0.30712890625, + "learning_rate": 1.1253344178913771e-07, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 631 + }, + { + "completion_length": 174.7638956705729, + "epoch": 21.06741573033708, + "grad_norm": 0.0005113125284089561, + "kl": 0.2566731770833333, + "learning_rate": 1.1125722466017545e-07, + "loss": 0.0, + "reward": 0.3777778074145317, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 632 + }, + { + "completion_length": 170.9652862548828, + "epoch": 21.10112359550562, + "grad_norm": 0.000567750501402814, + "kl": 0.2529296875, + "learning_rate": 1.0998737897771054e-07, + "loss": 0.0, + "reward": 0.21111111591259638, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 633 + }, + { + "completion_length": 129.0763931274414, + "epoch": 21.134831460674157, + "grad_norm": 0.0006490317569770536, + "kl": 0.3323567708333333, + "learning_rate": 1.0872392555435855e-07, + "loss": 0.0, + "reward": 0.488888921837012, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3888889004786809, + "step": 634 + }, + { + "completion_length": 152.38889439900717, + "epoch": 21.168539325842698, + "grad_norm": 0.000288819247591633, + "kl": 0.3590494791666667, + "learning_rate": 1.0746688509796625e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 635 + }, + { + "completion_length": 215.37500635782877, + "epoch": 21.202247191011235, + "grad_norm": 0.09509192691224101, + "kl": 0.17830403645833334, + "learning_rate": 1.0621627821127288e-07, + "loss": 0.0, + "reward": 0.14861112584670386, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 636 + }, + { + "completion_length": 212.9791742960612, + "epoch": 21.235955056179776, + "grad_norm": 0.0004321258063297821, + "kl": 0.240234375, + "learning_rate": 1.0497212539157219e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 637 + }, + { + "completion_length": 168.4583396911621, + "epoch": 21.269662921348313, + "grad_norm": 0.00025448148191447624, + "kl": 0.24723307291666666, + "learning_rate": 1.0373444703037643e-07, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 638 + }, + { + "completion_length": 131.09028244018555, + "epoch": 21.303370786516854, + "grad_norm": 0.0029377138654805676, + "kl": 0.3684895833333333, + "learning_rate": 1.0250326341308274e-07, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 639 + }, + { + "completion_length": 206.4027862548828, + "epoch": 21.337078651685392, + "grad_norm": 0.00032973067704078626, + "kl": 0.2626139322916667, + "learning_rate": 1.0127859471863969e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 640 + }, + { + "completion_length": 169.2083371480306, + "epoch": 21.370786516853933, + "grad_norm": 0.0004075048935201697, + "kl": 0.2930501302083333, + "learning_rate": 1.0006046101921727e-07, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 641 + }, + { + "completion_length": 145.7638956705729, + "epoch": 21.40449438202247, + "grad_norm": 0.05338005115598968, + "kl": 0.31591796875, + "learning_rate": 9.884888227987759e-08, + "loss": 0.0, + "reward": 0.2041666880249977, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.10416666666666667, + "step": 642 + }, + { + "completion_length": 178.29861704508463, + "epoch": 21.43820224719101, + "grad_norm": 0.0003599425330882252, + "kl": 0.22802734375, + "learning_rate": 9.764387835824794e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 643 + }, + { + "completion_length": 158.7569491068522, + "epoch": 21.471910112359552, + "grad_norm": 0.00020968012419614123, + "kl": 0.24641927083333334, + "learning_rate": 9.644546900419531e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 644 + }, + { + "completion_length": 188.34722391764322, + "epoch": 21.50561797752809, + "grad_norm": 0.0006269598121744968, + "kl": 0.3116861979166667, + "learning_rate": 9.525367385950206e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 645 + }, + { + "completion_length": 195.36111704508463, + "epoch": 21.53932584269663, + "grad_norm": 0.001291702139087472, + "kl": 0.2936197916666667, + "learning_rate": 9.406851245754477e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 646 + }, + { + "completion_length": 183.1388905843099, + "epoch": 21.573033707865168, + "grad_norm": 0.0007546123178681681, + "kl": 0.291015625, + "learning_rate": 9.289000422297372e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 647 + }, + { + "completion_length": 193.90972646077475, + "epoch": 21.60674157303371, + "grad_norm": 0.0002355729579157676, + "kl": 0.25048828125, + "learning_rate": 9.171816847139447e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 648 + }, + { + "completion_length": 192.04167302449545, + "epoch": 21.640449438202246, + "grad_norm": 0.000708093859211731, + "kl": 0.2530924479166667, + "learning_rate": 9.055302440905177e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 649 + }, + { + "completion_length": 238.34722900390625, + "epoch": 21.674157303370787, + "grad_norm": 0.0010115801998800392, + "kl": 0.22021484375, + "learning_rate": 8.939459113251407e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 650 + }, + { + "completion_length": 246.14584604899088, + "epoch": 21.707865168539325, + "grad_norm": 0.00015701443909386454, + "kl": 0.15079752604166666, + "learning_rate": 8.824288762836097e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 651 + }, + { + "completion_length": 187.0277837117513, + "epoch": 21.741573033707866, + "grad_norm": 0.00033617915303408224, + "kl": 0.21321614583333334, + "learning_rate": 8.70979327728718e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 652 + }, + { + "completion_length": 192.2500025431315, + "epoch": 21.775280898876403, + "grad_norm": 0.0006794413223166669, + "kl": 0.24332682291666666, + "learning_rate": 8.595974533171651e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 653 + }, + { + "completion_length": 192.7638931274414, + "epoch": 21.808988764044944, + "grad_norm": 0.0002873869578904506, + "kl": 0.22412109375, + "learning_rate": 8.4828343959648e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 654 + }, + { + "completion_length": 194.81250635782877, + "epoch": 21.84269662921348, + "grad_norm": 0.0399085129648236, + "kl": 0.279541015625, + "learning_rate": 8.370374720019629e-08, + "loss": 0.0, + "reward": 0.2097222457329432, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 655 + }, + { + "completion_length": 184.7430623372396, + "epoch": 21.876404494382022, + "grad_norm": 0.043644105650381015, + "kl": 0.21126302083333334, + "learning_rate": 8.258597348536451e-08, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 656 + }, + { + "completion_length": 246.0416717529297, + "epoch": 21.910112359550563, + "grad_norm": 0.09641891969018652, + "kl": 0.21199544270833334, + "learning_rate": 8.147504113532682e-08, + "loss": 0.0, + "reward": 0.14791668206453323, + "reward_std": 0.0216060404976209, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.04861111442248026, + "step": 657 + }, + { + "completion_length": 235.88889821370444, + "epoch": 21.9438202247191, + "grad_norm": 0.00026395665371413483, + "kl": 0.21378580729166666, + "learning_rate": 8.037096835812884e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 658 + }, + { + "completion_length": 184.0902837117513, + "epoch": 21.97752808988764, + "grad_norm": 0.00022547220455543436, + "kl": 0.2903645833333333, + "learning_rate": 7.9273773249388e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 659 + }, + { + "completion_length": 187.0, + "epoch": 22.0, + "grad_norm": 0.00022547220455543436, + "kl": 0.27587890625, + "learning_rate": 7.81834737919978e-08, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 660 + }, + { + "completion_length": 239.2152887980143, + "epoch": 22.03370786516854, + "grad_norm": 0.00026129371410752627, + "kl": 0.19563802083333334, + "learning_rate": 7.710008785583289e-08, + "loss": 0.0, + "reward": 0.09930556267499924, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0, + "step": 661 + }, + { + "completion_length": 219.77778116861978, + "epoch": 22.06741573033708, + "grad_norm": 0.0720117078235811, + "kl": 0.22607421875, + "learning_rate": 7.602363319745608e-08, + "loss": 0.0, + "reward": 0.3152778049310048, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778605620065, + "step": 662 + }, + { + "completion_length": 196.0416717529297, + "epoch": 22.10112359550562, + "grad_norm": 0.0006203240647822728, + "kl": 0.23876953125, + "learning_rate": 7.495412745982759e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 663 + }, + { + "completion_length": 222.88195037841797, + "epoch": 22.134831460674157, + "grad_norm": 0.00019386445482427227, + "kl": 0.24039713541666666, + "learning_rate": 7.389158817201541e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 664 + }, + { + "completion_length": 169.30555979410806, + "epoch": 22.168539325842698, + "grad_norm": 0.0012245296472149798, + "kl": 0.2843424479166667, + "learning_rate": 7.283603274890848e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 665 + }, + { + "completion_length": 156.3680623372396, + "epoch": 22.202247191011235, + "grad_norm": 0.0004342989088723961, + "kl": 0.33203125, + "learning_rate": 7.178747849093092e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 666 + }, + { + "completion_length": 148.95833841959634, + "epoch": 22.235955056179776, + "grad_norm": 0.0002122003256238786, + "kl": 0.2965494791666667, + "learning_rate": 7.074594258375849e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 667 + }, + { + "completion_length": 196.5069465637207, + "epoch": 22.269662921348313, + "grad_norm": 0.0005347691003669362, + "kl": 0.22835286458333334, + "learning_rate": 6.971144209803736e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 668 + }, + { + "completion_length": 205.11111704508463, + "epoch": 22.303370786516854, + "grad_norm": 0.0002628755773513076, + "kl": 0.2989908854166667, + "learning_rate": 6.86839939891039e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 669 + }, + { + "completion_length": 135.08333587646484, + "epoch": 22.337078651685392, + "grad_norm": 0.00047867159577611845, + "kl": 0.2989095052083333, + "learning_rate": 6.766361509670687e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 670 + }, + { + "completion_length": 200.7638931274414, + "epoch": 22.370786516853933, + "grad_norm": 0.00015208202471283735, + "kl": 0.24153645833333334, + "learning_rate": 6.665032214473126e-08, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 671 + }, + { + "completion_length": 196.0555623372396, + "epoch": 22.40449438202247, + "grad_norm": 0.04462460980187588, + "kl": 0.30419921875, + "learning_rate": 6.564413174092443e-08, + "loss": 0.0, + "reward": 0.2104166845480601, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 672 + }, + { + "completion_length": 180.42361958821616, + "epoch": 22.43820224719101, + "grad_norm": 0.0004364288169739122, + "kl": 0.28271484375, + "learning_rate": 6.464506037662415e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 673 + }, + { + "completion_length": 181.54167111714682, + "epoch": 22.471910112359552, + "grad_norm": 0.00039649450557427017, + "kl": 0.3271484375, + "learning_rate": 6.365312442648769e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 674 + }, + { + "completion_length": 179.6805623372396, + "epoch": 22.50561797752809, + "grad_norm": 0.0011572513401064352, + "kl": 0.29052734375, + "learning_rate": 6.266834014822376e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 675 + }, + { + "completion_length": 181.51389185587564, + "epoch": 22.53932584269663, + "grad_norm": 0.00039210323797979823, + "kl": 0.29345703125, + "learning_rate": 6.16907236823262e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 676 + }, + { + "completion_length": 187.59722773234049, + "epoch": 22.573033707865168, + "grad_norm": 0.0004503210090645172, + "kl": 0.2568359375, + "learning_rate": 6.072029105180909e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 677 + }, + { + "completion_length": 205.73611958821616, + "epoch": 22.60674157303371, + "grad_norm": 0.0002451871680843138, + "kl": 0.20003255208333334, + "learning_rate": 5.97570581619446e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 678 + }, + { + "completion_length": 187.25695037841797, + "epoch": 22.640449438202246, + "grad_norm": 0.0006996349416177805, + "kl": 0.205810546875, + "learning_rate": 5.880104080000181e-08, + "loss": 0.0, + "reward": 0.1972222402691841, + "reward_std": 0.025717226167519886, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.09722222884496053, + "step": 679 + }, + { + "completion_length": 150.1319491068522, + "epoch": 22.674157303370787, + "grad_norm": 0.032936616275985584, + "kl": 0.3470052083333333, + "learning_rate": 5.785225463498828e-08, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.03928371022144953, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 680 + }, + { + "completion_length": 191.94444783528647, + "epoch": 22.707865168539325, + "grad_norm": 0.3028052223353757, + "kl": 0.31640625, + "learning_rate": 5.691071521739299e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 681 + }, + { + "completion_length": 188.6041692097982, + "epoch": 22.741573033707866, + "grad_norm": 0.0014993329601292295, + "kl": 0.246826171875, + "learning_rate": 5.5976437978931755e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 682 + }, + { + "completion_length": 187.1111183166504, + "epoch": 22.775280898876403, + "grad_norm": 0.034097284466993213, + "kl": 0.23152669270833334, + "learning_rate": 5.50494382322943e-08, + "loss": 0.0, + "reward": 0.26597223182519275, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 683 + }, + { + "completion_length": 192.34722900390625, + "epoch": 22.808988764044944, + "grad_norm": 0.0002156528679256827, + "kl": 0.23095703125, + "learning_rate": 5.412973117089287e-08, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 684 + }, + { + "completion_length": 177.7013956705729, + "epoch": 22.84269662921348, + "grad_norm": 0.0007468229952328737, + "kl": 0.2757161458333333, + "learning_rate": 5.321733186861355e-08, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 685 + }, + { + "completion_length": 153.67361704508463, + "epoch": 22.876404494382022, + "grad_norm": 0.00021930562534913767, + "kl": 0.27587890625, + "learning_rate": 5.231225527956923e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 686 + }, + { + "completion_length": 197.67361195882162, + "epoch": 22.910112359550563, + "grad_norm": 0.0001930674336640042, + "kl": 0.1875, + "learning_rate": 5.141451623785453e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 687 + }, + { + "completion_length": 154.46528244018555, + "epoch": 22.9438202247191, + "grad_norm": 0.00022220240871867917, + "kl": 0.3035481770833333, + "learning_rate": 5.052412945730239e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 688 + }, + { + "completion_length": 162.59028244018555, + "epoch": 22.97752808988764, + "grad_norm": 0.00033434089709682226, + "kl": 0.3816731770833333, + "learning_rate": 4.964110953124306e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 689 + }, + { + "completion_length": 285.75, + "epoch": 23.0, + "grad_norm": 0.0029808567952967525, + "kl": 0.1558837890625, + "learning_rate": 4.876547093226513e-08, + "loss": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.0, + "step": 690 + }, + { + "completion_length": 154.12500508626303, + "epoch": 23.03370786516854, + "grad_norm": 0.00019444589042034763, + "kl": 0.4013671875, + "learning_rate": 4.7897228011977875e-08, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 691 + }, + { + "completion_length": 156.72917302449545, + "epoch": 23.06741573033708, + "grad_norm": 0.00047010252122451973, + "kl": 0.2881673177083333, + "learning_rate": 4.703639500077655e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 692 + }, + { + "completion_length": 229.2291717529297, + "epoch": 23.10112359550562, + "grad_norm": 0.0005326773809168474, + "kl": 0.20450846354166666, + "learning_rate": 4.618298600760895e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 693 + }, + { + "completion_length": 225.0902837117513, + "epoch": 23.134831460674157, + "grad_norm": 0.000502759177162051, + "kl": 0.20987955729166666, + "learning_rate": 4.533701501974391e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 694 + }, + { + "completion_length": 186.02778116861978, + "epoch": 23.168539325842698, + "grad_norm": 0.00014990266514685729, + "kl": 0.24169921875, + "learning_rate": 4.4498495902542346e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 695 + }, + { + "completion_length": 184.22222900390625, + "epoch": 23.202247191011235, + "grad_norm": 0.00024307737881120418, + "kl": 0.2609049479166667, + "learning_rate": 4.366744239922998e-08, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 696 + }, + { + "completion_length": 201.38889439900717, + "epoch": 23.235955056179776, + "grad_norm": 0.00040924399996122005, + "kl": 0.19352213541666666, + "learning_rate": 4.284386813067181e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 697 + }, + { + "completion_length": 242.84722900390625, + "epoch": 23.269662921348313, + "grad_norm": 0.000374710722347806, + "kl": 0.21394856770833334, + "learning_rate": 4.202778659514955e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 698 + }, + { + "completion_length": 173.90972391764322, + "epoch": 23.303370786516854, + "grad_norm": 0.0005620378509065019, + "kl": 0.3025716145833333, + "learning_rate": 4.121921116813948e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 699 + }, + { + "completion_length": 154.6111157735189, + "epoch": 23.337078651685392, + "grad_norm": 0.0002590834768844316, + "kl": 0.24446614583333334, + "learning_rate": 4.041815510209395e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 700 + }, + { + "completion_length": 160.75000762939453, + "epoch": 23.370786516853933, + "grad_norm": 0.00028182337248498063, + "kl": 0.2849934895833333, + "learning_rate": 3.962463152622364e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 701 + }, + { + "completion_length": 152.95833841959634, + "epoch": 23.40449438202247, + "grad_norm": 0.00022479142601588184, + "kl": 0.333984375, + "learning_rate": 3.8838653446283065e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 702 + }, + { + "completion_length": 161.1180623372396, + "epoch": 23.43820224719101, + "grad_norm": 0.0018515853857898302, + "kl": 0.2770182291666667, + "learning_rate": 3.806023374435663e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 703 + }, + { + "completion_length": 188.15972900390625, + "epoch": 23.471910112359552, + "grad_norm": 0.0003908424644345001, + "kl": 0.2732747395833333, + "learning_rate": 3.7289385178647935e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 704 + }, + { + "completion_length": 156.4166692097982, + "epoch": 23.50561797752809, + "grad_norm": 0.0004139485671846104, + "kl": 0.2649739583333333, + "learning_rate": 3.6526120383270634e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 705 + }, + { + "completion_length": 191.87500762939453, + "epoch": 23.53932584269663, + "grad_norm": 0.00034025317415438535, + "kl": 0.2535807291666667, + "learning_rate": 3.5770451868041174e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 706 + }, + { + "completion_length": 224.9652887980143, + "epoch": 23.573033707865168, + "grad_norm": 0.00047538032753343355, + "kl": 0.2521158854166667, + "learning_rate": 3.5022392018274173e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 707 + }, + { + "completion_length": 184.33333587646484, + "epoch": 23.60674157303371, + "grad_norm": 0.00016902296162218144, + "kl": 0.26953125, + "learning_rate": 3.4281953094578875e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 708 + }, + { + "completion_length": 148.07639439900717, + "epoch": 23.640449438202246, + "grad_norm": 0.0017519561231616905, + "kl": 0.35302734375, + "learning_rate": 3.354914723265867e-08, + "loss": 0.0, + "reward": 0.3777778148651123, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 709 + }, + { + "completion_length": 247.15972900390625, + "epoch": 23.674157303370787, + "grad_norm": 0.02444370398223035, + "kl": 0.21565755208333334, + "learning_rate": 3.282398644311185e-08, + "loss": 0.0, + "reward": 0.09930556267499924, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0, + "step": 710 + }, + { + "completion_length": 192.4513931274414, + "epoch": 23.707865168539325, + "grad_norm": 0.00031928287705952656, + "kl": 0.1865234375, + "learning_rate": 3.210648261123505e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 711 + }, + { + "completion_length": 130.00694783528647, + "epoch": 23.741573033707866, + "grad_norm": 0.02166961057094426, + "kl": 0.4842122395833333, + "learning_rate": 3.1396647496828244e-08, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 712 + }, + { + "completion_length": 165.3055559794108, + "epoch": 23.775280898876403, + "grad_norm": 0.00021593071019189043, + "kl": 0.2757161458333333, + "learning_rate": 3.069449273400199e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 713 + }, + { + "completion_length": 190.54861704508463, + "epoch": 23.808988764044944, + "grad_norm": 0.0005275181809997875, + "kl": 0.2594401041666667, + "learning_rate": 3.000002983098693e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 714 + }, + { + "completion_length": 185.94444783528647, + "epoch": 23.84269662921348, + "grad_norm": 0.05024427499231955, + "kl": 0.2776692708333333, + "learning_rate": 2.9313270169944948e-08, + "loss": 0.0, + "reward": 0.09930556515852611, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.0, + "step": 715 + }, + { + "completion_length": 165.8263956705729, + "epoch": 23.876404494382022, + "grad_norm": 0.0009329993269044629, + "kl": 0.3880208333333333, + "learning_rate": 2.8634225006782864e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 716 + }, + { + "completion_length": 195.93750508626303, + "epoch": 23.910112359550563, + "grad_norm": 0.00025637307456966284, + "kl": 0.22314453125, + "learning_rate": 2.796290547096791e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 717 + }, + { + "completion_length": 151.31944783528647, + "epoch": 23.9438202247191, + "grad_norm": 0.04947045078852399, + "kl": 0.3113606770833333, + "learning_rate": 2.7299322565344953e-08, + "loss": 0.0, + "reward": 0.3152778049310048, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778605620065, + "step": 718 + }, + { + "completion_length": 182.43750508626303, + "epoch": 23.97752808988764, + "grad_norm": 0.00036627573146962256, + "kl": 0.2613932291666667, + "learning_rate": 2.6643487165956603e-08, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 719 + }, + { + "completion_length": 140.5, + "epoch": 24.0, + "grad_norm": 0.0002971632065732625, + "kl": 0.225830078125, + "learning_rate": 2.5995410021864783e-08, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 720 + }, + { + "completion_length": 155.55556106567383, + "epoch": 24.03370786516854, + "grad_norm": 0.0007566839878927481, + "kl": 0.3429361979166667, + "learning_rate": 2.5355101754974462e-08, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 721 + }, + { + "completion_length": 220.02084096272787, + "epoch": 24.06741573033708, + "grad_norm": 0.0021202332071350374, + "kl": 0.30712890625, + "learning_rate": 2.4722572859859903e-08, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 722 + }, + { + "completion_length": 209.56945037841797, + "epoch": 24.10112359550562, + "grad_norm": 0.0014748911079252268, + "kl": 0.2996419270833333, + "learning_rate": 2.4097833703592197e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 723 + }, + { + "completion_length": 204.44444783528647, + "epoch": 24.134831460674157, + "grad_norm": 0.00017857332631362514, + "kl": 0.2545572916666667, + "learning_rate": 2.348089452556956e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 724 + }, + { + "completion_length": 169.6527837117513, + "epoch": 24.168539325842698, + "grad_norm": 0.0001785950074356697, + "kl": 0.20987955729166666, + "learning_rate": 2.2871765437349642e-08, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 725 + }, + { + "completion_length": 175.8055623372396, + "epoch": 24.202247191011235, + "grad_norm": 0.0009559561061891435, + "kl": 0.30419921875, + "learning_rate": 2.2270456422483653e-08, + "loss": 0.0, + "reward": 0.3777777949968974, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2777777860562007, + "step": 726 + }, + { + "completion_length": 121.77777989705403, + "epoch": 24.235955056179776, + "grad_norm": 0.0003273833221331931, + "kl": 0.291015625, + "learning_rate": 2.1676977336352765e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 727 + }, + { + "completion_length": 237.87500508626303, + "epoch": 24.269662921348313, + "grad_norm": 0.0018590277479054312, + "kl": 0.2599283854166667, + "learning_rate": 2.109133790600648e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 728 + }, + { + "completion_length": 195.4513905843099, + "epoch": 24.303370786516854, + "grad_norm": 0.0029628369386597965, + "kl": 0.3411458333333333, + "learning_rate": 2.0513547730003357e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 729 + }, + { + "completion_length": 218.77084096272787, + "epoch": 24.337078651685392, + "grad_norm": 0.0008109096746229552, + "kl": 0.21565755208333334, + "learning_rate": 1.9943616278253638e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 730 + }, + { + "completion_length": 158.29167048136392, + "epoch": 24.370786516853933, + "grad_norm": 0.00026495704066199625, + "kl": 0.2783203125, + "learning_rate": 1.9381552891864162e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 731 + }, + { + "completion_length": 241.6666742960612, + "epoch": 24.40449438202247, + "grad_norm": 0.00047831879231841437, + "kl": 0.20320638020833334, + "learning_rate": 1.882736678298491e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 732 + }, + { + "completion_length": 152.5208371480306, + "epoch": 24.43820224719101, + "grad_norm": 0.00027852996212374625, + "kl": 0.3201497395833333, + "learning_rate": 1.8281067034658426e-08, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 733 + }, + { + "completion_length": 179.12500508626303, + "epoch": 24.471910112359552, + "grad_norm": 0.0765084026559966, + "kl": 0.243896484375, + "learning_rate": 1.7742662600670642e-08, + "loss": 0.0, + "reward": 0.20416668554147085, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1041666716337204, + "step": 734 + }, + { + "completion_length": 163.68750127156576, + "epoch": 24.50561797752809, + "grad_norm": 0.005763544778778364, + "kl": 0.3357747395833333, + "learning_rate": 1.7212162305404288e-08, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 735 + }, + { + "completion_length": 156.7638905843099, + "epoch": 24.53932584269663, + "grad_norm": 0.0007794087861847593, + "kl": 0.33251953125, + "learning_rate": 1.6689574843694432e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 736 + }, + { + "completion_length": 165.84722900390625, + "epoch": 24.573033707865168, + "grad_norm": 0.1349189227634209, + "kl": 0.23486328125, + "learning_rate": 1.6174908780685447e-08, + "loss": 0.0, + "reward": 0.3152778074145317, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.21527778108914694, + "step": 737 + }, + { + "completion_length": 152.34028116861978, + "epoch": 24.60674157303371, + "grad_norm": 0.00039937049073144774, + "kl": 0.2869466145833333, + "learning_rate": 1.5668172551691174e-08, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 738 + }, + { + "completion_length": 199.28472646077475, + "epoch": 24.640449438202246, + "grad_norm": 0.00025139777333443327, + "kl": 0.21630859375, + "learning_rate": 1.516937446205624e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 739 + }, + { + "completion_length": 173.2916742960612, + "epoch": 24.674157303370787, + "grad_norm": 0.00025058596047241666, + "kl": 0.2620442708333333, + "learning_rate": 1.4678522687020412e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 740 + }, + { + "completion_length": 209.97223154703775, + "epoch": 24.707865168539325, + "grad_norm": 0.00038173865169876023, + "kl": 0.2373046875, + "learning_rate": 1.4195625271584189e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 741 + }, + { + "completion_length": 199.31250508626303, + "epoch": 24.741573033707866, + "grad_norm": 0.0005465176596977205, + "kl": 0.21565755208333334, + "learning_rate": 1.3720690130377022e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 742 + }, + { + "completion_length": 238.7986183166504, + "epoch": 24.775280898876403, + "grad_norm": 0.00032664084576131233, + "kl": 0.20279947916666666, + "learning_rate": 1.3253725047527809e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 743 + }, + { + "completion_length": 173.81944783528647, + "epoch": 24.808988764044944, + "grad_norm": 0.004055452857130601, + "kl": 0.44677734375, + "learning_rate": 1.2794737676536993e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 744 + }, + { + "completion_length": 208.9375025431315, + "epoch": 24.84269662921348, + "grad_norm": 0.00020579896685799117, + "kl": 0.18115234375, + "learning_rate": 1.2343735540151446e-08, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 745 + }, + { + "completion_length": 160.34722646077475, + "epoch": 24.876404494382022, + "grad_norm": 0.0008356329502367595, + "kl": 0.3623046875, + "learning_rate": 1.1900726030241004e-08, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 746 + }, + { + "completion_length": 216.31251017252603, + "epoch": 24.910112359550563, + "grad_norm": 0.000656143426696293, + "kl": 0.2999674479166667, + "learning_rate": 1.1465716407677295e-08, + "loss": 0.0, + "reward": 0.2666666979591052, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.16666666666666666, + "step": 747 + }, + { + "completion_length": 186.24305979410806, + "epoch": 24.9438202247191, + "grad_norm": 0.00016809393151369283, + "kl": 0.2517903645833333, + "learning_rate": 1.1038713802214717e-08, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 748 + }, + { + "completion_length": 180.78472900390625, + "epoch": 24.97752808988764, + "grad_norm": 0.00039975048038322483, + "kl": 0.2503255208333333, + "learning_rate": 1.0619725212373754e-08, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 749 + }, + { + "completion_length": 92.5, + "epoch": 25.0, + "grad_norm": 0.00039975048038322483, + "kl": 0.373779296875, + "learning_rate": 1.0208757505326015e-08, + "loss": 0.0, + "reward": 0.600000012665987, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.5, + "step": 750 + }, + { + "completion_length": 176.9861157735189, + "epoch": 25.03370786516854, + "grad_norm": 0.0006370220229769309, + "kl": 0.3396809895833333, + "learning_rate": 9.805817416782047e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 751 + }, + { + "completion_length": 156.05555979410806, + "epoch": 25.06741573033708, + "grad_norm": 0.0019754220869445094, + "kl": 0.3076171875, + "learning_rate": 9.410911550880474e-09, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 752 + }, + { + "completion_length": 217.2430648803711, + "epoch": 25.10112359550562, + "grad_norm": 0.001921143937532553, + "kl": 0.19124348958333334, + "learning_rate": 9.024046380080141e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 753 + }, + { + "completion_length": 208.7152837117513, + "epoch": 25.134831460674157, + "grad_norm": 0.0003456804102915417, + "kl": 0.22037760416666666, + "learning_rate": 8.645228245053759e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 754 + }, + { + "completion_length": 239.0902887980143, + "epoch": 25.168539325842698, + "grad_norm": 0.00015194232532251295, + "kl": 0.25439453125, + "learning_rate": 8.274463354584316e-09, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 755 + }, + { + "completion_length": 151.90972773234049, + "epoch": 25.202247191011235, + "grad_norm": 0.00043630916755023195, + "kl": 0.2633463541666667, + "learning_rate": 7.91175778546288e-09, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 756 + }, + { + "completion_length": 154.2638931274414, + "epoch": 25.235955056179776, + "grad_norm": 0.0002392403261461652, + "kl": 0.3411458333333333, + "learning_rate": 7.557117482389352e-09, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 757 + }, + { + "completion_length": 175.05555725097656, + "epoch": 25.269662921348313, + "grad_norm": 0.0002575479251715013, + "kl": 0.22574869791666666, + "learning_rate": 7.2105482578749265e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 758 + }, + { + "completion_length": 196.63195037841797, + "epoch": 25.303370786516854, + "grad_norm": 0.0001966749125281889, + "kl": 0.2586263020833333, + "learning_rate": 6.872055792146614e-09, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 759 + }, + { + "completion_length": 171.82639503479004, + "epoch": 25.337078651685392, + "grad_norm": 0.0002780892448283573, + "kl": 0.3160807291666667, + "learning_rate": 6.541645633054649e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 760 + }, + { + "completion_length": 153.02778244018555, + "epoch": 25.370786516853933, + "grad_norm": 0.003144225969746984, + "kl": 0.2815755208333333, + "learning_rate": 6.219323195981063e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 761 + }, + { + "completion_length": 236.03472646077475, + "epoch": 25.40449438202247, + "grad_norm": 0.0004151626251654825, + "kl": 0.20084635416666666, + "learning_rate": 5.90509376375109e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 762 + }, + { + "completion_length": 143.0763905843099, + "epoch": 25.43820224719101, + "grad_norm": 0.0007846172152213033, + "kl": 0.3971354166666667, + "learning_rate": 5.598962486546732e-09, + "loss": 0.0, + "reward": 0.4888889441887538, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.3888888955116272, + "step": 763 + }, + { + "completion_length": 234.56945673624674, + "epoch": 25.471910112359552, + "grad_norm": 0.000554944672487995, + "kl": 0.270263671875, + "learning_rate": 5.3009343818219975e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 764 + }, + { + "completion_length": 181.52778244018555, + "epoch": 25.50561797752809, + "grad_norm": 0.004690384031544844, + "kl": 0.22135416666666666, + "learning_rate": 5.011014334221186e-09, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 765 + }, + { + "completion_length": 202.48611450195312, + "epoch": 25.53932584269663, + "grad_norm": 0.00026768563065533924, + "kl": 0.21842447916666666, + "learning_rate": 4.7292070954983445e-09, + "loss": 0.0, + "reward": 0.3222222353021304, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 766 + }, + { + "completion_length": 215.1388956705729, + "epoch": 25.573033707865168, + "grad_norm": 0.0003594817174693318, + "kl": 0.20963541666666666, + "learning_rate": 4.455517284439603e-09, + "loss": 0.0, + "reward": 0.2666666954755783, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 767 + }, + { + "completion_length": 213.75000762939453, + "epoch": 25.60674157303371, + "grad_norm": 0.00019267633482959783, + "kl": 0.22013346354166666, + "learning_rate": 4.189949386787462e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 768 + }, + { + "completion_length": 182.9166692097982, + "epoch": 25.640449438202246, + "grad_norm": 0.0027837338741882904, + "kl": 0.2776692708333333, + "learning_rate": 3.932507755167236e-09, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 769 + }, + { + "completion_length": 177.7083396911621, + "epoch": 25.674157303370787, + "grad_norm": 0.00029506169734862047, + "kl": 0.2815755208333333, + "learning_rate": 3.683196609015782e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 770 + }, + { + "completion_length": 150.11111704508463, + "epoch": 25.707865168539325, + "grad_norm": 0.0006376818639406546, + "kl": 0.3323567708333333, + "learning_rate": 3.4420200345122165e-09, + "loss": 0.0, + "reward": 0.3291666905085246, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2291666716337204, + "step": 771 + }, + { + "completion_length": 229.09722900390625, + "epoch": 25.741573033707866, + "grad_norm": 0.21591520249329493, + "kl": 0.20231119791666666, + "learning_rate": 3.2089819845111944e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 772 + }, + { + "completion_length": 170.4375025431315, + "epoch": 25.775280898876403, + "grad_norm": 0.0002507166141399343, + "kl": 0.234619140625, + "learning_rate": 2.984086278477682e-09, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 773 + }, + { + "completion_length": 166.25694783528647, + "epoch": 25.808988764044944, + "grad_norm": 0.0003110073392551284, + "kl": 0.2703450520833333, + "learning_rate": 2.767336602424786e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 774 + }, + { + "completion_length": 187.59028498331705, + "epoch": 25.84269662921348, + "grad_norm": 0.0003291216298769772, + "kl": 0.2513834635416667, + "learning_rate": 2.5587365088532433e-09, + "loss": 0.0, + "reward": 0.3222222551703453, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 775 + }, + { + "completion_length": 195.84722646077475, + "epoch": 25.876404494382022, + "grad_norm": 0.000633623608702707, + "kl": 0.2586263020833333, + "learning_rate": 2.3582894166930267e-09, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 776 + }, + { + "completion_length": 120.79861450195312, + "epoch": 25.910112359550563, + "grad_norm": 0.0003326520037952373, + "kl": 0.3360188802083333, + "learning_rate": 2.1659986112473883e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 777 + }, + { + "completion_length": 178.4652837117513, + "epoch": 25.9438202247191, + "grad_norm": 0.00037813847731855375, + "kl": 0.23876953125, + "learning_rate": 1.9818672441391237e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 778 + }, + { + "completion_length": 173.78472518920898, + "epoch": 25.97752808988764, + "grad_norm": 0.0005401399584431868, + "kl": 0.31591796875, + "learning_rate": 1.8058983332587818e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 779 + }, + { + "completion_length": 248.25, + "epoch": 26.0, + "grad_norm": 0.0014254670355503191, + "kl": 0.40838623046875, + "learning_rate": 1.638094762715314e-09, + "loss": 0.0, + "reward": 0.35000000707805157, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000149011612, + "rewards/solution_reward_func": 0.25, + "step": 780 + }, + { + "completion_length": 175.93055852254233, + "epoch": 26.03370786516854, + "grad_norm": 0.00042937305289155933, + "kl": 0.35546875, + "learning_rate": 1.4784592827886688e-09, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 781 + }, + { + "completion_length": 189.23611450195312, + "epoch": 26.06741573033708, + "grad_norm": 0.0005835392662758728, + "kl": 0.24332682291666666, + "learning_rate": 1.3269945098847713e-09, + "loss": 0.0, + "reward": 0.2111111357808113, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 782 + }, + { + "completion_length": 192.49306106567383, + "epoch": 26.10112359550562, + "grad_norm": 0.0010556307653775384, + "kl": 0.2911783854166667, + "learning_rate": 1.18370292649278e-09, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 783 + }, + { + "completion_length": 191.62500381469727, + "epoch": 26.134831460674157, + "grad_norm": 0.0064669595438772995, + "kl": 0.24348958333333334, + "learning_rate": 1.0485868811441756e-09, + "loss": 0.0, + "reward": 0.10000000894069672, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.0, + "step": 784 + }, + { + "completion_length": 176.3888956705729, + "epoch": 26.168539325842698, + "grad_norm": 0.0005183559096263181, + "kl": 0.2431640625, + "learning_rate": 9.216485883744019e-10, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 785 + }, + { + "completion_length": 182.03473027547201, + "epoch": 26.202247191011235, + "grad_norm": 0.00030315997258418355, + "kl": 0.24593098958333334, + "learning_rate": 8.02890128686562e-10, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 786 + }, + { + "completion_length": 226.59722773234049, + "epoch": 26.235955056179776, + "grad_norm": 0.0003033164387677657, + "kl": 0.197265625, + "learning_rate": 6.923134485172233e-10, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 787 + }, + { + "completion_length": 196.3055648803711, + "epoch": 26.269662921348313, + "grad_norm": 0.00023608236389864086, + "kl": 0.2635091145833333, + "learning_rate": 5.899203602046654e-10, + "loss": 0.0, + "reward": 0.15486112236976624, + "reward_std": 0.0019641853868961334, + "rewards/format_reward_func": 0.09930556515852611, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 788 + }, + { + "completion_length": 194.43750762939453, + "epoch": 26.303370786516854, + "grad_norm": 0.03821653017721124, + "kl": 0.26171875, + "learning_rate": 4.957125419590147e-10, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 789 + }, + { + "completion_length": 176.77083841959634, + "epoch": 26.337078651685392, + "grad_norm": 0.03556165038128272, + "kl": 0.3053385416666667, + "learning_rate": 4.0969153783498854e-10, + "loss": 0.0, + "reward": 0.2097222382823626, + "reward_std": 0.0025717224925756454, + "rewards/format_reward_func": 0.09861111889282863, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 790 + }, + { + "completion_length": 180.49306106567383, + "epoch": 26.370786516853933, + "grad_norm": 0.0007970756607432913, + "kl": 0.3291015625, + "learning_rate": 3.318587577062493e-10, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 791 + }, + { + "completion_length": 178.54861195882162, + "epoch": 26.40449438202247, + "grad_norm": 0.009193979651874646, + "kl": 0.27880859375, + "learning_rate": 2.6221547724253333e-10, + "loss": 0.0, + "reward": 0.15555556863546371, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.055555557211240135, + "step": 792 + }, + { + "completion_length": 219.5486208597819, + "epoch": 26.43820224719101, + "grad_norm": 0.0004060633120508939, + "kl": 0.2599283854166667, + "learning_rate": 2.0076283788872382e-10, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 793 + }, + { + "completion_length": 143.59722518920898, + "epoch": 26.471910112359552, + "grad_norm": 0.00030500575685706824, + "kl": 0.2950846354166667, + "learning_rate": 1.4750184684597656e-10, + "loss": 0.0, + "reward": 0.2666666756073634, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 794 + }, + { + "completion_length": 147.64583841959634, + "epoch": 26.50561797752809, + "grad_norm": 0.00037006877114459423, + "kl": 0.2903645833333333, + "learning_rate": 1.024333770555108e-10, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 795 + }, + { + "completion_length": 168.2013956705729, + "epoch": 26.53932584269663, + "grad_norm": 0.0006103173589443691, + "kl": 0.251953125, + "learning_rate": 6.555816718389895e-11, + "loss": 0.0, + "reward": 0.2666666880249977, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.1666666716337204, + "step": 796 + }, + { + "completion_length": 170.35416793823242, + "epoch": 26.573033707865168, + "grad_norm": 0.000258191960856421, + "kl": 0.30859375, + "learning_rate": 3.687682161146455e-11, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 797 + }, + { + "completion_length": 196.0833371480306, + "epoch": 26.60674157303371, + "grad_norm": 0.0007612175025148647, + "kl": 0.24267578125, + "learning_rate": 1.6389810421846284e-11, + "loss": 0.0, + "reward": 0.2111111283302307, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.11111111442248027, + "step": 798 + }, + { + "completion_length": 165.51389694213867, + "epoch": 26.640449438202246, + "grad_norm": 0.00019571280917988966, + "kl": 0.24430338541666666, + "learning_rate": 4.0974693947259945e-12, + "loss": 0.0, + "reward": 0.3847222502032916, + "reward_std": 0.019641855110724766, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.2847222288449605, + "step": 799 + }, + { + "completion_length": 176.1111157735189, + "epoch": 26.674157303370787, + "grad_norm": 0.1819918282980935, + "kl": 0.23502604166666666, + "learning_rate": 0.0, + "loss": 0.0, + "reward": 0.3222222477197647, + "reward_std": 0.0, + "rewards/format_reward_func": 0.10000000894069672, + "rewards/solution_reward_func": 0.22222222884496054, + "step": 800 + }, + { + "epoch": 26.674157303370787, + "step": 800, + "total_flos": 0.0, + "train_loss": 0.00031588936745230277, + "train_runtime": 44241.5608, + "train_samples_per_second": 0.325, + "train_steps_per_second": 0.018 + } + ], + "logging_steps": 1, + "max_steps": 800, + "num_input_tokens_seen": 0, + "num_train_epochs": 28, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}