{ "best_metric": null, "best_model_checkpoint": null, "epoch": 58.8235294117647, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 498.59722900390625, "epoch": 0.058823529411764705, "grad_norm": 0.6520975546512277, "kl": 0.0, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "reward": 0.18425556272268295, "reward_std": 0.017856805585324764, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.0856444463133812, "step": 1 }, { "completion_length": 497.1111297607422, "epoch": 0.11764705882352941, "grad_norm": 0.7159117786005874, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.39816589653491974, "reward_std": 0.009694711305201054, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.08983253687620163, "step": 2 }, { "completion_length": 526.0416717529297, "epoch": 0.17647058823529413, "grad_norm": 0.6489040632989099, "kl": 0.000640869140625, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.37728095799684525, "reward_std": 0.07768510933965445, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.08769762516021729, "step": 3 }, { "completion_length": 469.7083282470703, "epoch": 0.23529411764705882, "grad_norm": 0.7194065431652082, "kl": 0.0007963180541992188, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.6219849437475204, "reward_std": 0.2233370840549469, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4340277761220932, "rewards/thinker_reward_func": 0.08795715123414993, "step": 4 }, { "completion_length": 481.3888854980469, "epoch": 0.29411764705882354, "grad_norm": 0.6997879187476348, "kl": 0.00061798095703125, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.3125055730342865, "reward_std": 0.20004642521962523, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.09097778052091599, "step": 5 }, { "completion_length": 485.51390075683594, "epoch": 0.35294117647058826, "grad_norm": 0.7114201494890842, "kl": 0.0005130767822265625, "learning_rate": 2e-07, "loss": -0.0, "reward": 0.43261032551527023, "reward_std": 0.09006764832884073, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.08955476060509682, "step": 6 }, { "completion_length": 467.5972137451172, "epoch": 0.4117647058823529, "grad_norm": 0.6891177879827999, "kl": 0.0006580352783203125, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "reward": 0.482131764292717, "reward_std": 0.11464605433866382, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2951388880610466, "rewards/thinker_reward_func": 0.08699285984039307, "step": 7 }, { "completion_length": 452.2361145019531, "epoch": 0.47058823529411764, "grad_norm": 0.6116014199314035, "kl": 0.0004215240478515625, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.6363135129213333, "reward_std": 0.09077320108190179, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.0849246084690094, "step": 8 }, { "completion_length": 485.6666564941406, "epoch": 0.5294117647058824, "grad_norm": 0.636168426767794, "kl": 0.000751495361328125, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.6385730504989624, "reward_std": 0.274067685008049, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4513888955116272, "rewards/thinker_reward_func": 0.0871841311454773, "step": 9 }, { "completion_length": 435.09722900390625, "epoch": 0.5882352941176471, "grad_norm": 0.785010084704913, "kl": 0.0006542205810546875, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.9461833238601685, "reward_std": 0.27340032160282135, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7638888955116272, "rewards/thinker_reward_func": 0.08229444548487663, "step": 10 }, { "completion_length": 473.9166717529297, "epoch": 0.6470588235294118, "grad_norm": 0.7650231255066277, "kl": 0.0005340576171875, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "reward": 0.3578111082315445, "reward_std": 0.19107795506715775, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.08420000225305557, "step": 11 }, { "completion_length": 477.4861145019531, "epoch": 0.7058823529411765, "grad_norm": 0.5833688105051688, "kl": 0.00064849853515625, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.5305055379867554, "reward_std": 0.28634703159332275, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.08328333497047424, "step": 12 }, { "completion_length": 501.05555725097656, "epoch": 0.7647058823529411, "grad_norm": 0.6467284539111613, "kl": 0.0006084442138671875, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "reward": 0.36562617123126984, "reward_std": 0.1580744907259941, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1736111119389534, "rewards/thinker_reward_func": 0.09201508015394211, "step": 13 }, { "completion_length": 548.5138854980469, "epoch": 0.8235294117647058, "grad_norm": 0.5618305279517638, "kl": 0.000579833984375, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.7808889150619507, "reward_std": 0.12558943405747414, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.09061111509799957, "step": 14 }, { "completion_length": 459.9305725097656, "epoch": 0.8823529411764706, "grad_norm": 0.6703922347223938, "kl": 0.0007381439208984375, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.39646191895008087, "reward_std": 0.3002852573990822, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333320915699, "rewards/thinker_reward_func": 0.08812857419252396, "step": 15 }, { "completion_length": 483.51390075683594, "epoch": 0.9411764705882353, "grad_norm": 0.6791275909785556, "kl": 0.0006427764892578125, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "reward": 0.20873889327049255, "reward_std": 0.06733320350758731, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.09137777984142303, "step": 16 }, { "completion_length": 529.1111145019531, "epoch": 1.0, "grad_norm": 0.6166338531624204, "kl": 0.000553131103515625, "learning_rate": 5.666666666666666e-07, "loss": 0.0, "reward": 0.4009253904223442, "reward_std": 0.00787693727761507, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09259206801652908, "step": 17 }, { "completion_length": 531.625, "epoch": 1.0588235294117647, "grad_norm": 0.6979157451919794, "kl": 0.0007419586181640625, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.4107230305671692, "reward_std": 0.19463036954402924, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.2256944514811039, "rewards/thinker_reward_func": 0.0878063514828682, "step": 18 }, { "completion_length": 491.90277099609375, "epoch": 1.1176470588235294, "grad_norm": 0.6014715255378996, "kl": 0.0006866455078125, "learning_rate": 6.333333333333332e-07, "loss": 0.0, "reward": 0.5352508053183556, "reward_std": 0.1606993437744677, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.08802857249975204, "step": 19 }, { "completion_length": 450.30555725097656, "epoch": 1.1764705882352942, "grad_norm": 0.7120424054311372, "kl": 0.00096893310546875, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.3287603110074997, "reward_std": 0.25174400210380554, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1388888880610466, "rewards/thinker_reward_func": 0.08987143263220787, "step": 20 }, { "completion_length": 503.3611297607422, "epoch": 1.2352941176470589, "grad_norm": 0.6658291226664168, "kl": 0.001064300537109375, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.2234611213207245, "reward_std": 0.08884047297760844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0347222238779068, "rewards/thinker_reward_func": 0.08873889222741127, "step": 21 }, { "completion_length": 522.3472137451172, "epoch": 1.2941176470588236, "grad_norm": 0.6339572018630596, "kl": 0.001392364501953125, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.525196835398674, "reward_std": 0.30757758766412735, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3298611119389534, "rewards/thinker_reward_func": 0.0953357145190239, "step": 22 }, { "completion_length": 477.0138854980469, "epoch": 1.3529411764705883, "grad_norm": 0.6497735036333904, "kl": 0.00196075439453125, "learning_rate": 7.666666666666667e-07, "loss": 0.0, "reward": 0.575045257806778, "reward_std": 0.15946069359779358, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09310079738497734, "step": 23 }, { "completion_length": 492.6805725097656, "epoch": 1.4117647058823528, "grad_norm": 0.5979019206243117, "kl": 0.0023193359375, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.4188135117292404, "reward_std": 0.1467236652970314, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.0931190513074398, "step": 24 }, { "completion_length": 527.0694580078125, "epoch": 1.4705882352941178, "grad_norm": 0.5855070057275423, "kl": 0.00214385986328125, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": 0.6457309499382973, "reward_std": 0.12476762756705284, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4513889029622078, "rewards/thinker_reward_func": 0.09434206783771515, "step": 25 }, { "completion_length": 558.6527709960938, "epoch": 1.5294117647058822, "grad_norm": 0.4997197011360854, "kl": 0.00323486328125, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "reward": 0.40455079823732376, "reward_std": 0.0040462876204401255, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09621746093034744, "step": 26 }, { "completion_length": 595.0277709960938, "epoch": 1.5882352941176472, "grad_norm": 0.5865146250582498, "kl": 0.00372314453125, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.3697452172636986, "reward_std": 0.08609590050764382, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.09613412991166115, "step": 27 }, { "completion_length": 577.9722290039062, "epoch": 1.6470588235294117, "grad_norm": 0.6576602491193846, "kl": 0.00543212890625, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "reward": 0.42172859609127045, "reward_std": 0.19996163249015808, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.09603413566946983, "step": 28 }, { "completion_length": 538.0972290039062, "epoch": 1.7058823529411766, "grad_norm": 0.5659036488773891, "kl": 0.0074615478515625, "learning_rate": 9.666666666666666e-07, "loss": 0.0, "reward": 0.8046882152557373, "reward_std": 0.32572464644908905, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.09704921394586563, "step": 29 }, { "completion_length": 646.0694274902344, "epoch": 1.7647058823529411, "grad_norm": 0.47528110871938745, "kl": 0.0086669921875, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.19637539237737656, "reward_std": 0.00796083573368378, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09915318712592125, "step": 30 }, { "completion_length": 617.1111145019531, "epoch": 1.8235294117647058, "grad_norm": 0.5386156523370382, "kl": 0.0096435546875, "learning_rate": 9.99997377618298e-07, "loss": 0.0, "reward": 0.6779214590787888, "reward_std": 0.32167865335941315, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.4861111342906952, "rewards/thinker_reward_func": 0.09597699344158173, "step": 31 }, { "completion_length": 584.2083435058594, "epoch": 1.8823529411764706, "grad_norm": 0.46878348325965946, "kl": 0.01171875, "learning_rate": 9.999895105006994e-07, "loss": 0.0001, "reward": 0.371415913105011, "reward_std": 0.08435766166076064, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.09780476614832878, "step": 32 }, { "completion_length": 568.4166564941406, "epoch": 1.9411764705882353, "grad_norm": 0.5531284070676309, "kl": 0.01513671875, "learning_rate": 9.999763987297264e-07, "loss": 0.0, "reward": 0.5100920647382736, "reward_std": 0.1121294666081667, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3124999925494194, "rewards/thinker_reward_func": 0.0975920706987381, "step": 33 }, { "completion_length": 614.5416564941406, "epoch": 2.0, "grad_norm": 0.4288389612791824, "kl": 0.013702392578125, "learning_rate": 9.999580424429159e-07, "loss": 0.0, "reward": 0.40492936223745346, "reward_std": 0.2590707167983055, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333358168602, "rewards/thinker_reward_func": 0.0979849323630333, "step": 34 }, { "completion_length": 605.0972290039062, "epoch": 2.0588235294117645, "grad_norm": 0.4085383600942031, "kl": 0.0184326171875, "learning_rate": 9.99934441832816e-07, "loss": 0.0001, "reward": 0.35585635900497437, "reward_std": 0.16349926963448524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1562500074505806, "rewards/thinker_reward_func": 0.09960635751485825, "step": 35 }, { "completion_length": 647.9583435058594, "epoch": 2.1176470588235294, "grad_norm": 0.39957223752948406, "kl": 0.01995849609375, "learning_rate": 9.999055971469863e-07, "loss": 0.0001, "reward": 0.5819444805383682, "reward_std": 0.3510873466730118, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444328546524, "rewards/thinker_reward_func": 0.10000000894069672, "step": 36 }, { "completion_length": 712.1527709960938, "epoch": 2.176470588235294, "grad_norm": 0.43035682554603394, "kl": 0.0181884765625, "learning_rate": 9.998715086879935e-07, "loss": 0.0, "reward": 0.3518674746155739, "reward_std": 0.10940386261790991, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.09839524701237679, "step": 37 }, { "completion_length": 697.6111145019531, "epoch": 2.235294117647059, "grad_norm": 0.35797272613726705, "kl": 0.0201416015625, "learning_rate": 9.9983217681341e-07, "loss": 0.0001, "reward": 0.26934922486543655, "reward_std": 0.15457768738269806, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0694444477558136, "rewards/thinker_reward_func": 0.09990477189421654, "step": 38 }, { "completion_length": 648.7916870117188, "epoch": 2.2941176470588234, "grad_norm": 0.394737226125212, "kl": 0.029541015625, "learning_rate": 9.997876019358083e-07, "loss": 0.0001, "reward": 0.21717777848243713, "reward_std": 0.060597628660616465, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.09981667622923851, "step": 39 }, { "completion_length": 704.75, "epoch": 2.3529411764705883, "grad_norm": 0.41083466644473565, "kl": 0.0279541015625, "learning_rate": 9.997377845227574e-07, "loss": 0.0, "reward": 0.6645524203777313, "reward_std": 0.3215784430503845, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.4687500074505806, "rewards/thinker_reward_func": 0.09858016669750214, "step": 40 }, { "completion_length": 728.7222290039062, "epoch": 2.411764705882353, "grad_norm": 0.3655695162534301, "kl": 0.02679443359375, "learning_rate": 9.996827250968189e-07, "loss": 0.0, "reward": 0.47220082581043243, "reward_std": 0.23376674950122833, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.2777777761220932, "rewards/thinker_reward_func": 0.0985896959900856, "step": 41 }, { "completion_length": 728.6944580078125, "epoch": 2.4705882352941178, "grad_norm": 0.4933028845366201, "kl": 0.02685546875, "learning_rate": 9.996224242355397e-07, "loss": 0.0, "reward": 0.43467937409877777, "reward_std": 0.39883507043123245, "rewards/format_reward_func": 0.09444445371627808, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.09717937186360359, "step": 42 }, { "completion_length": 752.4305419921875, "epoch": 2.5294117647058822, "grad_norm": 0.398435195219813, "kl": 0.02899169921875, "learning_rate": 9.995568825714478e-07, "loss": 0.0002, "reward": 0.5055555999279022, "reward_std": 0.2514152005314827, "rewards/format_reward_func": 0.09444445744156837, "rewards/solution_reward_func": 0.3124999888241291, "rewards/thinker_reward_func": 0.09861112385988235, "step": 43 }, { "completion_length": 686.3472290039062, "epoch": 2.588235294117647, "grad_norm": 0.4072403147117123, "kl": 0.0338134765625, "learning_rate": 9.994861007920439e-07, "loss": 0.0001, "reward": 0.5284722447395325, "reward_std": 0.2548440806567669, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.329861119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 44 }, { "completion_length": 773.1666564941406, "epoch": 2.6470588235294117, "grad_norm": 0.24478664979472278, "kl": 0.030029296875, "learning_rate": 9.994100796397953e-07, "loss": 0.0001, "reward": 0.19583334028720856, "reward_std": 0.011298743076622486, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 45 }, { "completion_length": 775.4305725097656, "epoch": 2.7058823529411766, "grad_norm": 0.3001129545186208, "kl": 0.032958984375, "learning_rate": 9.993288199121282e-07, "loss": 0.0001, "reward": 0.17916666716337204, "reward_std": 0.04454982839524746, "rewards/format_reward_func": 0.08611112087965012, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09305556491017342, "step": 46 }, { "completion_length": 719.8194274902344, "epoch": 2.764705882352941, "grad_norm": 0.1977839589382364, "kl": 0.0380859375, "learning_rate": 9.992423224614183e-07, "loss": 0.0003, "reward": 0.5645158961415291, "reward_std": 0.09418189525604248, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.09993255138397217, "step": 47 }, { "completion_length": 765.8194580078125, "epoch": 2.8235294117647056, "grad_norm": 0.26333450938849334, "kl": 0.0361328125, "learning_rate": 9.991505881949836e-07, "loss": 0.0001, "reward": 0.3854167014360428, "reward_std": 0.07938566524535418, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.097222238779068, "step": 48 }, { "completion_length": 720.9166870117188, "epoch": 2.8823529411764706, "grad_norm": 0.35416188797563314, "kl": 0.03558349609375, "learning_rate": 9.990536180750723e-07, "loss": 0.0001, "reward": 0.34363653510808945, "reward_std": 0.11070389486849308, "rewards/format_reward_func": 0.09305556118488312, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.09433095902204514, "step": 49 }, { "completion_length": 701.375, "epoch": 2.9411764705882355, "grad_norm": 0.3135310613469125, "kl": 0.0462646484375, "learning_rate": 9.989514131188558e-07, "loss": 0.0003, "reward": 0.8569445013999939, "reward_std": 0.28638995438814163, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6597222089767456, "rewards/thinker_reward_func": 0.09861112385988235, "step": 50 }, { "completion_length": 745.6388854980469, "epoch": 3.0, "grad_norm": 0.4251267838750491, "kl": 0.035888671875, "learning_rate": 9.988439743984152e-07, "loss": 0.0, "reward": 0.472222276031971, "reward_std": 0.23791787028312683, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.2777777910232544, "rewards/thinker_reward_func": 0.09861112385988235, "step": 51 }, { "completion_length": 751.0833435058594, "epoch": 3.0588235294117645, "grad_norm": 0.2773953472350835, "kl": 0.04150390625, "learning_rate": 9.987313030407323e-07, "loss": 0.0001, "reward": 0.26527779549360275, "reward_std": 0.1614770144224167, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.0694444440305233, "rewards/thinker_reward_func": 0.10000000894069672, "step": 52 }, { "completion_length": 747.7916870117188, "epoch": 3.1176470588235294, "grad_norm": 0.3384643875245562, "kl": 0.0419921875, "learning_rate": 9.986134002276759e-07, "loss": 0.0001, "reward": 0.31458335369825363, "reward_std": 0.12412398308515549, "rewards/format_reward_func": 0.09444445371627808, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.09861112385988235, "step": 53 }, { "completion_length": 720.7916870117188, "epoch": 3.176470588235294, "grad_norm": 0.36882543725292255, "kl": 0.0360107421875, "learning_rate": 9.98490267195991e-07, "loss": 0.0, "reward": 0.44513891637325287, "reward_std": 0.23386678844690323, "rewards/format_reward_func": 0.09166667982935905, "rewards/solution_reward_func": 0.2604166641831398, "rewards/thinker_reward_func": 0.09305556863546371, "step": 54 }, { "completion_length": 730.0972290039062, "epoch": 3.235294117647059, "grad_norm": 0.33360084798147177, "kl": 0.037841796875, "learning_rate": 9.983619052372847e-07, "loss": 0.0003, "reward": 0.5562500357627869, "reward_std": 0.11368476320058107, "rewards/format_reward_func": 0.09444445744156837, "rewards/solution_reward_func": 0.3645833283662796, "rewards/thinker_reward_func": 0.0972222313284874, "step": 55 }, { "completion_length": 788.013916015625, "epoch": 3.2941176470588234, "grad_norm": 0.38623542514898973, "kl": 0.037353515625, "learning_rate": 9.98228315698013e-07, "loss": 0.0, "reward": 0.5680556297302246, "reward_std": 0.23198091983795166, "rewards/format_reward_func": 0.09166667610406876, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09444445744156837, "step": 56 }, { "completion_length": 728.4166870117188, "epoch": 3.3529411764705883, "grad_norm": 0.4342326053730827, "kl": 0.035400390625, "learning_rate": 9.980894999794678e-07, "loss": 0.0, "reward": 0.5215277969837189, "reward_std": 0.35785046219825745, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.329861119389534, "rewards/thinker_reward_func": 0.09583334997296333, "step": 57 }, { "completion_length": 753.6111145019531, "epoch": 3.411764705882353, "grad_norm": 0.28420293965927146, "kl": 0.0347900390625, "learning_rate": 9.979454595377593e-07, "loss": 0.0001, "reward": 0.32152777910232544, "reward_std": 0.1883704587817192, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1215277835726738, "rewards/thinker_reward_func": 0.10000000894069672, "step": 58 }, { "completion_length": 732.6805725097656, "epoch": 3.4705882352941178, "grad_norm": 0.3495782120767918, "kl": 0.0418701171875, "learning_rate": 9.97796195883804e-07, "loss": 0.0001, "reward": 0.26250002533197403, "reward_std": 0.12663250416517258, "rewards/format_reward_func": 0.09583334624767303, "rewards/solution_reward_func": 0.0694444477558136, "rewards/thinker_reward_func": 0.097222238779068, "step": 59 }, { "completion_length": 705.2222290039062, "epoch": 3.5294117647058822, "grad_norm": 0.29828734041040145, "kl": 0.0386962890625, "learning_rate": 9.97641710583307e-07, "loss": 0.0001, "reward": 0.544322282075882, "reward_std": 0.1639484316110611, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3472222238779068, "rewards/thinker_reward_func": 0.09848890081048012, "step": 60 }, { "completion_length": 721.4444580078125, "epoch": 3.588235294117647, "grad_norm": 0.3510932373447653, "kl": 0.0411376953125, "learning_rate": 9.974820052567459e-07, "loss": 0.0001, "reward": 0.3506944701075554, "reward_std": 0.22282803524285555, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.0972222313284874, "step": 61 }, { "completion_length": 753.513916015625, "epoch": 3.6470588235294117, "grad_norm": 0.23882431688618347, "kl": 0.0340576171875, "learning_rate": 9.973170815793542e-07, "loss": 0.0001, "reward": 0.3840278089046478, "reward_std": 0.22344040125608444, "rewards/format_reward_func": 0.09444445744156837, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.09861112385988235, "step": 62 }, { "completion_length": 679.888916015625, "epoch": 3.7058823529411766, "grad_norm": 0.3804202197386677, "kl": 0.0382080078125, "learning_rate": 9.971469412811032e-07, "loss": 0.0001, "reward": 0.4923016279935837, "reward_std": 0.1725456016138196, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2951388955116272, "rewards/thinker_reward_func": 0.09855160117149353, "step": 63 }, { "completion_length": 714.3611145019531, "epoch": 3.764705882352941, "grad_norm": 0.41161762799647783, "kl": 0.0352783203125, "learning_rate": 9.969715861466839e-07, "loss": 0.0001, "reward": 0.5284698531031609, "reward_std": 0.27010616147890687, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3298611044883728, "rewards/thinker_reward_func": 0.099997628480196, "step": 64 }, { "completion_length": 669.5138854980469, "epoch": 3.8235294117647056, "grad_norm": 0.3881890378584106, "kl": 0.03131103515625, "learning_rate": 9.967910180154888e-07, "loss": 0.0001, "reward": 0.9437499940395355, "reward_std": 0.27113811671733856, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7465277910232544, "rewards/thinker_reward_func": 0.09861112385988235, "step": 65 }, { "completion_length": 758.8194580078125, "epoch": 3.8823529411764706, "grad_norm": 0.23700838065789764, "kl": 0.0330810546875, "learning_rate": 9.96605238781592e-07, "loss": 0.0001, "reward": 0.24930556863546371, "reward_std": 0.14244676008820534, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.0520833358168602, "rewards/thinker_reward_func": 0.10000000894069672, "step": 66 }, { "completion_length": 704.7361145019531, "epoch": 3.9411764705882355, "grad_norm": 0.31818157610033226, "kl": 0.03411865234375, "learning_rate": 9.964142503937305e-07, "loss": 0.0001, "reward": 0.46041668951511383, "reward_std": 0.18837044388055801, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.260416679084301, "rewards/thinker_reward_func": 0.10000000894069672, "step": 67 }, { "completion_length": 661.9444580078125, "epoch": 4.0, "grad_norm": 0.3731972153762077, "kl": 0.03857421875, "learning_rate": 9.96218054855281e-07, "loss": 0.0001, "reward": 0.5791285932064056, "reward_std": 0.13003577664494514, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09857303276658058, "step": 68 }, { "completion_length": 682.6527709960938, "epoch": 4.0588235294117645, "grad_norm": 0.4523101544357719, "kl": 0.03662109375, "learning_rate": 9.960166542242428e-07, "loss": 0.0001, "reward": 0.33319127559661865, "reward_std": 0.20120380818843842, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.1388888880610466, "rewards/thinker_reward_func": 0.09846905991435051, "step": 69 }, { "completion_length": 666.8472290039062, "epoch": 4.117647058823529, "grad_norm": 0.30940517888047325, "kl": 0.036376953125, "learning_rate": 9.958100506132126e-07, "loss": 0.0001, "reward": 0.4256944805383682, "reward_std": 0.19679853320121765, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 70 }, { "completion_length": 650.8472290039062, "epoch": 4.176470588235294, "grad_norm": 0.4130033107174288, "kl": 0.038818359375, "learning_rate": 9.955982461893646e-07, "loss": 0.0001, "reward": 0.5097222477197647, "reward_std": 0.2958686426281929, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3125000149011612, "rewards/thinker_reward_func": 0.09861112385988235, "step": 71 }, { "completion_length": 688.3472290039062, "epoch": 4.235294117647059, "grad_norm": 0.3491146575685857, "kl": 0.033203125, "learning_rate": 9.953812431744274e-07, "loss": 0.0001, "reward": 0.6858817711472511, "reward_std": 0.24841637909412384, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4861111044883728, "rewards/thinker_reward_func": 0.09977064654231071, "step": 72 }, { "completion_length": 711.7083129882812, "epoch": 4.294117647058823, "grad_norm": 0.2781308631398239, "kl": 0.0296630859375, "learning_rate": 9.951590438446596e-07, "loss": 0.0001, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 73 }, { "completion_length": 618.5833435058594, "epoch": 4.352941176470588, "grad_norm": 0.4212066761333368, "kl": 0.03271484375, "learning_rate": 9.94931650530827e-07, "loss": 0.0001, "reward": 0.4427468478679657, "reward_std": 0.08203668083297089, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.09969127923250198, "step": 74 }, { "completion_length": 630.4722290039062, "epoch": 4.411764705882353, "grad_norm": 0.28943290892524487, "kl": 0.02972412109375, "learning_rate": 9.946990656181779e-07, "loss": 0.0005, "reward": 0.894057959318161, "reward_std": 0.1673007868230343, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6944444477558136, "rewards/thinker_reward_func": 0.0996134988963604, "step": 75 }, { "completion_length": 737.3611145019531, "epoch": 4.470588235294118, "grad_norm": 0.275707479053098, "kl": 0.02587890625, "learning_rate": 9.94461291546418e-07, "loss": 0.0003, "reward": 0.4937499985098839, "reward_std": 0.11208804929628968, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2951388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 76 }, { "completion_length": 649.25, "epoch": 4.529411764705882, "grad_norm": 0.5004467299432704, "kl": 0.02734375, "learning_rate": 9.942183308096853e-07, "loss": 0.0, "reward": 0.7552357614040375, "reward_std": 0.23633567988872528, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.09968017041683197, "step": 77 }, { "completion_length": 685.2361145019531, "epoch": 4.588235294117647, "grad_norm": 0.21364484683683738, "kl": 0.028076171875, "learning_rate": 9.93970185956522e-07, "loss": 0.0001, "reward": 0.21733174473047256, "reward_std": 0.06014999374747276, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.09997064620256424, "step": 78 }, { "completion_length": 695.6527709960938, "epoch": 4.647058823529412, "grad_norm": 0.33114860844846267, "kl": 0.02996826171875, "learning_rate": 9.937168595898508e-07, "loss": 0.0001, "reward": 0.5801206827163696, "reward_std": 0.08673680061474442, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09956508874893188, "step": 79 }, { "completion_length": 665.2222290039062, "epoch": 4.705882352941177, "grad_norm": 0.22757475831212048, "kl": 0.02728271484375, "learning_rate": 9.934583543669453e-07, "loss": 0.0001, "reward": 0.30416668206453323, "reward_std": 0.1087985411286354, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1041666641831398, "rewards/thinker_reward_func": 0.10000000894069672, "step": 80 }, { "completion_length": 702.8333435058594, "epoch": 4.764705882352941, "grad_norm": 0.34350824848982153, "kl": 0.02581787109375, "learning_rate": 9.93194672999403e-07, "loss": 0.0001, "reward": 0.31863097101449966, "reward_std": 0.11731169736594893, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.0984920784831047, "step": 81 }, { "completion_length": 667.138916015625, "epoch": 4.823529411764706, "grad_norm": 0.39883942500783687, "kl": 0.02972412109375, "learning_rate": 9.929258182531166e-07, "loss": 0.0001, "reward": 0.6283214688301086, "reward_std": 0.22314860671758652, "rewards/format_reward_func": 0.09583334624767303, "rewards/solution_reward_func": 0.4340277686715126, "rewards/thinker_reward_func": 0.09846032783389091, "step": 82 }, { "completion_length": 656.3055725097656, "epoch": 4.882352941176471, "grad_norm": 0.2242601107753345, "kl": 0.0289306640625, "learning_rate": 9.926517929482452e-07, "loss": 0.0001, "reward": 0.2666666731238365, "reward_std": 0.10504928976297379, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0694444477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 83 }, { "completion_length": 661.4583435058594, "epoch": 4.9411764705882355, "grad_norm": 0.3519741103575541, "kl": 0.0291748046875, "learning_rate": 9.923725999591846e-07, "loss": 0.0003, "reward": 0.738193690776825, "reward_std": 0.17531868081027824, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5381944328546524, "rewards/thinker_reward_func": 0.09999921545386314, "step": 84 }, { "completion_length": 653.2638854980469, "epoch": 5.0, "grad_norm": 0.40194573197009204, "kl": 0.02935791015625, "learning_rate": 9.92088242214537e-07, "loss": 0.0003, "reward": 0.6860658973455429, "reward_std": 0.22289105877280235, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4861111119389534, "rewards/thinker_reward_func": 0.0999547690153122, "step": 85 }, { "completion_length": 671.7083129882812, "epoch": 5.0588235294117645, "grad_norm": 0.3543905633415421, "kl": 0.02911376953125, "learning_rate": 9.91798722697081e-07, "loss": 0.0001, "reward": 0.1954309567809105, "reward_std": 0.015423934441059828, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09820874407887459, "step": 86 }, { "completion_length": 695.3333435058594, "epoch": 5.117647058823529, "grad_norm": 0.0012182056535854718, "kl": 0.03045654296875, "learning_rate": 9.915040444437388e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 87 }, { "completion_length": 708.5972290039062, "epoch": 5.176470588235294, "grad_norm": 0.24238854467858215, "kl": 0.02813720703125, "learning_rate": 9.912042105455461e-07, "loss": 0.0003, "reward": 0.5298611223697662, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3298611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 88 }, { "completion_length": 690.2222290039062, "epoch": 5.235294117647059, "grad_norm": 0.267004287874683, "kl": 0.03277587890625, "learning_rate": 9.908992241476186e-07, "loss": 0.0003, "reward": 0.651342898607254, "reward_std": 0.08125310840841848, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.09995397552847862, "step": 89 }, { "completion_length": 619.2361145019531, "epoch": 5.294117647058823, "grad_norm": 0.18179900865251175, "kl": 0.0390625, "learning_rate": 9.905890884491194e-07, "loss": 0.0001, "reward": 0.40813571959733963, "reward_std": 0.0006522751646116376, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09980239346623421, "step": 90 }, { "completion_length": 662.875, "epoch": 5.352941176470588, "grad_norm": 0.45877287529584226, "kl": 0.03564453125, "learning_rate": 9.902738067032253e-07, "loss": 0.0, "reward": 0.6844961047172546, "reward_std": 0.36846765875816345, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4861111044883728, "rewards/thinker_reward_func": 0.0997738242149353, "step": 91 }, { "completion_length": 701.4166564941406, "epoch": 5.411764705882353, "grad_norm": 0.2891563659183162, "kl": 0.0345458984375, "learning_rate": 9.899533822170921e-07, "loss": 0.0003, "reward": 0.6109214425086975, "reward_std": 0.008863179478794336, "rewards/format_reward_func": 0.09444445371627808, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09981032833456993, "step": 92 }, { "completion_length": 666.5694580078125, "epoch": 5.470588235294118, "grad_norm": 0.3090076950686263, "kl": 0.0433349609375, "learning_rate": 9.896278183518216e-07, "loss": 0.0001, "reward": 0.42569447308778763, "reward_std": 0.17531593143939972, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 93 }, { "completion_length": 724.6944580078125, "epoch": 5.529411764705882, "grad_norm": 0.49288359019812034, "kl": 0.0382080078125, "learning_rate": 9.892971185224244e-07, "loss": 0.0, "reward": 0.5617897212505341, "reward_std": 0.22415290772914886, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.09998413547873497, "step": 94 }, { "completion_length": 676.875, "epoch": 5.588235294117647, "grad_norm": 0.4296819989744239, "kl": 0.0369873046875, "learning_rate": 9.889612861977853e-07, "loss": 0.0003, "reward": 0.6658960580825806, "reward_std": 0.15112073719501495, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.46875, "rewards/thinker_reward_func": 0.09853493422269821, "step": 95 }, { "completion_length": 814.8472290039062, "epoch": 5.647058823529412, "grad_norm": 0.37420765964739117, "kl": 0.031494140625, "learning_rate": 9.886203249006264e-07, "loss": 0.0001, "reward": 0.36944446712732315, "reward_std": 0.16240034997463226, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.1736111119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 96 }, { "completion_length": 722.5833435058594, "epoch": 5.705882352941177, "grad_norm": 0.27076312723223045, "kl": 0.0419921875, "learning_rate": 9.882742382074706e-07, "loss": 0.0003, "reward": 0.5402778312563896, "reward_std": 0.17621736973524094, "rewards/format_reward_func": 0.09444445371627808, "rewards/solution_reward_func": 0.3472222164273262, "rewards/thinker_reward_func": 0.09861112385988235, "step": 97 }, { "completion_length": 744.5833435058594, "epoch": 5.764705882352941, "grad_norm": 0.2943806394564017, "kl": 0.0416259765625, "learning_rate": 9.879230297486034e-07, "loss": 0.0001, "reward": 0.2493055760860443, "reward_std": 0.10384479630738497, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.0520833320915699, "rewards/thinker_reward_func": 0.10000000894069672, "step": 98 }, { "completion_length": 748.5416870117188, "epoch": 5.823529411764706, "grad_norm": 0.36149495509035245, "kl": 0.0360107421875, "learning_rate": 9.875667032080352e-07, "loss": 0.0003, "reward": 0.5951389223337173, "reward_std": 0.15159399807453156, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.399305559694767, "rewards/thinker_reward_func": 0.09861112385988235, "step": 99 }, { "completion_length": 742.3055419921875, "epoch": 5.882352941176471, "grad_norm": 0.3092731111615846, "kl": 0.0419921875, "learning_rate": 9.872052623234631e-07, "loss": 0.0001, "reward": 0.3201388940215111, "reward_std": 0.17375044524669647, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 100 }, { "completion_length": 741.0694580078125, "epoch": 5.9411764705882355, "grad_norm": 0.38946089699432124, "kl": 0.04052734375, "learning_rate": 9.868387108862305e-07, "loss": 0.0001, "reward": 0.6500000059604645, "reward_std": 0.2533223628997803, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4513889029622078, "rewards/thinker_reward_func": 0.10000000894069672, "step": 101 }, { "completion_length": 695.9027709960938, "epoch": 6.0, "grad_norm": 0.40015069809129844, "kl": 0.041015625, "learning_rate": 9.86467052741289e-07, "loss": 0.0003, "reward": 0.7006944715976715, "reward_std": 0.1754266694188118, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.5034722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 102 }, { "completion_length": 723.0833435058594, "epoch": 6.0588235294117645, "grad_norm": 0.3441877266257983, "kl": 0.0428466796875, "learning_rate": 9.860902917871566e-07, "loss": 0.0001, "reward": 0.5805555880069733, "reward_std": 0.2883881404995918, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 103 }, { "completion_length": 726.9305725097656, "epoch": 6.117647058823529, "grad_norm": 0.36194926480231226, "kl": 0.039794921875, "learning_rate": 9.85708431975877e-07, "loss": 0.0001, "reward": 0.2986111268401146, "reward_std": 0.20383387804031372, "rewards/format_reward_func": 0.09583334624767303, "rewards/solution_reward_func": 0.1041666641831398, "rewards/thinker_reward_func": 0.09861112385988235, "step": 104 }, { "completion_length": 700.9305725097656, "epoch": 6.176470588235294, "grad_norm": 0.31722026637857087, "kl": 0.0439453125, "learning_rate": 9.853214773129795e-07, "loss": 0.0003, "reward": 0.7715278267860413, "reward_std": 0.14604554697871208, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 105 }, { "completion_length": 770.1944580078125, "epoch": 6.235294117647059, "grad_norm": 0.28161132826424157, "kl": 0.04443359375, "learning_rate": 9.84929431857435e-07, "loss": 0.0001, "reward": 0.19166667014360428, "reward_std": 0.020478153601288795, "rewards/format_reward_func": 0.09305556491017342, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09861112385988235, "step": 106 }, { "completion_length": 779.1528015136719, "epoch": 6.294117647058823, "grad_norm": 0.3319595861309673, "kl": 0.044677734375, "learning_rate": 9.845322997216151e-07, "loss": 0.0001, "reward": 0.3652777746319771, "reward_std": 0.09704753756523132, "rewards/format_reward_func": 0.09305556118488312, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.09861112385988235, "step": 107 }, { "completion_length": 688.0555419921875, "epoch": 6.352941176470588, "grad_norm": 0.4334352548380135, "kl": 0.04638671875, "learning_rate": 9.841300850712478e-07, "loss": 0.0003, "reward": 0.5993055999279022, "reward_std": 0.24381054192781448, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 108 }, { "completion_length": 699.1111145019531, "epoch": 6.411764705882353, "grad_norm": 0.4010082289955278, "kl": 0.044189453125, "learning_rate": 9.837227921253745e-07, "loss": 0.0003, "reward": 0.628424659371376, "reward_std": 0.15268938709050417, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.4340277910232544, "rewards/thinker_reward_func": 0.09717461839318275, "step": 109 }, { "completion_length": 777.3472290039062, "epoch": 6.470588235294118, "grad_norm": 0.4665084894779671, "kl": 0.0374755859375, "learning_rate": 9.833104251563055e-07, "loss": 0.0, "reward": 0.44928018003702164, "reward_std": 0.24663562327623367, "rewards/format_reward_func": 0.09444445744156837, "rewards/solution_reward_func": 0.2604166641831398, "rewards/thinker_reward_func": 0.09441905841231346, "step": 110 }, { "completion_length": 719.3055725097656, "epoch": 6.529411764705882, "grad_norm": 0.2673381941190818, "kl": 0.0423583984375, "learning_rate": 9.828929884895752e-07, "loss": 0.0003, "reward": 0.7354166805744171, "reward_std": 0.18209128826856613, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5381944477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 111 }, { "completion_length": 711.5833435058594, "epoch": 6.588235294117647, "grad_norm": 0.17659885295447433, "kl": 0.0531005859375, "learning_rate": 9.824704865038967e-07, "loss": 0.0002, "reward": 0.35208336263895035, "reward_std": 0.10200794786214828, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.09861112385988235, "step": 112 }, { "completion_length": 706.6666870117188, "epoch": 6.647058823529412, "grad_norm": 0.45199386977470857, "kl": 0.0440673828125, "learning_rate": 9.820429236311158e-07, "loss": 0.0001, "reward": 0.43735477328300476, "reward_std": 0.24384743720293045, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.09707700088620186, "step": 113 }, { "completion_length": 659.6666564941406, "epoch": 6.705882352941177, "grad_norm": 0.3479579484837783, "kl": 0.0419921875, "learning_rate": 9.816103043561648e-07, "loss": 0.0001, "reward": 0.3213087394833565, "reward_std": 0.10786611965158954, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.09978096187114716, "step": 114 }, { "completion_length": 757.0, "epoch": 6.764705882352941, "grad_norm": 0.27814897665241, "kl": 0.0411376953125, "learning_rate": 9.81172633217015e-07, "loss": 0.0001, "reward": 0.4590238481760025, "reward_std": 0.19168715924024582, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.260416679084301, "rewards/thinker_reward_func": 0.09999604150652885, "step": 115 }, { "completion_length": 689.875, "epoch": 6.823529411764706, "grad_norm": 0.30265106862607477, "kl": 0.04541015625, "learning_rate": 9.8072991480463e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.11219874769449234, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0694444477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 116 }, { "completion_length": 671.5972290039062, "epoch": 6.882352941176471, "grad_norm": 0.46378658613755586, "kl": 0.0498046875, "learning_rate": 9.80282153762916e-07, "loss": 0.0001, "reward": 0.5804207175970078, "reward_std": 0.30307846516370773, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09986509010195732, "step": 117 }, { "completion_length": 702.1528015136719, "epoch": 6.9411764705882355, "grad_norm": 0.41933210621861866, "kl": 0.048095703125, "learning_rate": 9.798293547886746e-07, "loss": 0.0001, "reward": 0.3722222372889519, "reward_std": 0.0859048985876143, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 118 }, { "completion_length": 633.2222290039062, "epoch": 7.0, "grad_norm": 0.3936053979487867, "kl": 0.0494384765625, "learning_rate": 9.793715226315528e-07, "loss": 0.0003, "reward": 0.7367214858531952, "reward_std": 0.24438317865133286, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5381944477558136, "rewards/thinker_reward_func": 0.09991588070988655, "step": 119 }, { "completion_length": 656.3194580078125, "epoch": 7.0588235294117645, "grad_norm": 0.4536334326063857, "kl": 0.0472412109375, "learning_rate": 9.789086620939935e-07, "loss": 0.0003, "reward": 0.6673611551523209, "reward_std": 0.1675281524658203, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.46875, "rewards/thinker_reward_func": 0.10000000894069672, "step": 120 }, { "completion_length": 700.2778015136719, "epoch": 7.117647058823529, "grad_norm": 0.42287079127010185, "kl": 0.0428466796875, "learning_rate": 9.784407780311845e-07, "loss": 0.0001, "reward": 0.42282386124134064, "reward_std": 0.2748475521802902, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.0985182635486126, "step": 121 }, { "completion_length": 718.5416564941406, "epoch": 7.176470588235294, "grad_norm": 0.4227041306049192, "kl": 0.03955078125, "learning_rate": 9.77967875351008e-07, "loss": 0.0001, "reward": 0.647222250699997, "reward_std": 0.2667815089225769, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.09861112385988235, "step": 122 }, { "completion_length": 721.3472290039062, "epoch": 7.235294117647059, "grad_norm": 0.3584138878432119, "kl": 0.0372314453125, "learning_rate": 9.774899590139897e-07, "loss": 0.0001, "reward": 0.30258651822805405, "reward_std": 0.11427235836163163, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1041666641831398, "rewards/thinker_reward_func": 0.09980873763561249, "step": 123 }, { "completion_length": 714.6944580078125, "epoch": 7.294117647058823, "grad_norm": 0.22947928340360682, "kl": 0.0460205078125, "learning_rate": 9.770070340332456e-07, "loss": 0.0001, "reward": 0.38680558651685715, "reward_std": 0.07457441324368119, "rewards/format_reward_func": 0.097222238779068, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.09861112385988235, "step": 124 }, { "completion_length": 661.0555419921875, "epoch": 7.352941176470588, "grad_norm": 0.4232842811117562, "kl": 0.0509033203125, "learning_rate": 9.765191054744304e-07, "loss": 0.0001, "reward": 0.6846134960651398, "reward_std": 0.308729350566864, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4861111119389534, "rewards/thinker_reward_func": 0.09989127889275551, "step": 125 }, { "completion_length": 691.6666564941406, "epoch": 7.411764705882353, "grad_norm": 0.3524637411337013, "kl": 0.044921875, "learning_rate": 9.760261784556838e-07, "loss": 0.0001, "reward": 0.4750000238418579, "reward_std": 0.21163997799158096, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.2777777910232544, "rewards/thinker_reward_func": 0.10000000894069672, "step": 126 }, { "completion_length": 650.25, "epoch": 7.470588235294118, "grad_norm": 0.4444689056848215, "kl": 0.04345703125, "learning_rate": 9.755282581475767e-07, "loss": 0.0001, "reward": 0.6458095759153366, "reward_std": 0.2049274630844593, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.4513888843357563, "rewards/thinker_reward_func": 0.09719842672348022, "step": 127 }, { "completion_length": 694.388916015625, "epoch": 7.529411764705882, "grad_norm": 0.21171268963546044, "kl": 0.0419921875, "learning_rate": 9.750253497730579e-07, "loss": 0.0001, "reward": 0.37361112982034683, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 128 }, { "completion_length": 694.8888854980469, "epoch": 7.588235294117647, "grad_norm": 0.29504960696556964, "kl": 0.046630859375, "learning_rate": 9.745174586073982e-07, "loss": 0.0001, "reward": 0.3561079576611519, "reward_std": 0.09471442550420761, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.09985794499516487, "step": 129 }, { "completion_length": 696.2916870117188, "epoch": 7.647058823529412, "grad_norm": 0.24226019842453053, "kl": 0.0430908203125, "learning_rate": 9.740045899781352e-07, "loss": 0.0003, "reward": 0.5791666954755783, "reward_std": 0.09071615058928728, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 130 }, { "completion_length": 658.125, "epoch": 7.705882352941177, "grad_norm": 0.2757669835420144, "kl": 0.0567626953125, "learning_rate": 9.734867492650186e-07, "loss": 0.0003, "reward": 0.508333370089531, "reward_std": 0.19147787988185883, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.3124999888241291, "rewards/thinker_reward_func": 0.09861112385988235, "step": 131 }, { "completion_length": 696.1944580078125, "epoch": 7.764705882352941, "grad_norm": 0.347062179925441, "kl": 0.043212890625, "learning_rate": 9.729639418999522e-07, "loss": 0.0001, "reward": 0.40818095952272415, "reward_std": 0.21436643600463867, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09984762594103813, "step": 132 }, { "completion_length": 666.5138854980469, "epoch": 7.823529411764706, "grad_norm": 0.27759861069928754, "kl": 0.051025390625, "learning_rate": 9.72436173366938e-07, "loss": 0.0001, "reward": 0.3908531814813614, "reward_std": 0.06035835844522808, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.09988096356391907, "step": 133 }, { "completion_length": 682.4722290039062, "epoch": 7.882352941176471, "grad_norm": 0.3117950219647706, "kl": 0.0447998046875, "learning_rate": 9.71903449202018e-07, "loss": 0.0003, "reward": 0.9783167243003845, "reward_std": 0.14826064556837082, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.09984445571899414, "step": 134 }, { "completion_length": 683.4028015136719, "epoch": 7.9411764705882355, "grad_norm": 0.3872213877978456, "kl": 0.0465087890625, "learning_rate": 9.713657749932171e-07, "loss": 0.0001, "reward": 0.31858333945274353, "reward_std": 0.11709045711904764, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1215277761220932, "rewards/thinker_reward_func": 0.09844445437192917, "step": 135 }, { "completion_length": 714.8333435058594, "epoch": 8.0, "grad_norm": 0.4037447337238601, "kl": 0.04443359375, "learning_rate": 9.708231563804828e-07, "loss": 0.0001, "reward": 0.3881944492459297, "reward_std": 0.15085680782794952, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.09861112385988235, "step": 136 }, { "completion_length": 689.8888854980469, "epoch": 8.058823529411764, "grad_norm": 0.4403380430299111, "kl": 0.0469970703125, "learning_rate": 9.702755990556276e-07, "loss": 0.0001, "reward": 0.5645143091678619, "reward_std": 0.14147348329424858, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.09993096068501472, "step": 137 }, { "completion_length": 671.263916015625, "epoch": 8.117647058823529, "grad_norm": 0.3448442259508587, "kl": 0.049072265625, "learning_rate": 9.697231087622689e-07, "loss": 0.0003, "reward": 0.7555555701255798, "reward_std": 0.16218729317188263, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.10000000894069672, "step": 138 }, { "completion_length": 664.2222290039062, "epoch": 8.176470588235293, "grad_norm": 0.38687099031600203, "kl": 0.0543212890625, "learning_rate": 9.691656912957684e-07, "loss": 0.0001, "reward": 0.40555399656295776, "reward_std": 0.2847355753183365, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333358168602, "rewards/thinker_reward_func": 0.09860953316092491, "step": 139 }, { "completion_length": 746.5, "epoch": 8.235294117647058, "grad_norm": 0.31607135891359117, "kl": 0.0595703125, "learning_rate": 9.686033525031719e-07, "loss": 0.0001, "reward": 0.21597222983837128, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 140 }, { "completion_length": 685.7361145019531, "epoch": 8.294117647058824, "grad_norm": 0.34780415644741874, "kl": 0.052490234375, "learning_rate": 9.680360982831466e-07, "loss": 0.0001, "reward": 0.5472222939133644, "reward_std": 0.15436294674873352, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 141 }, { "completion_length": 692.6944580078125, "epoch": 8.352941176470589, "grad_norm": 0.4420624177021849, "kl": 0.0572509765625, "learning_rate": 9.674639345859212e-07, "loss": 0.0003, "reward": 1.0145833194255829, "reward_std": 0.3685266822576523, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.815972238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 142 }, { "completion_length": 633.8611145019531, "epoch": 8.411764705882353, "grad_norm": 0.40348431651390637, "kl": 0.06689453125, "learning_rate": 9.668868674132222e-07, "loss": 0.0003, "reward": 0.8423087894916534, "reward_std": 0.19697998228366487, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.09994763135910034, "step": 143 }, { "completion_length": 700.4722290039062, "epoch": 8.470588235294118, "grad_norm": 0.2875913125462575, "kl": 0.056640625, "learning_rate": 9.663049028182111e-07, "loss": 0.0001, "reward": 0.44305558502674103, "reward_std": 0.15436295047402382, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2430555559694767, "rewards/thinker_reward_func": 0.10000000894069672, "step": 144 }, { "completion_length": 672.5, "epoch": 8.529411764705882, "grad_norm": 0.4030550715643627, "kl": 0.058349609375, "learning_rate": 9.657180469054212e-07, "loss": 0.0003, "reward": 0.5444444566965103, "reward_std": 0.17703995574265718, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.3472222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 145 }, { "completion_length": 708.5, "epoch": 8.588235294117647, "grad_norm": 0.4167413839840332, "kl": 0.0562744140625, "learning_rate": 9.651263058306932e-07, "loss": 0.0001, "reward": 0.37222225219011307, "reward_std": 0.2870337665081024, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 146 }, { "completion_length": 696.3472290039062, "epoch": 8.647058823529411, "grad_norm": 0.15279973166891359, "kl": 0.056640625, "learning_rate": 9.645296858011107e-07, "loss": 0.0005, "reward": 0.6138889193534851, "reward_std": 0.009622504003345966, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09861112385988235, "step": 147 }, { "completion_length": 686.9861145019531, "epoch": 8.705882352941176, "grad_norm": 0.34095418467206795, "kl": 0.0543212890625, "learning_rate": 9.63928193074936e-07, "loss": 0.0001, "reward": 0.5993055701255798, "reward_std": 0.18042195960879326, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 148 }, { "completion_length": 746.6805725097656, "epoch": 8.764705882352942, "grad_norm": 0.24518066748325146, "kl": 0.052978515625, "learning_rate": 9.633218339615432e-07, "loss": 0.0003, "reward": 0.40416669845581055, "reward_std": 0.011298743076622486, "rewards/format_reward_func": 0.09583334624767303, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 149 }, { "completion_length": 744.5833435058594, "epoch": 8.823529411764707, "grad_norm": 0.26367920115431825, "kl": 0.0489501953125, "learning_rate": 9.62710614821352e-07, "loss": 0.0001, "reward": 0.19722222536802292, "reward_std": 0.009622504003345966, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 150 }, { "completion_length": 649.7638854980469, "epoch": 8.882352941176471, "grad_norm": 0.37941482030832374, "kl": 0.060302734375, "learning_rate": 9.620945420657623e-07, "loss": 0.0001, "reward": 0.37361112982034683, "reward_std": 0.15436296164989471, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 151 }, { "completion_length": 638.3194580078125, "epoch": 8.941176470588236, "grad_norm": 0.4762396530681182, "kl": 0.0589599609375, "learning_rate": 9.61473622157086e-07, "loss": 0.0002, "reward": 0.8930548131465912, "reward_std": 0.2617532014846802, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6944444477558136, "rewards/thinker_reward_func": 0.09999921545386314, "step": 152 }, { "completion_length": 644.4028015136719, "epoch": 9.0, "grad_norm": 0.40877776646163283, "kl": 0.0614013671875, "learning_rate": 9.608478616084782e-07, "loss": 0.0001, "reward": 0.5298611521720886, "reward_std": 0.17531593888998032, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.329861119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 153 }, { "completion_length": 655.3888854980469, "epoch": 9.058823529411764, "grad_norm": 0.3338174317135874, "kl": 0.0621337890625, "learning_rate": 9.60217266983872e-07, "loss": 0.0003, "reward": 0.6513889208436012, "reward_std": 0.15436294674873352, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4513888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 154 }, { "completion_length": 696.5694580078125, "epoch": 9.117647058823529, "grad_norm": 0.35662767143446183, "kl": 0.05224609375, "learning_rate": 9.59581844897906e-07, "loss": 0.0001, "reward": 0.19839365035295486, "reward_std": 0.00556455651530996, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.0997825488448143, "step": 155 }, { "completion_length": 694.1528015136719, "epoch": 9.176470588235293, "grad_norm": 0.30180005543511373, "kl": 0.0592041015625, "learning_rate": 9.589416020158577e-07, "loss": 0.0001, "reward": 0.23320873826742172, "reward_std": 0.08588497946038842, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0347222238779068, "rewards/thinker_reward_func": 0.09987540543079376, "step": 156 }, { "completion_length": 691.2083435058594, "epoch": 9.235294117647058, "grad_norm": 0.4609010163866311, "kl": 0.0550537109375, "learning_rate": 9.582965450535713e-07, "loss": 0.0001, "reward": 0.5618055760860443, "reward_std": 0.15085680782794952, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.09861112385988235, "step": 157 }, { "completion_length": 655.3055725097656, "epoch": 9.294117647058824, "grad_norm": 0.290059486126005, "kl": 0.06201171875, "learning_rate": 9.576466807773898e-07, "loss": 0.0003, "reward": 0.472222238779068, "reward_std": 0.11092686140909791, "rewards/format_reward_func": 0.09583334997296333, "rewards/solution_reward_func": 0.2777777910232544, "rewards/thinker_reward_func": 0.09861112385988235, "step": 158 }, { "completion_length": 679.6944274902344, "epoch": 9.352941176470589, "grad_norm": 0.4650858338739518, "kl": 0.066650390625, "learning_rate": 9.569920160040814e-07, "loss": 0.0001, "reward": 0.4054063707590103, "reward_std": 0.010035739745944738, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09846191480755806, "step": 159 }, { "completion_length": 657.0972290039062, "epoch": 9.411764705882353, "grad_norm": 0.4590188837463852, "kl": 0.06005859375, "learning_rate": 9.5633255760077e-07, "loss": 0.0003, "reward": 0.8770833909511566, "reward_std": 0.34273338317871094, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6770833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 160 }, { "completion_length": 683.7777709960938, "epoch": 9.470588235294118, "grad_norm": 0.4241100362565528, "kl": 0.061767578125, "learning_rate": 9.556683124848623e-07, "loss": 0.0001, "reward": 0.7034579813480377, "reward_std": 0.24386004358530045, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5034722238779068, "rewards/thinker_reward_func": 0.09998572617769241, "step": 161 }, { "completion_length": 674.8194580078125, "epoch": 9.529411764705882, "grad_norm": 0.2348934919555219, "kl": 0.066162109375, "learning_rate": 9.54999287623975e-07, "loss": 0.0003, "reward": 0.5993056371808052, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 162 }, { "completion_length": 652.9861145019531, "epoch": 9.588235294117647, "grad_norm": 0.002844020546061345, "kl": 0.064453125, "learning_rate": 9.543254900358628e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 163 }, { "completion_length": 685.4444580078125, "epoch": 9.647058823529411, "grad_norm": 0.3953414506641183, "kl": 0.06298828125, "learning_rate": 9.536469267883431e-07, "loss": 0.0001, "reward": 0.5270833745598793, "reward_std": 0.2976010739803314, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.3298611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 164 }, { "completion_length": 679.6805725097656, "epoch": 9.705882352941176, "grad_norm": 0.250693007351783, "kl": 0.0582275390625, "learning_rate": 9.529636049992233e-07, "loss": 0.0002, "reward": 0.35625002533197403, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 165 }, { "completion_length": 677.9583435058594, "epoch": 9.764705882352942, "grad_norm": 0.3403407661370169, "kl": 0.0614013671875, "learning_rate": 9.522755318362259e-07, "loss": 0.0001, "reward": 0.3388524055480957, "reward_std": 0.18993037939071655, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1388888880610466, "rewards/thinker_reward_func": 0.09996350109577179, "step": 166 }, { "completion_length": 712.888916015625, "epoch": 9.823529411764707, "grad_norm": 0.21772865883199882, "kl": 0.058349609375, "learning_rate": 9.515827145169127e-07, "loss": 0.0003, "reward": 0.47777779400348663, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2777777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 167 }, { "completion_length": 706.6528015136719, "epoch": 9.882352941176471, "grad_norm": 0.14547543433601248, "kl": 0.0582275390625, "learning_rate": 9.508851603086092e-07, "loss": 0.0003, "reward": 0.5979166924953461, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 168 }, { "completion_length": 654.8194580078125, "epoch": 9.941176470588236, "grad_norm": 0.22463660992064308, "kl": 0.06591796875, "learning_rate": 9.501828765283294e-07, "loss": 0.0003, "reward": 0.6163706555962563, "reward_std": 0.0006096565048210323, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09970397874712944, "step": 169 }, { "completion_length": 691.5555725097656, "epoch": 10.0, "grad_norm": 0.24522811206862113, "kl": 0.05908203125, "learning_rate": 9.494758705426976e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 170 }, { "completion_length": 681.6944580078125, "epoch": 10.058823529411764, "grad_norm": 0.486235990048234, "kl": 0.067626953125, "learning_rate": 9.487641497678722e-07, "loss": 0.0001, "reward": 0.7513889074325562, "reward_std": 0.336214154958725, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.09861112385988235, "step": 171 }, { "completion_length": 668.25, "epoch": 10.117647058823529, "grad_norm": 0.5526145455835427, "kl": 0.076171875, "learning_rate": 9.480477216694673e-07, "loss": 0.0003, "reward": 0.8222222328186035, "reward_std": 0.21099746972322464, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.09861112385988235, "step": 172 }, { "completion_length": 686.2222290039062, "epoch": 10.176470588235293, "grad_norm": 0.31003915495904066, "kl": 0.0693359375, "learning_rate": 9.473265937624746e-07, "loss": 0.0001, "reward": 0.42569447308778763, "reward_std": 0.19679853320121765, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 173 }, { "completion_length": 663.1666870117188, "epoch": 10.235294117647058, "grad_norm": 0.38723693491551, "kl": 0.066162109375, "learning_rate": 9.466007736111845e-07, "loss": 0.0001, "reward": 0.7527777850627899, "reward_std": 0.21099746599793434, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.09861112385988235, "step": 174 }, { "completion_length": 696.0833129882812, "epoch": 10.294117647058824, "grad_norm": 0.222809544287753, "kl": 0.0606689453125, "learning_rate": 9.458702688291071e-07, "loss": 0.0003, "reward": 0.4055555611848831, "reward_std": 0.009622504934668541, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09861112385988235, "step": 175 }, { "completion_length": 638.9722290039062, "epoch": 10.352941176470589, "grad_norm": 0.21569131521876475, "kl": 0.068603515625, "learning_rate": 9.45135087078892e-07, "loss": 0.0003, "reward": 0.5472222343087196, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 176 }, { "completion_length": 678.5138854980469, "epoch": 10.411764705882353, "grad_norm": 0.22678331930987805, "kl": 0.0579833984375, "learning_rate": 9.443952360722476e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 177 }, { "completion_length": 674.3472290039062, "epoch": 10.470588235294118, "grad_norm": 0.0023846646099873864, "kl": 0.06298828125, "learning_rate": 9.43650723569861e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 178 }, { "completion_length": 643.7638854980469, "epoch": 10.529411764705882, "grad_norm": 0.26339932297264246, "kl": 0.06103515625, "learning_rate": 9.429015573813162e-07, "loss": 0.0005, "reward": 0.806250125169754, "reward_std": 0.0649519027210772, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 179 }, { "completion_length": 726.5833435058594, "epoch": 10.588235294117647, "grad_norm": 0.3871444205719643, "kl": 0.0595703125, "learning_rate": 9.421477453650117e-07, "loss": 0.0001, "reward": 0.26944445818662643, "reward_std": 0.15436294674873352, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0694444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 180 }, { "completion_length": 648.2638854980469, "epoch": 10.647058823529411, "grad_norm": 0.3298247231012932, "kl": 0.067138671875, "learning_rate": 9.413892954280791e-07, "loss": 0.0003, "reward": 0.6166666746139526, "reward_std": 0.2145536094903946, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 181 }, { "completion_length": 667.388916015625, "epoch": 10.705882352941176, "grad_norm": 0.3174388961192076, "kl": 0.0614013671875, "learning_rate": 9.406262155262994e-07, "loss": 0.0003, "reward": 0.7687500417232513, "reward_std": 0.14877208694815636, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 182 }, { "completion_length": 646.8472290039062, "epoch": 10.764705882352942, "grad_norm": 0.004852368264110094, "kl": 0.0640869140625, "learning_rate": 9.398585136640194e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 183 }, { "completion_length": 659.2222290039062, "epoch": 10.823529411764707, "grad_norm": 0.3502780734513927, "kl": 0.058349609375, "learning_rate": 9.390861978940685e-07, "loss": 0.0003, "reward": 0.8769706934690475, "reward_std": 0.26525890827178955, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6770833730697632, "rewards/thinker_reward_func": 0.09988731145858765, "step": 184 }, { "completion_length": 654.0278015136719, "epoch": 10.882352941176471, "grad_norm": 0.0019147605199013923, "kl": 0.059814453125, "learning_rate": 9.383092763176738e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 185 }, { "completion_length": 674.7222290039062, "epoch": 10.941176470588236, "grad_norm": 0.002076064225805522, "kl": 0.0584716796875, "learning_rate": 9.375277570843749e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 186 }, { "completion_length": 601.3611145019531, "epoch": 11.0, "grad_norm": 0.5466446984402868, "kl": 0.07568359375, "learning_rate": 9.367416483919387e-07, "loss": 0.0003, "reward": 0.9795388579368591, "reward_std": 0.1467470425995998, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.09967778623104095, "step": 187 }, { "completion_length": 673.75, "epoch": 11.058823529411764, "grad_norm": 0.28674924418271797, "kl": 0.0623779296875, "learning_rate": 9.359509584862735e-07, "loss": 0.0008, "reward": 1.186805635690689, "reward_std": 0.10384479910135269, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.09861112385988235, "step": 188 }, { "completion_length": 727.1805725097656, "epoch": 11.117647058823529, "grad_norm": 0.2756454442593715, "kl": 0.076416015625, "learning_rate": 9.351556956613422e-07, "loss": 0.0001, "reward": 0.3708333447575569, "reward_std": 0.090716153383255, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.09861112385988235, "step": 189 }, { "completion_length": 662.5278015136719, "epoch": 11.176470588235293, "grad_norm": 0.1678617140427314, "kl": 0.064208984375, "learning_rate": 9.343558682590755e-07, "loss": 0.0005, "reward": 0.6138888970017433, "reward_std": 0.009622504003345966, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09861112385988235, "step": 190 }, { "completion_length": 645.8611145019531, "epoch": 11.235294117647058, "grad_norm": 0.38248625952112364, "kl": 0.06982421875, "learning_rate": 9.335514846692845e-07, "loss": 0.0003, "reward": 0.8250000178813934, "reward_std": 0.30407533049583435, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 191 }, { "completion_length": 604.8611145019531, "epoch": 11.294117647058824, "grad_norm": 0.32192535786167115, "kl": 0.07275390625, "learning_rate": 9.327425533295723e-07, "loss": 0.0003, "reward": 0.5979167520999908, "reward_std": 0.0649519027210772, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 192 }, { "completion_length": 675.9028015136719, "epoch": 11.352941176470589, "grad_norm": 0.2745599070977184, "kl": 0.0654296875, "learning_rate": 9.319290827252459e-07, "loss": 0.0004, "reward": 0.6513539850711823, "reward_std": 0.0811753049492836, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.09996509179472923, "step": 193 }, { "completion_length": 690.2778015136719, "epoch": 11.411764705882353, "grad_norm": 0.0016161717997938922, "kl": 0.0560302734375, "learning_rate": 9.311110813892269e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 194 }, { "completion_length": 677.6666564941406, "epoch": 11.470588235294118, "grad_norm": 0.004124730572438583, "kl": 0.068359375, "learning_rate": 9.302885579019626e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 195 }, { "completion_length": 696.9861450195312, "epoch": 11.529411764705882, "grad_norm": 0.27611657692959946, "kl": 0.0614013671875, "learning_rate": 9.294615208913348e-07, "loss": 0.0003, "reward": 0.6486111581325531, "reward_std": 0.21812323480844498, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 196 }, { "completion_length": 701.5694580078125, "epoch": 11.588235294117647, "grad_norm": 0.27025825837574974, "kl": 0.06689453125, "learning_rate": 9.286299790325706e-07, "loss": 0.0003, "reward": 0.6340278089046478, "reward_std": 0.19679853320121765, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4340277761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 197 }, { "completion_length": 650.6111145019531, "epoch": 11.647058823529411, "grad_norm": 0.39099764489878025, "kl": 0.0662841796875, "learning_rate": 9.277939410481505e-07, "loss": 0.0003, "reward": 0.7555556148290634, "reward_std": 0.2228575423359871, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5555555522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 198 }, { "completion_length": 623.5416870117188, "epoch": 11.705882352941176, "grad_norm": 0.3828559472944619, "kl": 0.068359375, "learning_rate": 9.269534157077176e-07, "loss": 0.0001, "reward": 0.7208333760499954, "reward_std": 0.2228575423359871, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5208333432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 199 }, { "completion_length": 719.5138854980469, "epoch": 11.764705882352942, "grad_norm": 0.2772761826214984, "kl": 0.065673828125, "learning_rate": 9.261084118279846e-07, "loss": 0.0001, "reward": 0.19510556012392044, "reward_std": 0.007870343513786793, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.0992722325026989, "step": 200 }, { "completion_length": 661.2361450195312, "epoch": 11.823529411764707, "grad_norm": 0.22925219175182876, "kl": 0.096923828125, "learning_rate": 9.252589382726425e-07, "loss": 0.0002, "reward": 0.23472223430871964, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0347222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 201 }, { "completion_length": 613.0138854980469, "epoch": 11.882352941176471, "grad_norm": 0.43057249269275444, "kl": 0.0703125, "learning_rate": 9.244050039522672e-07, "loss": 0.0001, "reward": 0.7902778089046478, "reward_std": 0.3558620363473892, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777910232544, "rewards/thinker_reward_func": 0.10000000894069672, "step": 202 }, { "completion_length": 646.6666870117188, "epoch": 11.941176470588236, "grad_norm": 0.16328053880243956, "kl": 0.07421875, "learning_rate": 9.235466178242253e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 203 }, { "completion_length": 674.6805419921875, "epoch": 12.0, "grad_norm": 0.3773973763842648, "kl": 0.07177734375, "learning_rate": 9.226837888925812e-07, "loss": 0.0003, "reward": 0.8597222417593002, "reward_std": 0.29102082550525665, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6597222089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 204 }, { "completion_length": 673.2361145019531, "epoch": 12.058823529411764, "grad_norm": 0.2525755471948894, "kl": 0.0679931640625, "learning_rate": 9.218165262080022e-07, "loss": 0.0003, "reward": 0.5819444730877876, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 205 }, { "completion_length": 702.125, "epoch": 12.117647058823529, "grad_norm": 0.31886443096553807, "kl": 0.067138671875, "learning_rate": 9.209448388676635e-07, "loss": 0.0005, "reward": 0.9784722626209259, "reward_std": 0.15085680782794952, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.09861112385988235, "step": 206 }, { "completion_length": 651.9305725097656, "epoch": 12.176470588235293, "grad_norm": 0.2557191446368213, "kl": 0.079833984375, "learning_rate": 9.200687360151527e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 207 }, { "completion_length": 664.6666870117188, "epoch": 12.235294117647058, "grad_norm": 0.30578274081315815, "kl": 0.0810546875, "learning_rate": 9.191882268403741e-07, "loss": 0.0005, "reward": 0.8569444715976715, "reward_std": 0.19015737110748887, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.6597222089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 208 }, { "completion_length": 632.3611145019531, "epoch": 12.294117647058824, "grad_norm": 0.26833174176514446, "kl": 0.079833984375, "learning_rate": 9.183033205794524e-07, "loss": 0.0001, "reward": 0.5791667103767395, "reward_std": 0.12990381941199303, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 209 }, { "completion_length": 705.2638854980469, "epoch": 12.352941176470589, "grad_norm": 0.22465268986223422, "kl": 0.072998046875, "learning_rate": 9.174140265146355e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 210 }, { "completion_length": 674.9444580078125, "epoch": 12.411764705882353, "grad_norm": 0.33871794097199637, "kl": 0.0859375, "learning_rate": 9.165203539741974e-07, "loss": 0.0001, "reward": 0.42569446563720703, "reward_std": 0.20985303819179535, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944552063942, "rewards/thinker_reward_func": 0.10000000894069672, "step": 211 }, { "completion_length": 715.2777709960938, "epoch": 12.470588235294118, "grad_norm": 0.0020641831989944817, "kl": 0.06689453125, "learning_rate": 9.156223123323404e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 212 }, { "completion_length": 726.388916015625, "epoch": 12.529411764705882, "grad_norm": 0.26314345487200586, "kl": 0.06884765625, "learning_rate": 9.147199110090958e-07, "loss": 0.0003, "reward": 0.772916704416275, "reward_std": 0.14123429730534554, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 213 }, { "completion_length": 649.8472290039062, "epoch": 12.588235294117647, "grad_norm": 0.33819128609143007, "kl": 0.07763671875, "learning_rate": 9.13813159470227e-07, "loss": 0.0003, "reward": 0.6513889133930206, "reward_std": 0.18366988748311996, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4513889104127884, "rewards/thinker_reward_func": 0.10000000894069672, "step": 214 }, { "completion_length": 663.3611145019531, "epoch": 12.647058823529411, "grad_norm": 0.14217933424794837, "kl": 0.078125, "learning_rate": 9.129020672271281e-07, "loss": 0.0005, "reward": 0.7875000536441803, "reward_std": 0.08758113533258438, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 215 }, { "completion_length": 638.1666870117188, "epoch": 12.705882352941176, "grad_norm": 0.28309281339631626, "kl": 0.108642578125, "learning_rate": 9.119866438367262e-07, "loss": 0.0003, "reward": 0.7902778685092926, "reward_std": 0.12028130888938904, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 216 }, { "completion_length": 615.0555725097656, "epoch": 12.764705882352942, "grad_norm": 0.002931608582941426, "kl": 0.07763671875, "learning_rate": 9.11066898901379e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 217 }, { "completion_length": 667.1111145019531, "epoch": 12.823529411764707, "grad_norm": 0.3263018392486828, "kl": 0.085205078125, "learning_rate": 9.101428420687757e-07, "loss": 0.0003, "reward": 0.9451388716697693, "reward_std": 0.2271392047405243, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7465278059244156, "rewards/thinker_reward_func": 0.10000000894069672, "step": 218 }, { "completion_length": 647.1944580078125, "epoch": 12.882352941176471, "grad_norm": 0.2552970436244527, "kl": 0.087158203125, "learning_rate": 9.092144830318357e-07, "loss": 0.0005, "reward": 0.703472226858139, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5034722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 219 }, { "completion_length": 656.8472290039062, "epoch": 12.941176470588236, "grad_norm": 0.2586751484337686, "kl": 0.0732421875, "learning_rate": 9.082818315286054e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 220 }, { "completion_length": 672.0278015136719, "epoch": 13.0, "grad_norm": 0.23015763594282576, "kl": 0.076416015625, "learning_rate": 9.07344897342158e-07, "loss": 0.0002, "reward": 0.33888890594244003, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1388888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 221 }, { "completion_length": 670.0694580078125, "epoch": 13.058823529411764, "grad_norm": 0.2990329312861943, "kl": 0.0751953125, "learning_rate": 9.064036903004899e-07, "loss": 0.0001, "reward": 0.39097223430871964, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 222 }, { "completion_length": 677.763916015625, "epoch": 13.117647058823529, "grad_norm": 0.003492873733280222, "kl": 0.07763671875, "learning_rate": 9.054582202764174e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 223 }, { "completion_length": 639.0277709960938, "epoch": 13.176470588235293, "grad_norm": 0.2760367073575632, "kl": 0.0732421875, "learning_rate": 9.045084971874737e-07, "loss": 0.0005, "reward": 0.9465278387069702, "reward_std": 0.17531593143939972, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7465278059244156, "rewards/thinker_reward_func": 0.10000000894069672, "step": 224 }, { "completion_length": 675.2083435058594, "epoch": 13.235294117647058, "grad_norm": 0.25631953879548763, "kl": 0.076171875, "learning_rate": 9.035545309958046e-07, "loss": 0.0005, "reward": 0.7888889312744141, "reward_std": 0.0859048985876143, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 225 }, { "completion_length": 636.2222290039062, "epoch": 13.294117647058824, "grad_norm": 0.30081428380756214, "kl": 0.083740234375, "learning_rate": 9.02596331708064e-07, "loss": 0.0003, "reward": 0.6687500923871994, "reward_std": 0.16271689906716347, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.46875, "rewards/thinker_reward_func": 0.10000000894069672, "step": 226 }, { "completion_length": 703.4722290039062, "epoch": 13.352941176470589, "grad_norm": 0.3550520986538617, "kl": 0.0673828125, "learning_rate": 9.016339093753092e-07, "loss": 0.0001, "reward": 0.581944465637207, "reward_std": 0.2956472486257553, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444328546524, "rewards/thinker_reward_func": 0.10000000894069672, "step": 227 }, { "completion_length": 726.0833435058594, "epoch": 13.411764705882353, "grad_norm": 0.2246496364820083, "kl": 0.07177734375, "learning_rate": 9.00667274092895e-07, "loss": 0.0003, "reward": 0.7354166805744171, "reward_std": 0.18180342763662338, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.5381944626569748, "rewards/thinker_reward_func": 0.10000000894069672, "step": 228 }, { "completion_length": 665.8194580078125, "epoch": 13.470588235294118, "grad_norm": 0.05391125705580338, "kl": 0.064453125, "learning_rate": 8.99696436000368e-07, "loss": 0.0002, "reward": 0.1999984160065651, "reward_std": 5.4983506743155885e-06, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09999842196702957, "step": 229 }, { "completion_length": 602.0555725097656, "epoch": 13.529411764705882, "grad_norm": 0.34860448073434913, "kl": 0.0751953125, "learning_rate": 8.987214052813603e-07, "loss": 0.0003, "reward": 0.6513889133930206, "reward_std": 0.15436294674873352, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.451388880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 230 }, { "completion_length": 649.7916870117188, "epoch": 13.588235294117647, "grad_norm": 0.22644670874971207, "kl": 0.06884765625, "learning_rate": 8.977421921634831e-07, "loss": 0.0003, "reward": 0.5818658769130707, "reward_std": 0.08105738461017609, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09992143884301186, "step": 231 }, { "completion_length": 617.2916870117188, "epoch": 13.647058823529411, "grad_norm": 0.21830185522906972, "kl": 0.087890625, "learning_rate": 8.967588069182183e-07, "loss": 0.0002, "reward": 0.33888890594244003, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1388888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 232 }, { "completion_length": 625.3888854980469, "epoch": 13.705882352941176, "grad_norm": 0.1837370030303864, "kl": 0.081298828125, "learning_rate": 8.957712598608122e-07, "loss": 0.0005, "reward": 0.6166134774684906, "reward_std": 0.00018420223204884678, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09994683787226677, "step": 233 }, { "completion_length": 613.1666564941406, "epoch": 13.764705882352942, "grad_norm": 0.4843084407543501, "kl": 0.07275390625, "learning_rate": 8.947795613501656e-07, "loss": 0.0001, "reward": 0.7208333611488342, "reward_std": 0.28769876435399055, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5208333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 234 }, { "completion_length": 651.2083435058594, "epoch": 13.823529411764707, "grad_norm": 0.0025775535647397994, "kl": 0.074462890625, "learning_rate": 8.937837217887272e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 235 }, { "completion_length": 563.7916564941406, "epoch": 13.882352941176471, "grad_norm": 0.4575289866509794, "kl": 0.080322265625, "learning_rate": 8.927837516223823e-07, "loss": 0.0003, "reward": 1.2065365612506866, "reward_std": 0.12127622961997986, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0069444477558136, "rewards/thinker_reward_func": 0.09959207102656364, "step": 236 }, { "completion_length": 657.4305725097656, "epoch": 13.941176470588236, "grad_norm": 0.2994367566253727, "kl": 0.07275390625, "learning_rate": 8.91779661340345e-07, "loss": 0.0003, "reward": 0.6686953008174896, "reward_std": 0.1627575010061264, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4687499850988388, "rewards/thinker_reward_func": 0.09994524717330933, "step": 237 }, { "completion_length": 621.5833435058594, "epoch": 14.0, "grad_norm": 0.31351802589229355, "kl": 0.099609375, "learning_rate": 8.907714614750472e-07, "loss": 0.0001, "reward": 0.5645833611488342, "reward_std": 0.14123430475592613, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 238 }, { "completion_length": 630.7777709960938, "epoch": 14.058823529411764, "grad_norm": 0.388423817061246, "kl": 0.0740966796875, "learning_rate": 8.897591626020284e-07, "loss": 0.0001, "reward": 0.7034722343087196, "reward_std": 0.24381054937839508, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5034722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 239 }, { "completion_length": 630.8472290039062, "epoch": 14.117647058823529, "grad_norm": 0.2155044458873289, "kl": 0.0751953125, "learning_rate": 8.887427753398247e-07, "loss": 0.0002, "reward": 0.1995222195982933, "reward_std": 0.0016550716245546937, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09952223300933838, "step": 240 }, { "completion_length": 692.5555725097656, "epoch": 14.176470588235293, "grad_norm": 0.218220553509329, "kl": 0.067138671875, "learning_rate": 8.877223103498575e-07, "loss": 0.0005, "reward": 0.7902778387069702, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 241 }, { "completion_length": 611.2638854980469, "epoch": 14.235294117647058, "grad_norm": 0.0024576826319069817, "kl": 0.076904296875, "learning_rate": 8.866977783363218e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 242 }, { "completion_length": 630.0277709960938, "epoch": 14.294117647058824, "grad_norm": 0.39723474337652315, "kl": 0.07373046875, "learning_rate": 8.856691900460738e-07, "loss": 0.0003, "reward": 0.7202548086643219, "reward_std": 0.16942168725654483, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.520833320915699, "rewards/thinker_reward_func": 0.0994214378297329, "step": 243 }, { "completion_length": 615.8194580078125, "epoch": 14.352941176470589, "grad_norm": 0.0022971464798097925, "kl": 0.07861328125, "learning_rate": 8.846365562685176e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 244 }, { "completion_length": 685.2638854980469, "epoch": 14.411764705882353, "grad_norm": 0.20939589243202475, "kl": 0.066162109375, "learning_rate": 8.83599887835493e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 245 }, { "completion_length": 624.9305419921875, "epoch": 14.470588235294118, "grad_norm": 0.35575946901489774, "kl": 0.08154296875, "learning_rate": 8.825591956211614e-07, "loss": 0.0005, "reward": 0.8423603475093842, "reward_std": 0.216074638068676, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.09999921545386314, "step": 246 }, { "completion_length": 652.25, "epoch": 14.529411764705882, "grad_norm": 0.12237761044533556, "kl": 0.065673828125, "learning_rate": 8.815144905418916e-07, "loss": 0.0007, "reward": 0.8236111998558044, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 247 }, { "completion_length": 620.5694580078125, "epoch": 14.588235294117647, "grad_norm": 0.28576460613320454, "kl": 0.08251953125, "learning_rate": 8.804657835561456e-07, "loss": 0.0003, "reward": 0.616666704416275, "reward_std": 0.12028132006525993, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 248 }, { "completion_length": 650.125, "epoch": 14.647058823529411, "grad_norm": 0.2183527200084412, "kl": 0.071533203125, "learning_rate": 8.794130856643633e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 249 }, { "completion_length": 678.5278015136719, "epoch": 14.705882352941176, "grad_norm": 0.2700773269387193, "kl": 0.06298828125, "learning_rate": 8.783564079088476e-07, "loss": 0.0005, "reward": 0.8409722745418549, "reward_std": 0.17863449454307556, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 250 }, { "completion_length": 694.1666564941406, "epoch": 14.764705882352942, "grad_norm": 0.26399542569468964, "kl": 0.070068359375, "learning_rate": 8.772957613736482e-07, "loss": 0.0005, "reward": 0.8423612117767334, "reward_std": 0.14123429730534554, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 251 }, { "completion_length": 730.125, "epoch": 14.823529411764707, "grad_norm": 0.1384697785936133, "kl": 0.06103515625, "learning_rate": 8.76231157184445e-07, "loss": 0.0002, "reward": 0.19722222536802292, "reward_std": 0.006487491074949503, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 252 }, { "completion_length": 682.3055419921875, "epoch": 14.882352941176471, "grad_norm": 0.270699809812934, "kl": 0.064453125, "learning_rate": 8.751626065084328e-07, "loss": 0.0003, "reward": 0.703472226858139, "reward_std": 0.16893918812274933, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5034722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 253 }, { "completion_length": 607.3611450195312, "epoch": 14.941176470588236, "grad_norm": 0.22421826442402407, "kl": 0.082763671875, "learning_rate": 8.74090120554202e-07, "loss": 0.0005, "reward": 0.7729166746139526, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 254 }, { "completion_length": 667.4305419921875, "epoch": 15.0, "grad_norm": 0.3462945502329585, "kl": 0.075927734375, "learning_rate": 8.73013710571623e-07, "loss": 0.0001, "reward": 0.5791667103767395, "reward_std": 0.12990381196141243, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09861112385988235, "step": 255 }, { "completion_length": 665.2361145019531, "epoch": 15.058823529411764, "grad_norm": 0.19371903676765523, "kl": 0.0673828125, "learning_rate": 8.719333878517273e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 256 }, { "completion_length": 644.0555725097656, "epoch": 15.117647058823529, "grad_norm": 0.19729116702536212, "kl": 0.072509765625, "learning_rate": 8.708491637265887e-07, "loss": 0.0003, "reward": 0.5819444730877876, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 257 }, { "completion_length": 642.8194274902344, "epoch": 15.176470588235293, "grad_norm": 0.3994865707436707, "kl": 0.0631103515625, "learning_rate": 8.697610495692054e-07, "loss": 0.0001, "reward": 0.7381587997078896, "reward_std": 0.22231128811836243, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.538194477558136, "rewards/thinker_reward_func": 0.09996429830789566, "step": 258 }, { "completion_length": 658.2222290039062, "epoch": 15.235294117647058, "grad_norm": 0.0033928903878115164, "kl": 0.0721435546875, "learning_rate": 8.686690567933801e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 259 }, { "completion_length": 612.6528015136719, "epoch": 15.294117647058824, "grad_norm": 0.001646894793858676, "kl": 0.073486328125, "learning_rate": 8.675731968536002e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 260 }, { "completion_length": 668.75, "epoch": 15.352941176470589, "grad_norm": 0.37122107221262607, "kl": 0.066650390625, "learning_rate": 8.664734812449179e-07, "loss": 0.0001, "reward": 0.7555555701255798, "reward_std": 0.20137495175004005, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.10000000894069672, "step": 261 }, { "completion_length": 608.3472290039062, "epoch": 15.411764705882353, "grad_norm": 0.0017308108195100017, "kl": 0.072021484375, "learning_rate": 8.653699215028296e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 262 }, { "completion_length": 642.6666564941406, "epoch": 15.470588235294118, "grad_norm": 0.0016793381301080032, "kl": 0.0640869140625, "learning_rate": 8.642625292031549e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 263 }, { "completion_length": 648.6250305175781, "epoch": 15.529411764705882, "grad_norm": 0.2361791235910215, "kl": 0.073974609375, "learning_rate": 8.631513159619149e-07, "loss": 0.0003, "reward": 0.8062119483947754, "reward_std": 0.06508387615031097, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.09996191784739494, "step": 264 }, { "completion_length": 735.9027709960938, "epoch": 15.588235294117647, "grad_norm": 0.22168663920960976, "kl": 0.059814453125, "learning_rate": 8.620362934352108e-07, "loss": 0.0001, "reward": 0.21319444477558136, "reward_std": 0.0676784347742796, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 265 }, { "completion_length": 652.0833435058594, "epoch": 15.647058823529411, "grad_norm": 0.23925794132420763, "kl": 0.076904296875, "learning_rate": 8.60917473319101e-07, "loss": 0.0001, "reward": 0.581944465637207, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 266 }, { "completion_length": 629.8888854980469, "epoch": 15.705882352941176, "grad_norm": 0.20068759856167329, "kl": 0.07763671875, "learning_rate": 8.597948673494794e-07, "loss": 0.0003, "reward": 0.5645833611488342, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3645833283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 267 }, { "completion_length": 682.7500305175781, "epoch": 15.764705882352942, "grad_norm": 0.30345460950428366, "kl": 0.06201171875, "learning_rate": 8.586684873019512e-07, "loss": 0.0001, "reward": 0.5472222566604614, "reward_std": 0.16218729317188263, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 268 }, { "completion_length": 652.0972290039062, "epoch": 15.823529411764707, "grad_norm": 0.0018282218682686919, "kl": 0.0667724609375, "learning_rate": 8.575383449917102e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 269 }, { "completion_length": 621.625, "epoch": 15.882352941176471, "grad_norm": 0.17741956186636917, "kl": 0.0648193359375, "learning_rate": 8.564044522734146e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 270 }, { "completion_length": 630.6250305175781, "epoch": 15.941176470588236, "grad_norm": 0.13161347332059978, "kl": 0.0645751953125, "learning_rate": 8.552668210410623e-07, "loss": 0.0005, "reward": 0.8062500655651093, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 271 }, { "completion_length": 634.3055725097656, "epoch": 16.0, "grad_norm": 0.2432027765098213, "kl": 0.0751953125, "learning_rate": 8.541254632278665e-07, "loss": 0.0003, "reward": 0.790266752243042, "reward_std": 0.08113213986143819, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.0999888963997364, "step": 272 }, { "completion_length": 655.0555725097656, "epoch": 16.058823529411764, "grad_norm": 0.32565320681924126, "kl": 0.059814453125, "learning_rate": 8.529803908061308e-07, "loss": 0.0001, "reward": 0.4068896844983101, "reward_std": 0.12528227269649506, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333358168602, "rewards/thinker_reward_func": 0.09994524717330933, "step": 273 }, { "completion_length": 612.6527709960938, "epoch": 16.11764705882353, "grad_norm": 0.3587808183711742, "kl": 0.078369140625, "learning_rate": 8.51831615787123e-07, "loss": 0.0001, "reward": 0.7901453077793121, "reward_std": 0.12065717577934265, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.09986747056245804, "step": 274 }, { "completion_length": 657.5416870117188, "epoch": 16.176470588235293, "grad_norm": 0.41550338842385137, "kl": 0.06689453125, "learning_rate": 8.506791502209496e-07, "loss": 0.0001, "reward": 0.581856369972229, "reward_std": 0.12058647722005844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.09991191327571869, "step": 275 }, { "completion_length": 564.4583435058594, "epoch": 16.235294117647058, "grad_norm": 0.4142811657570539, "kl": 0.08349609375, "learning_rate": 8.495230061964287e-07, "loss": 0.0005, "reward": 1.1894080340862274, "reward_std": 0.1416172757744789, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.09982461482286453, "step": 276 }, { "completion_length": 594.125, "epoch": 16.294117647058822, "grad_norm": 0.22089914907967156, "kl": 0.08447265625, "learning_rate": 8.483631958409643e-07, "loss": 0.0005, "reward": 0.616558775305748, "reward_std": 0.0003739031089935452, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09989207237958908, "step": 277 }, { "completion_length": 609.8194274902344, "epoch": 16.352941176470587, "grad_norm": 0.2078156941429791, "kl": 0.0699462890625, "learning_rate": 8.471997313204182e-07, "loss": 0.0005, "reward": 0.6165436506271362, "reward_std": 0.0004261400899849832, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09987699240446091, "step": 278 }, { "completion_length": 645.3472290039062, "epoch": 16.41176470588235, "grad_norm": 0.2846863683715511, "kl": 0.080322265625, "learning_rate": 8.460326248389824e-07, "loss": 0.0003, "reward": 0.6861111372709274, "reward_std": 0.2051524817943573, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4861111342906952, "rewards/thinker_reward_func": 0.10000000894069672, "step": 279 }, { "completion_length": 636.3055419921875, "epoch": 16.470588235294116, "grad_norm": 0.0015164236256477498, "kl": 0.0771484375, "learning_rate": 8.448618886390521e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 280 }, { "completion_length": 715.4444580078125, "epoch": 16.529411764705884, "grad_norm": 0.15069301215205652, "kl": 0.063720703125, "learning_rate": 8.436875350010957e-07, "loss": 0.0003, "reward": 0.4055555835366249, "reward_std": 0.006487491074949503, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 281 }, { "completion_length": 617.9166870117188, "epoch": 16.58823529411765, "grad_norm": 0.37207199001906927, "kl": 0.080322265625, "learning_rate": 8.425095762435273e-07, "loss": 0.0003, "reward": 0.9118055701255798, "reward_std": 0.24858523905277252, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7118055820465088, "rewards/thinker_reward_func": 0.10000000894069672, "step": 282 }, { "completion_length": 660.6527709960938, "epoch": 16.647058823529413, "grad_norm": 0.2216846069954563, "kl": 0.0654296875, "learning_rate": 8.413280247225768e-07, "loss": 0.0003, "reward": 0.4069381058216095, "reward_std": 0.004833246115595102, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09999365732073784, "step": 283 }, { "completion_length": 699.6111145019531, "epoch": 16.705882352941178, "grad_norm": 0.18129547011457084, "kl": 0.0732421875, "learning_rate": 8.401428928321607e-07, "loss": 0.0005, "reward": 0.7381944432854652, "reward_std": 0.1072768121957779, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.538194477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 284 }, { "completion_length": 620.9722290039062, "epoch": 16.764705882352942, "grad_norm": 0.0032738017346263, "kl": 0.088623046875, "learning_rate": 8.389541930037516e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 285 }, { "completion_length": 607.0833129882812, "epoch": 16.823529411764707, "grad_norm": 0.0016662357104584636, "kl": 0.077880859375, "learning_rate": 8.377619377062482e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 286 }, { "completion_length": 677.6528015136719, "epoch": 16.88235294117647, "grad_norm": 0.0016118234727070598, "kl": 0.071044921875, "learning_rate": 8.365661394458445e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 287 }, { "completion_length": 673.9861450195312, "epoch": 16.941176470588236, "grad_norm": 0.0015462089146118312, "kl": 0.06982421875, "learning_rate": 8.353668107658983e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 288 }, { "completion_length": 684.2916870117188, "epoch": 17.0, "grad_norm": 0.2651466721209784, "kl": 0.075927734375, "learning_rate": 8.341639642468001e-07, "loss": 0.0003, "reward": 0.581944465637207, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 289 }, { "completion_length": 668.9027709960938, "epoch": 17.058823529411764, "grad_norm": 0.22132617104143337, "kl": 0.08154296875, "learning_rate": 8.329576125058405e-07, "loss": 0.0005, "reward": 0.7902778387069702, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 290 }, { "completion_length": 710.0694580078125, "epoch": 17.11764705882353, "grad_norm": 0.14069488630864038, "kl": 0.06982421875, "learning_rate": 8.317477681970786e-07, "loss": 0.0005, "reward": 0.8062500655651093, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 291 }, { "completion_length": 663.6388854980469, "epoch": 17.176470588235293, "grad_norm": 0.21870066106538727, "kl": 0.083251953125, "learning_rate": 8.305344440112087e-07, "loss": 0.0005, "reward": 0.7729166746139526, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 292 }, { "completion_length": 687.8333435058594, "epoch": 17.235294117647058, "grad_norm": 0.0014694013431178477, "kl": 0.072021484375, "learning_rate": 8.293176526754273e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 293 }, { "completion_length": 673.8611145019531, "epoch": 17.294117647058822, "grad_norm": 0.37341929898579157, "kl": 0.076416015625, "learning_rate": 8.280974069532998e-07, "loss": 0.0001, "reward": 0.738194465637207, "reward_std": 0.21450360491871834, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5381944626569748, "rewards/thinker_reward_func": 0.10000000894069672, "step": 294 }, { "completion_length": 696.8472290039062, "epoch": 17.352941176470587, "grad_norm": 0.21908255345060468, "kl": 0.072265625, "learning_rate": 8.268737196446263e-07, "loss": 0.0003, "reward": 0.6861111372709274, "reward_std": 0.1674174703657627, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4861111119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 295 }, { "completion_length": 677.1388854980469, "epoch": 17.41176470588235, "grad_norm": 0.17816079974835877, "kl": 0.0791015625, "learning_rate": 8.256466035853075e-07, "loss": 0.0003, "reward": 0.44305557012557983, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 296 }, { "completion_length": 689.3611145019531, "epoch": 17.470588235294116, "grad_norm": 0.0016930518910548864, "kl": 0.0732421875, "learning_rate": 8.244160716472108e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 297 }, { "completion_length": 690.0833435058594, "epoch": 17.529411764705884, "grad_norm": 0.20719672053032218, "kl": 0.08203125, "learning_rate": 8.231821367380334e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 298 }, { "completion_length": 654.3194580078125, "epoch": 17.58823529411765, "grad_norm": 0.0013688817218905282, "kl": 0.076171875, "learning_rate": 8.219448118011687e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 299 }, { "completion_length": 658.1666870117188, "epoch": 17.647058823529413, "grad_norm": 0.12522828184373716, "kl": 0.07958984375, "learning_rate": 8.207041098155699e-07, "loss": 0.0007, "reward": 1.01458340883255, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 300 }, { "completion_length": 587.2777709960938, "epoch": 17.705882352941178, "grad_norm": 0.00207468911167079, "kl": 0.0869140625, "learning_rate": 8.194600437956139e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 301 }, { "completion_length": 654.3888854980469, "epoch": 17.764705882352942, "grad_norm": 0.2693827276741301, "kl": 0.085205078125, "learning_rate": 8.18212626790964e-07, "loss": 0.0003, "reward": 0.581944465637207, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 302 }, { "completion_length": 662.1388854980469, "epoch": 17.823529411764707, "grad_norm": 0.18909863598425902, "kl": 0.079833984375, "learning_rate": 8.16961871886435e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 303 }, { "completion_length": 603.5555725097656, "epoch": 17.88235294117647, "grad_norm": 0.1462069763754951, "kl": 0.098876953125, "learning_rate": 8.157077922018536e-07, "loss": 0.0005, "reward": 0.6166515946388245, "reward_std": 5.2236351621104404e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09998492896556854, "step": 304 }, { "completion_length": 648.5694580078125, "epoch": 17.941176470588236, "grad_norm": 0.28188796719887094, "kl": 0.0859375, "learning_rate": 8.144504008919222e-07, "loss": 0.0007, "reward": 1.0493056178092957, "reward_std": 0.14604555070400238, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.8506944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 305 }, { "completion_length": 707.2083435058594, "epoch": 18.0, "grad_norm": 0.0016187479534736716, "kl": 0.07568359375, "learning_rate": 8.131897111460809e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 306 }, { "completion_length": 681.7361145019531, "epoch": 18.058823529411764, "grad_norm": 0.1679916523632127, "kl": 0.0810546875, "learning_rate": 8.119257361883686e-07, "loss": 0.0002, "reward": 0.40831828862428665, "reward_std": 5.223857078817673e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09998492896556854, "step": 307 }, { "completion_length": 680.4444274902344, "epoch": 18.11764705882353, "grad_norm": 0.001680347759371884, "kl": 0.074462890625, "learning_rate": 8.106584892772843e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 308 }, { "completion_length": 657.0416870117188, "epoch": 18.176470588235293, "grad_norm": 0.33434910998021883, "kl": 0.087890625, "learning_rate": 8.093879837056485e-07, "loss": 0.0005, "reward": 0.9784723222255707, "reward_std": 0.15085681155323982, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7812500149011612, "rewards/thinker_reward_func": 0.09861112385988235, "step": 309 }, { "completion_length": 683.8194580078125, "epoch": 18.235294117647058, "grad_norm": 0.0020303493075305577, "kl": 0.083740234375, "learning_rate": 8.081142328004636e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 310 }, { "completion_length": 682.4305725097656, "epoch": 18.294117647058822, "grad_norm": 0.15917190754616006, "kl": 0.074462890625, "learning_rate": 8.068372499227736e-07, "loss": 0.0005, "reward": 0.7208333611488342, "reward_std": 0.10879853367805481, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5208333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 311 }, { "completion_length": 707.6944580078125, "epoch": 18.352941176470587, "grad_norm": 0.006823935889751849, "kl": 0.091552734375, "learning_rate": 8.05557048467525e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 312 }, { "completion_length": 677.4722290039062, "epoch": 18.41176470588235, "grad_norm": 0.19897727981385577, "kl": 0.08642578125, "learning_rate": 8.04273641863425e-07, "loss": 0.0003, "reward": 0.5819444730877876, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 313 }, { "completion_length": 660.4861145019531, "epoch": 18.470588235294116, "grad_norm": 0.004630268345867846, "kl": 0.09765625, "learning_rate": 8.029870435728017e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 314 }, { "completion_length": 668.9166870117188, "epoch": 18.529411764705884, "grad_norm": 0.2829973805195177, "kl": 0.096435546875, "learning_rate": 8.016972670914623e-07, "loss": 0.0005, "reward": 0.8597223162651062, "reward_std": 0.15436295047402382, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6597222089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 315 }, { "completion_length": 647.1805725097656, "epoch": 18.58823529411765, "grad_norm": 0.31753753991741246, "kl": 0.08935546875, "learning_rate": 8.004043259485518e-07, "loss": 0.0005, "reward": 0.9986112117767334, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.798611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 316 }, { "completion_length": 714.5694580078125, "epoch": 18.647058823529413, "grad_norm": 0.15237431533666765, "kl": 0.088134765625, "learning_rate": 7.991082337064109e-07, "loss": 0.0003, "reward": 0.5298611223697662, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3298611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 317 }, { "completion_length": 684.3472290039062, "epoch": 18.705882352941178, "grad_norm": 0.27771374078594935, "kl": 0.095947265625, "learning_rate": 7.978090039604341e-07, "loss": 0.0005, "reward": 0.8770833611488342, "reward_std": 0.16271689534187317, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6770833283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 318 }, { "completion_length": 691.6944580078125, "epoch": 18.764705882352942, "grad_norm": 0.2298064210691231, "kl": 0.08154296875, "learning_rate": 7.965066503389264e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 319 }, { "completion_length": 706.0555725097656, "epoch": 18.823529411764707, "grad_norm": 0.418322970516858, "kl": 0.087646484375, "learning_rate": 7.952011865029613e-07, "loss": 0.0003, "reward": 0.9812501072883606, "reward_std": 0.18042195215821266, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.10000000894069672, "step": 320 }, { "completion_length": 761.8055419921875, "epoch": 18.88235294117647, "grad_norm": 0.22432605956254048, "kl": 0.079345703125, "learning_rate": 7.938926261462365e-07, "loss": 0.0005, "reward": 0.8048611581325531, "reward_std": 0.06662814924493432, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 321 }, { "completion_length": 756.75, "epoch": 18.941176470588236, "grad_norm": 0.1882040278507041, "kl": 0.083740234375, "learning_rate": 7.925809829949311e-07, "loss": 0.0002, "reward": 0.38958335667848587, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 322 }, { "completion_length": 691.9305419921875, "epoch": 19.0, "grad_norm": 0.3162237576427075, "kl": 0.0908203125, "learning_rate": 7.91266270807561e-07, "loss": 0.0003, "reward": 0.9104167222976685, "reward_std": 0.2323693484067917, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7118055373430252, "rewards/thinker_reward_func": 0.10000000894069672, "step": 323 }, { "completion_length": 715.0972290039062, "epoch": 19.058823529411764, "grad_norm": 0.0015981474727847027, "kl": 0.08837890625, "learning_rate": 7.89948503374835e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 324 }, { "completion_length": 728.1111145019531, "epoch": 19.11764705882353, "grad_norm": 0.14466261921164753, "kl": 0.0849609375, "learning_rate": 7.886276945195097e-07, "loss": 0.0005, "reward": 0.6138889342546463, "reward_std": 0.006487491074949503, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 325 }, { "completion_length": 741.9722290039062, "epoch": 19.176470588235293, "grad_norm": 0.2845827770371415, "kl": 0.094970703125, "learning_rate": 7.873038580962453e-07, "loss": 0.0001, "reward": 0.35208336263895035, "reward_std": 0.10865604877471924, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.15625, "rewards/thinker_reward_func": 0.09861112385988235, "step": 326 }, { "completion_length": 681.1527709960938, "epoch": 19.235294117647058, "grad_norm": 0.0021894288821242658, "kl": 0.101318359375, "learning_rate": 7.859770079914592e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 327 }, { "completion_length": 677.25, "epoch": 19.294117647058822, "grad_norm": 0.18983983098301174, "kl": 0.096435546875, "learning_rate": 7.846471581231813e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 328 }, { "completion_length": 720.5555725097656, "epoch": 19.352941176470587, "grad_norm": 0.15882612251169118, "kl": 0.099365234375, "learning_rate": 7.833143224409075e-07, "loss": 0.0007, "reward": 0.9986111521720886, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.798611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 329 }, { "completion_length": 695.3055725097656, "epoch": 19.41176470588235, "grad_norm": 0.1935753726240872, "kl": 0.09814453125, "learning_rate": 7.819785149254532e-07, "loss": 0.0003, "reward": 0.581944465637207, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 330 }, { "completion_length": 698.4861145019531, "epoch": 19.470588235294116, "grad_norm": 0.0027845857678600087, "kl": 0.094970703125, "learning_rate": 7.806397495888073e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 331 }, { "completion_length": 699.4861145019531, "epoch": 19.529411764705884, "grad_norm": 0.2653009152749469, "kl": 0.09912109375, "learning_rate": 7.792980404739847e-07, "loss": 0.0005, "reward": 0.7715278267860413, "reward_std": 0.09903355687856674, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5729166567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 332 }, { "completion_length": 750.3333435058594, "epoch": 19.58823529411765, "grad_norm": 0.22082399704112146, "kl": 0.0869140625, "learning_rate": 7.77953401654879e-07, "loss": 0.0007, "reward": 1.1722223162651062, "reward_std": 0.15436296164989471, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.972222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 333 }, { "completion_length": 674.3472290039062, "epoch": 19.647058823529413, "grad_norm": 0.0021975317950046367, "kl": 0.095458984375, "learning_rate": 7.766058472361153e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 334 }, { "completion_length": 662.013916015625, "epoch": 19.705882352941178, "grad_norm": 0.001954956194511979, "kl": 0.096435546875, "learning_rate": 7.752553913529018e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 335 }, { "completion_length": 720.8333435058594, "epoch": 19.764705882352942, "grad_norm": 0.001943684781486068, "kl": 0.0888671875, "learning_rate": 7.739020481708814e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 336 }, { "completion_length": 708.6944580078125, "epoch": 19.823529411764707, "grad_norm": 0.2391168848990057, "kl": 0.095703125, "learning_rate": 7.725458318859841e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 337 }, { "completion_length": 713.1388854980469, "epoch": 19.88235294117647, "grad_norm": 0.004598411642750982, "kl": 0.10107421875, "learning_rate": 7.711867567242766e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 338 }, { "completion_length": 687.5833435058594, "epoch": 19.941176470588236, "grad_norm": 0.002136921842612121, "kl": 0.09619140625, "learning_rate": 7.698248369418146e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 339 }, { "completion_length": 739.0277709960938, "epoch": 20.0, "grad_norm": 0.25684536335041125, "kl": 0.0869140625, "learning_rate": 7.684600868244919e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 340 }, { "completion_length": 670.3055725097656, "epoch": 20.058823529411764, "grad_norm": 0.2303461393461521, "kl": 0.10107421875, "learning_rate": 7.670925206878916e-07, "loss": 0.0003, "reward": 0.49513889104127884, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2951388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 341 }, { "completion_length": 706.4722290039062, "epoch": 20.11764705882353, "grad_norm": 0.051131403161292596, "kl": 0.1650390625, "learning_rate": 7.657221528771351e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 342 }, { "completion_length": 707.0138854980469, "epoch": 20.176470588235293, "grad_norm": 0.27331846628570433, "kl": 0.0947265625, "learning_rate": 7.643489977667325e-07, "loss": 0.0003, "reward": 0.7194444537162781, "reward_std": 0.1910441741347313, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5208333432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 343 }, { "completion_length": 690.8333435058594, "epoch": 20.235294117647058, "grad_norm": 0.2173101741419663, "kl": 0.093505859375, "learning_rate": 7.629730697604313e-07, "loss": 0.0005, "reward": 0.7729166746139526, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 344 }, { "completion_length": 712.75, "epoch": 20.294117647058822, "grad_norm": 0.26717911447737075, "kl": 0.099365234375, "learning_rate": 7.61594383291065e-07, "loss": 0.0003, "reward": 0.7729166746139526, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.10000000894069672, "step": 345 }, { "completion_length": 690.1666564941406, "epoch": 20.352941176470587, "grad_norm": 0.26930155574104364, "kl": 0.097900390625, "learning_rate": 7.602129528204022e-07, "loss": 0.0003, "reward": 0.7902778759598732, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777910232544, "rewards/thinker_reward_func": 0.10000000894069672, "step": 346 }, { "completion_length": 665.7777709960938, "epoch": 20.41176470588235, "grad_norm": 0.0018267285126985595, "kl": 0.09423828125, "learning_rate": 7.588287928389951e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 347 }, { "completion_length": 748.125, "epoch": 20.470588235294116, "grad_norm": 0.137784799337207, "kl": 0.089599609375, "learning_rate": 7.574419178660268e-07, "loss": 0.0005, "reward": 0.6138888895511627, "reward_std": 0.006487491074949503, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 348 }, { "completion_length": 684.375, "epoch": 20.529411764705884, "grad_norm": 0.16439399425624684, "kl": 0.091552734375, "learning_rate": 7.560523424491594e-07, "loss": 0.0002, "reward": 0.19861111044883728, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 349 }, { "completion_length": 704.1111450195312, "epoch": 20.58823529411765, "grad_norm": 0.20657295117793315, "kl": 0.095458984375, "learning_rate": 7.546600811643816e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 350 }, { "completion_length": 706.6388854980469, "epoch": 20.647058823529413, "grad_norm": 0.21614730468947074, "kl": 0.085205078125, "learning_rate": 7.532651486158554e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 351 }, { "completion_length": 662.4166564941406, "epoch": 20.705882352941178, "grad_norm": 0.002157889441138611, "kl": 0.099365234375, "learning_rate": 7.518675594357632e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 352 }, { "completion_length": 718.1944580078125, "epoch": 20.764705882352942, "grad_norm": 0.002710170601115251, "kl": 0.099609375, "learning_rate": 7.504673282841543e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 353 }, { "completion_length": 718.8055725097656, "epoch": 20.823529411764707, "grad_norm": 0.13060853596701286, "kl": 0.086181640625, "learning_rate": 7.490644698487908e-07, "loss": 0.0003, "reward": 0.5604167059063911, "reward_std": 0.10176008194684982, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 354 }, { "completion_length": 694.7222290039062, "epoch": 20.88235294117647, "grad_norm": 0.28207745050786154, "kl": 0.090576171875, "learning_rate": 7.476589988449938e-07, "loss": 0.0005, "reward": 0.9972222745418549, "reward_std": 0.12509256973862648, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 355 }, { "completion_length": 732.1805725097656, "epoch": 20.941176470588236, "grad_norm": 0.31143153145110825, "kl": 0.0888671875, "learning_rate": 7.462509300154891e-07, "loss": 0.0003, "reward": 0.5604166984558105, "reward_std": 0.10176008194684982, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.3645833283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 356 }, { "completion_length": 682.5555725097656, "epoch": 21.0, "grad_norm": 0.0019797481812710944, "kl": 0.0908203125, "learning_rate": 7.448402781302525e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 357 }, { "completion_length": 714.3055725097656, "epoch": 21.058823529411764, "grad_norm": 0.006633847154592048, "kl": 0.09423828125, "learning_rate": 7.434270579863548e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 358 }, { "completion_length": 704.3750305175781, "epoch": 21.11764705882353, "grad_norm": 0.0018264239402050022, "kl": 0.0869140625, "learning_rate": 7.420112844078065e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 359 }, { "completion_length": 677.5138854980469, "epoch": 21.176470588235293, "grad_norm": 0.20566757196523447, "kl": 0.097412109375, "learning_rate": 7.405929722454025e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 360 }, { "completion_length": 687.7083435058594, "epoch": 21.235294117647058, "grad_norm": 0.19673149169454932, "kl": 0.089111328125, "learning_rate": 7.391721363765663e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 361 }, { "completion_length": 611.5833435058594, "epoch": 21.294117647058822, "grad_norm": 0.001978600437447955, "kl": 0.097412109375, "learning_rate": 7.377487917051938e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 362 }, { "completion_length": 696.7778015136719, "epoch": 21.352941176470587, "grad_norm": 0.16705640483562892, "kl": 0.09521484375, "learning_rate": 7.363229531614972e-07, "loss": 0.0002, "reward": 0.37361112982034683, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 363 }, { "completion_length": 644.375, "epoch": 21.41176470588235, "grad_norm": 0.22364466029493996, "kl": 0.102783203125, "learning_rate": 7.348946357018479e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 364 }, { "completion_length": 672.5972290039062, "epoch": 21.470588235294116, "grad_norm": 0.21241650630348324, "kl": 0.10107421875, "learning_rate": 7.334638543086203e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 365 }, { "completion_length": 703.2777709960938, "epoch": 21.529411764705884, "grad_norm": 0.001977472750606569, "kl": 0.0908203125, "learning_rate": 7.320306239900342e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 366 }, { "completion_length": 701.2361145019531, "epoch": 21.58823529411765, "grad_norm": 0.1449945272325521, "kl": 0.10009765625, "learning_rate": 7.305949597799976e-07, "loss": 0.0005, "reward": 0.8062500357627869, "reward_std": 0.06495191156864166, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 367 }, { "completion_length": 765.75, "epoch": 21.647058823529413, "grad_norm": 0.32888139941750333, "kl": 0.08203125, "learning_rate": 7.291568767379483e-07, "loss": 0.0003, "reward": 0.7527777850627899, "reward_std": 0.1639854535460472, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.5555555671453476, "rewards/thinker_reward_func": 0.10000000894069672, "step": 368 }, { "completion_length": 710.4722290039062, "epoch": 21.705882352941178, "grad_norm": 0.0032048223945848504, "kl": 0.09619140625, "learning_rate": 7.277163899486974e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 369 }, { "completion_length": 692.0972290039062, "epoch": 21.764705882352942, "grad_norm": 0.002226770920561593, "kl": 0.09912109375, "learning_rate": 7.262735145222695e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 370 }, { "completion_length": 659.8472290039062, "epoch": 21.823529411764707, "grad_norm": 0.002766026784730193, "kl": 0.10595703125, "learning_rate": 7.24828265593745e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 371 }, { "completion_length": 723.1528015136719, "epoch": 21.88235294117647, "grad_norm": 0.270510534699326, "kl": 0.107666015625, "learning_rate": 7.233806583231011e-07, "loss": 0.0002, "reward": 0.3375000134110451, "reward_std": 0.10738749289885163, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.1388888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 372 }, { "completion_length": 702.138916015625, "epoch": 21.941176470588236, "grad_norm": 0.00171940092024905, "kl": 0.09033203125, "learning_rate": 7.219307078950535e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 373 }, { "completion_length": 690.9861145019531, "epoch": 22.0, "grad_norm": 0.0016986385486730817, "kl": 0.092529296875, "learning_rate": 7.204784295188958e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 374 }, { "completion_length": 666.1111450195312, "epoch": 22.058823529411764, "grad_norm": 0.2831513242545434, "kl": 0.101318359375, "learning_rate": 7.190238384283412e-07, "loss": 0.0008, "reward": 1.1881945133209229, "reward_std": 0.09903355687856674, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 375 }, { "completion_length": 716.1388854980469, "epoch": 22.11764705882353, "grad_norm": 0.1411649024298151, "kl": 0.114501953125, "learning_rate": 7.175669498813616e-07, "loss": 0.0002, "reward": 0.3708333447575569, "reward_std": 0.08758113533258438, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 376 }, { "completion_length": 655.125, "epoch": 22.176470588235293, "grad_norm": 0.24727883717920332, "kl": 0.110595703125, "learning_rate": 7.161077791600287e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 377 }, { "completion_length": 694.7361145019531, "epoch": 22.235294117647058, "grad_norm": 0.26889520142067047, "kl": 0.102294921875, "learning_rate": 7.14646341570353e-07, "loss": 0.0005, "reward": 0.7354167252779007, "reward_std": 0.11689930409193039, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5381944328546524, "rewards/thinker_reward_func": 0.09861112385988235, "step": 378 }, { "completion_length": 675.763916015625, "epoch": 22.294117647058822, "grad_norm": 0.18958561887641465, "kl": 0.11279296875, "learning_rate": 7.131826524421229e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 379 }, { "completion_length": 655.5833129882812, "epoch": 22.352941176470587, "grad_norm": 0.002316041467796887, "kl": 0.096923828125, "learning_rate": 7.117167271287452e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 380 }, { "completion_length": 645.7083435058594, "epoch": 22.41176470588235, "grad_norm": 0.0019651287993491195, "kl": 0.101806640625, "learning_rate": 7.102485810070823e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 381 }, { "completion_length": 628.9166564941406, "epoch": 22.470588235294116, "grad_norm": 0.2711108346221353, "kl": 0.10400390625, "learning_rate": 7.087782294772926e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 382 }, { "completion_length": 681.5278015136719, "epoch": 22.529411764705884, "grad_norm": 0.0024849427631519175, "kl": 0.100830078125, "learning_rate": 7.07305687962668e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 383 }, { "completion_length": 666.0833435058594, "epoch": 22.58823529411765, "grad_norm": 0.21150450598010262, "kl": 0.099853515625, "learning_rate": 7.05830971909472e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 384 }, { "completion_length": 684.0, "epoch": 22.647058823529413, "grad_norm": 0.21436072420864324, "kl": 0.10791015625, "learning_rate": 7.043540967867781e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 385 }, { "completion_length": 684.5972290039062, "epoch": 22.705882352941178, "grad_norm": 0.00445564477185778, "kl": 0.103759765625, "learning_rate": 7.028750780863078e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 386 }, { "completion_length": 713.625, "epoch": 22.764705882352942, "grad_norm": 0.0016367107705254397, "kl": 0.08984375, "learning_rate": 7.013939313222669e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 387 }, { "completion_length": 734.8472290039062, "epoch": 22.823529411764707, "grad_norm": 0.21041940470248624, "kl": 0.107666015625, "learning_rate": 6.999106720311845e-07, "loss": 0.0005, "reward": 0.7902778089046478, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 388 }, { "completion_length": 635.6805725097656, "epoch": 22.88235294117647, "grad_norm": 0.24835870348851913, "kl": 0.11474609375, "learning_rate": 6.984253157717485e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 389 }, { "completion_length": 735.9861145019531, "epoch": 22.941176470588236, "grad_norm": 0.15966622416696788, "kl": 0.0927734375, "learning_rate": 6.969378781246436e-07, "loss": 0.0003, "reward": 0.40416670590639114, "reward_std": 0.007537784054875374, "rewards/format_reward_func": 0.09583334252238274, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 390 }, { "completion_length": 649.4166564941406, "epoch": 23.0, "grad_norm": 0.0022360804942963487, "kl": 0.1044921875, "learning_rate": 6.954483746923864e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 391 }, { "completion_length": 661.1805725097656, "epoch": 23.058823529411764, "grad_norm": 0.18649143818652728, "kl": 0.099853515625, "learning_rate": 6.939568210991632e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 392 }, { "completion_length": 648.0416870117188, "epoch": 23.11764705882353, "grad_norm": 0.29073465256243136, "kl": 0.098876953125, "learning_rate": 6.924632329906656e-07, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7812500149011612, "rewards/thinker_reward_func": 0.10000000894069672, "step": 393 }, { "completion_length": 682.7777709960938, "epoch": 23.176470588235293, "grad_norm": 0.001983442758327036, "kl": 0.09033203125, "learning_rate": 6.909676260339259e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 394 }, { "completion_length": 655.5694580078125, "epoch": 23.235294117647058, "grad_norm": 0.19500706808845225, "kl": 0.099365234375, "learning_rate": 6.894700159171534e-07, "loss": 0.0005, "reward": 0.7902778089046478, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 395 }, { "completion_length": 676.25, "epoch": 23.294117647058822, "grad_norm": 0.004052348865193785, "kl": 0.108154296875, "learning_rate": 6.879704183495695e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 396 }, { "completion_length": 643.0694580078125, "epoch": 23.352941176470587, "grad_norm": 0.0016799969137122526, "kl": 0.09716796875, "learning_rate": 6.864688490612433e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 397 }, { "completion_length": 664.9166564941406, "epoch": 23.41176470588235, "grad_norm": 0.3646673028681901, "kl": 0.09814453125, "learning_rate": 6.84965323802926e-07, "loss": 0.0003, "reward": 0.772916704416275, "reward_std": 0.21450360864400864, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166865348816, "rewards/thinker_reward_func": 0.10000000894069672, "step": 398 }, { "completion_length": 659.2361145019531, "epoch": 23.470588235294116, "grad_norm": 0.001948602862344372, "kl": 0.095703125, "learning_rate": 6.834598583458861e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 399 }, { "completion_length": 693.9305725097656, "epoch": 23.529411764705884, "grad_norm": 0.0018138678171251885, "kl": 0.085693359375, "learning_rate": 6.819524684817438e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 400 }, { "completion_length": 647.3888854980469, "epoch": 23.58823529411765, "grad_norm": 0.00316858993220515, "kl": 0.106689453125, "learning_rate": 6.804431700223055e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 401 }, { "completion_length": 641.7222290039062, "epoch": 23.647058823529413, "grad_norm": 0.2542814921684949, "kl": 0.10498046875, "learning_rate": 6.789319787993979e-07, "loss": 0.0005, "reward": 0.7902778089046478, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 402 }, { "completion_length": 671.0555725097656, "epoch": 23.705882352941178, "grad_norm": 0.2475069189153772, "kl": 0.09228515625, "learning_rate": 6.774189106647021e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 403 }, { "completion_length": 667.2778015136719, "epoch": 23.764705882352942, "grad_norm": 0.18710382444397977, "kl": 0.098876953125, "learning_rate": 6.759039814895862e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 404 }, { "completion_length": 652.75, "epoch": 23.823529411764707, "grad_norm": 0.007016922074230612, "kl": 0.111083984375, "learning_rate": 6.743872071649411e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 405 }, { "completion_length": 659.6666870117188, "epoch": 23.88235294117647, "grad_norm": 0.0019855951161394376, "kl": 0.109619140625, "learning_rate": 6.728686036010114e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 406 }, { "completion_length": 679.3194580078125, "epoch": 23.941176470588236, "grad_norm": 0.16166011004993436, "kl": 0.111572265625, "learning_rate": 6.713481867272299e-07, "loss": 0.0004, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 407 }, { "completion_length": 666.0278015136719, "epoch": 24.0, "grad_norm": 0.0023835544635048608, "kl": 0.10107421875, "learning_rate": 6.698259724920502e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 408 }, { "completion_length": 697.7083435058594, "epoch": 24.058823529411764, "grad_norm": 0.15167703500492816, "kl": 0.09033203125, "learning_rate": 6.683019768627794e-07, "loss": 0.0003, "reward": 0.5791666582226753, "reward_std": 0.08758113533258438, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 409 }, { "completion_length": 610.1805725097656, "epoch": 24.11764705882353, "grad_norm": 0.3412714875933119, "kl": 0.1181640625, "learning_rate": 6.667762158254103e-07, "loss": 0.0007, "reward": 0.963888943195343, "reward_std": 0.16741745918989182, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7638888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 410 }, { "completion_length": 619.9027709960938, "epoch": 24.176470588235293, "grad_norm": 0.2878905920691581, "kl": 0.10888671875, "learning_rate": 6.652487053844544e-07, "loss": 0.0004, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 411 }, { "completion_length": 596.8750305175781, "epoch": 24.235294117647058, "grad_norm": 0.28419937837782455, "kl": 0.114990234375, "learning_rate": 6.637194615627732e-07, "loss": 0.0007, "reward": 1.1722223162651062, "reward_std": 0.15436296164989471, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.972222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 412 }, { "completion_length": 703.3194580078125, "epoch": 24.294117647058822, "grad_norm": 0.20831663705630538, "kl": 0.0986328125, "learning_rate": 6.621885004014111e-07, "loss": 0.0003, "reward": 0.42569445818662643, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.225694440305233, "rewards/thinker_reward_func": 0.10000000894069672, "step": 413 }, { "completion_length": 665.2778015136719, "epoch": 24.352941176470587, "grad_norm": 0.001770636350001259, "kl": 0.09521484375, "learning_rate": 6.606558379594261e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 414 }, { "completion_length": 633.0972290039062, "epoch": 24.41176470588235, "grad_norm": 0.0035393543019104053, "kl": 0.1181640625, "learning_rate": 6.59121490313722e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 415 }, { "completion_length": 657.625, "epoch": 24.470588235294116, "grad_norm": 0.002586690959675244, "kl": 0.10595703125, "learning_rate": 6.575854735588794e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 416 }, { "completion_length": 704.2083435058594, "epoch": 24.529411764705884, "grad_norm": 0.036873198613237713, "kl": 0.127685546875, "learning_rate": 6.560478038069872e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 417 }, { "completion_length": 608.0972290039062, "epoch": 24.58823529411765, "grad_norm": 0.005573913454981408, "kl": 0.11376953125, "learning_rate": 6.545084971874736e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 418 }, { "completion_length": 661.0416870117188, "epoch": 24.647058823529413, "grad_norm": 0.20292446761663813, "kl": 0.106689453125, "learning_rate": 6.529675698469369e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 419 }, { "completion_length": 696.8472290039062, "epoch": 24.705882352941178, "grad_norm": 0.0020075801589722734, "kl": 0.09326171875, "learning_rate": 6.514250379489753e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 420 }, { "completion_length": 657.0, "epoch": 24.764705882352942, "grad_norm": 0.011410079673335035, "kl": 0.11669921875, "learning_rate": 6.498809176740189e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 421 }, { "completion_length": 639.75, "epoch": 24.823529411764707, "grad_norm": 0.0032417592983367883, "kl": 0.106689453125, "learning_rate": 6.483352252191584e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 422 }, { "completion_length": 616.4305419921875, "epoch": 24.88235294117647, "grad_norm": 0.0021500960415667533, "kl": 0.10595703125, "learning_rate": 6.467879767979764e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 423 }, { "completion_length": 633.5416870117188, "epoch": 24.941176470588236, "grad_norm": 0.0019294909947153375, "kl": 0.103271484375, "learning_rate": 6.452391886403766e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 424 }, { "completion_length": 646.1388854980469, "epoch": 25.0, "grad_norm": 0.18605123188721717, "kl": 0.10986328125, "learning_rate": 6.436888769924141e-07, "loss": 0.0007, "reward": 0.998611181974411, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 425 }, { "completion_length": 632.8472290039062, "epoch": 25.058823529411764, "grad_norm": 0.16699724627597806, "kl": 0.1123046875, "learning_rate": 6.421370581161243e-07, "loss": 0.0004, "reward": 0.4069444537162781, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 426 }, { "completion_length": 638.3055725097656, "epoch": 25.11764705882353, "grad_norm": 0.0021719543098197725, "kl": 0.104248046875, "learning_rate": 6.405837482893528e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 427 }, { "completion_length": 694.2777709960938, "epoch": 25.176470588235293, "grad_norm": 0.0021858473238625387, "kl": 0.10205078125, "learning_rate": 6.390289638055851e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 428 }, { "completion_length": 691.8888854980469, "epoch": 25.235294117647058, "grad_norm": 0.0016253222443421882, "kl": 0.088134765625, "learning_rate": 6.374727209737742e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 429 }, { "completion_length": 655.9861145019531, "epoch": 25.294117647058822, "grad_norm": 0.003217537344062847, "kl": 0.114990234375, "learning_rate": 6.359150361181714e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 430 }, { "completion_length": 632.9444580078125, "epoch": 25.352941176470587, "grad_norm": 0.00205288047473226, "kl": 0.10400390625, "learning_rate": 6.343559255781537e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 431 }, { "completion_length": 684.3194580078125, "epoch": 25.41176470588235, "grad_norm": 0.2808994205074334, "kl": 0.099853515625, "learning_rate": 6.327954057080526e-07, "loss": 0.0005, "reward": 0.9986112117767334, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.798611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 432 }, { "completion_length": 612.5138854980469, "epoch": 25.470588235294116, "grad_norm": 0.3217810486931272, "kl": 0.106201171875, "learning_rate": 6.312334928769833e-07, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7812500149011612, "rewards/thinker_reward_func": 0.10000000894069672, "step": 433 }, { "completion_length": 608.6944580078125, "epoch": 25.529411764705884, "grad_norm": 0.24380395439957273, "kl": 0.104736328125, "learning_rate": 6.296702034686725e-07, "loss": 0.0007, "reward": 0.998611181974411, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 434 }, { "completion_length": 680.5555725097656, "epoch": 25.58823529411765, "grad_norm": 0.18688841481640184, "kl": 0.1044921875, "learning_rate": 6.281055538812861e-07, "loss": 0.0002, "reward": 0.21736110746860504, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 435 }, { "completion_length": 647.0972290039062, "epoch": 25.647058823529413, "grad_norm": 0.3241237287467168, "kl": 0.10546875, "learning_rate": 6.265395605272581e-07, "loss": 0.0007, "reward": 1.0506945550441742, "reward_std": 0.14123430475592613, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8506944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 436 }, { "completion_length": 604.4444580078125, "epoch": 25.705882352941178, "grad_norm": 0.002666592072744748, "kl": 0.1142578125, "learning_rate": 6.249722398331176e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 437 }, { "completion_length": 609.0278015136719, "epoch": 25.764705882352942, "grad_norm": 0.22390816438800742, "kl": 0.14404296875, "learning_rate": 6.234036082393171e-07, "loss": 0.0007, "reward": 0.8423612117767334, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.642361119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 438 }, { "completion_length": 642.0138854980469, "epoch": 25.823529411764707, "grad_norm": 0.3242948799645708, "kl": 0.096435546875, "learning_rate": 6.218336822000597e-07, "loss": 0.0003, "reward": 0.7902778089046478, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 439 }, { "completion_length": 603.875, "epoch": 25.88235294117647, "grad_norm": 0.0017657308997026833, "kl": 0.09814453125, "learning_rate": 6.202624781831268e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 440 }, { "completion_length": 619.3194580078125, "epoch": 25.941176470588236, "grad_norm": 0.0019310494689285592, "kl": 0.100830078125, "learning_rate": 6.18690012669705e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 441 }, { "completion_length": 647.3055725097656, "epoch": 26.0, "grad_norm": 0.2881799892331985, "kl": 0.10498046875, "learning_rate": 6.171163021542133e-07, "loss": 0.0002, "reward": 0.37361112982034683, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 442 }, { "completion_length": 655.0277709960938, "epoch": 26.058823529411764, "grad_norm": 0.001957239872697614, "kl": 0.101806640625, "learning_rate": 6.155413631441306e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 443 }, { "completion_length": 648.9583435058594, "epoch": 26.11764705882353, "grad_norm": 0.001761609744310305, "kl": 0.108642578125, "learning_rate": 6.139652121598218e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 444 }, { "completion_length": 660.7777709960938, "epoch": 26.176470588235293, "grad_norm": 0.0018503445998393037, "kl": 0.09130859375, "learning_rate": 6.123878657343647e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 445 }, { "completion_length": 624.0000305175781, "epoch": 26.235294117647058, "grad_norm": 0.0019881733233238926, "kl": 0.10107421875, "learning_rate": 6.108093404133772e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 446 }, { "completion_length": 643.7638854980469, "epoch": 26.294117647058822, "grad_norm": 0.0019885302302481453, "kl": 0.100830078125, "learning_rate": 6.092296527548426e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 447 }, { "completion_length": 683.6666870117188, "epoch": 26.352941176470587, "grad_norm": 0.0018009078041427641, "kl": 0.087890625, "learning_rate": 6.076488193289374e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 448 }, { "completion_length": 652.9583435058594, "epoch": 26.41176470588235, "grad_norm": 0.16073482040082243, "kl": 0.09765625, "learning_rate": 6.060668567178559e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 449 }, { "completion_length": 641.8055725097656, "epoch": 26.470588235294116, "grad_norm": 0.00313118623735336, "kl": 0.103271484375, "learning_rate": 6.044837815156376e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 450 }, { "completion_length": 650.625, "epoch": 26.529411764705884, "grad_norm": 0.0022302049171323457, "kl": 0.09765625, "learning_rate": 6.028996103279917e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 451 }, { "completion_length": 615.388916015625, "epoch": 26.58823529411765, "grad_norm": 0.001992659578907578, "kl": 0.10107421875, "learning_rate": 6.013143597721251e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 452 }, { "completion_length": 616.2083435058594, "epoch": 26.647058823529413, "grad_norm": 0.22042198583753825, "kl": 0.109130859375, "learning_rate": 5.997280464765653e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 453 }, { "completion_length": 584.8055725097656, "epoch": 26.705882352941178, "grad_norm": 0.0034713108536116918, "kl": 0.117431640625, "learning_rate": 5.981406870809888e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 454 }, { "completion_length": 576.5277709960938, "epoch": 26.764705882352942, "grad_norm": 0.0026772774376308157, "kl": 0.103515625, "learning_rate": 5.96552298236044e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 455 }, { "completion_length": 596.6111450195312, "epoch": 26.823529411764707, "grad_norm": 0.0031820535093156433, "kl": 0.103759765625, "learning_rate": 5.949628966031784e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 456 }, { "completion_length": 600.8750305175781, "epoch": 26.88235294117647, "grad_norm": 0.22192379747509197, "kl": 0.10498046875, "learning_rate": 5.933724988544632e-07, "loss": 0.0005, "reward": 0.807638980448246, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 457 }, { "completion_length": 586.9583435058594, "epoch": 26.941176470588236, "grad_norm": 0.002448614208604815, "kl": 0.103515625, "learning_rate": 5.91781121672418e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 458 }, { "completion_length": 613.138916015625, "epoch": 27.0, "grad_norm": 0.0020520700947066067, "kl": 0.095458984375, "learning_rate": 5.901887817498367e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 459 }, { "completion_length": 608.1388854980469, "epoch": 27.058823529411764, "grad_norm": 0.2223007819718865, "kl": 0.10791015625, "learning_rate": 5.885954957896115e-07, "loss": 0.0004, "reward": 0.6166206300258636, "reward_std": 0.0001323264732491225, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09995397925376892, "step": 460 }, { "completion_length": 636.9861145019531, "epoch": 27.11764705882353, "grad_norm": 0.2560670909124307, "kl": 0.0966796875, "learning_rate": 5.870012805045579e-07, "loss": 0.0009, "reward": 1.2069445848464966, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0069444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 461 }, { "completion_length": 665.513916015625, "epoch": 27.176470588235293, "grad_norm": 0.2566702251230409, "kl": 0.1572265625, "learning_rate": 5.854061526172401e-07, "loss": 0.0004, "reward": 0.7888889312744141, "reward_std": 0.12509256228804588, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 462 }, { "completion_length": 677.5694274902344, "epoch": 27.235294117647058, "grad_norm": 0.0017615415297775468, "kl": 0.088623046875, "learning_rate": 5.83810128859795e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 463 }, { "completion_length": 650.5833435058594, "epoch": 27.294117647058822, "grad_norm": 0.009641886019789514, "kl": 0.12646484375, "learning_rate": 5.822132259737564e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 464 }, { "completion_length": 622.3194274902344, "epoch": 27.352941176470587, "grad_norm": 0.002180733808314296, "kl": 0.097900390625, "learning_rate": 5.806154607098799e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 465 }, { "completion_length": 655.4305419921875, "epoch": 27.41176470588235, "grad_norm": 0.0022702332243464854, "kl": 0.110107421875, "learning_rate": 5.790168498279671e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 466 }, { "completion_length": 596.6805725097656, "epoch": 27.470588235294116, "grad_norm": 0.26261501412563854, "kl": 0.111572265625, "learning_rate": 5.774174100966899e-07, "loss": 0.0009, "reward": 1.189583420753479, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 467 }, { "completion_length": 597.3194580078125, "epoch": 27.529411764705884, "grad_norm": 0.31540771425004, "kl": 0.117431640625, "learning_rate": 5.75817158293414e-07, "loss": 0.0005, "reward": 0.9986112117767334, "reward_std": 0.12028130888938904, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 468 }, { "completion_length": 651.8611145019531, "epoch": 27.58823529411765, "grad_norm": 0.23208914836961758, "kl": 0.100341796875, "learning_rate": 5.742161112040236e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 469 }, { "completion_length": 588.4305725097656, "epoch": 27.647058823529413, "grad_norm": 0.21207176789154003, "kl": 0.1220703125, "learning_rate": 5.726142856227452e-07, "loss": 0.0009, "reward": 1.189583420753479, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 470 }, { "completion_length": 669.5833129882812, "epoch": 27.705882352941178, "grad_norm": 0.0019046926141114045, "kl": 0.095947265625, "learning_rate": 5.710116983519711e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 471 }, { "completion_length": 653.8194580078125, "epoch": 27.764705882352942, "grad_norm": 0.0028361473283831326, "kl": 0.105224609375, "learning_rate": 5.694083662020834e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 472 }, { "completion_length": 671.7777709960938, "epoch": 27.823529411764707, "grad_norm": 0.20778025725420532, "kl": 0.100830078125, "learning_rate": 5.678043059912776e-07, "loss": 0.0003, "reward": 0.5472222417593002, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3472222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 473 }, { "completion_length": 682.388916015625, "epoch": 27.88235294117647, "grad_norm": 0.14682938299647352, "kl": 0.1064453125, "learning_rate": 5.661995345453866e-07, "loss": 0.0007, "reward": 0.8236111998558044, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 474 }, { "completion_length": 622.0277709960938, "epoch": 27.941176470588236, "grad_norm": 0.002733347622431441, "kl": 0.107666015625, "learning_rate": 5.645940686977032e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 475 }, { "completion_length": 607.9861145019531, "epoch": 28.0, "grad_norm": 0.002596559406231835, "kl": 0.1103515625, "learning_rate": 5.629879252888045e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 476 }, { "completion_length": 637.4028015136719, "epoch": 28.058823529411764, "grad_norm": 0.190681939686207, "kl": 0.1044921875, "learning_rate": 5.61381121166375e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 477 }, { "completion_length": 620.1944274902344, "epoch": 28.11764705882353, "grad_norm": 0.0021141105376722637, "kl": 0.11181640625, "learning_rate": 5.597736731850294e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 478 }, { "completion_length": 655.375, "epoch": 28.176470588235293, "grad_norm": 0.13746329555276007, "kl": 0.09765625, "learning_rate": 5.581655982061366e-07, "loss": 0.0007, "reward": 0.9958333671092987, "reward_std": 0.08758113533258438, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 479 }, { "completion_length": 647.638916015625, "epoch": 28.235294117647058, "grad_norm": 0.19091447289095045, "kl": 0.109619140625, "learning_rate": 5.565569130976422e-07, "loss": 0.0004, "reward": 0.44305555522441864, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2430555522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 480 }, { "completion_length": 648.2777709960938, "epoch": 28.294117647058822, "grad_norm": 0.002081272480843331, "kl": 0.098388671875, "learning_rate": 5.549476347338913e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 481 }, { "completion_length": 686.0000305175781, "epoch": 28.352941176470587, "grad_norm": 0.0017353978944917054, "kl": 0.09228515625, "learning_rate": 5.533377799954531e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 482 }, { "completion_length": 707.2916564941406, "epoch": 28.41176470588235, "grad_norm": 0.0022405292643790436, "kl": 0.095947265625, "learning_rate": 5.517273657689418e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 483 }, { "completion_length": 652.9861450195312, "epoch": 28.470588235294116, "grad_norm": 0.19296159262581897, "kl": 0.1064453125, "learning_rate": 5.501164089468405e-07, "loss": 0.0005, "reward": 0.7381944954395294, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5381944328546524, "rewards/thinker_reward_func": 0.10000000894069672, "step": 484 }, { "completion_length": 676.5555725097656, "epoch": 28.529411764705884, "grad_norm": 0.0025838091331138696, "kl": 0.105712890625, "learning_rate": 5.485049264273241e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 485 }, { "completion_length": 672.5972290039062, "epoch": 28.58823529411765, "grad_norm": 0.001962802006699604, "kl": 0.10009765625, "learning_rate": 5.468929351140815e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 486 }, { "completion_length": 694.0416870117188, "epoch": 28.647058823529413, "grad_norm": 0.001675330852692169, "kl": 0.09130859375, "learning_rate": 5.452804519161389e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 487 }, { "completion_length": 615.3055419921875, "epoch": 28.705882352941178, "grad_norm": 0.003025141907171453, "kl": 0.115966796875, "learning_rate": 5.436674937476819e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 488 }, { "completion_length": 688.5277709960938, "epoch": 28.764705882352942, "grad_norm": 0.15125391390732226, "kl": 0.098388671875, "learning_rate": 5.420540775278788e-07, "loss": 0.0005, "reward": 0.6138889342546463, "reward_std": 0.006487491074949503, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 489 }, { "completion_length": 718.8333435058594, "epoch": 28.823529411764707, "grad_norm": 0.26774710058264256, "kl": 0.111572265625, "learning_rate": 5.404402201807021e-07, "loss": 0.0009, "reward": 1.1868056058883667, "reward_std": 0.1007097833789885, "rewards/format_reward_func": 0.0972222313284874, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 490 }, { "completion_length": 616.4305725097656, "epoch": 28.88235294117647, "grad_norm": 0.002022713991621201, "kl": 0.106689453125, "learning_rate": 5.388259386347517e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 491 }, { "completion_length": 622.5138854980469, "epoch": 28.941176470588236, "grad_norm": 0.0018558776027490403, "kl": 0.09814453125, "learning_rate": 5.37211249823077e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 492 }, { "completion_length": 648.6805419921875, "epoch": 29.0, "grad_norm": 0.002108460737867919, "kl": 0.1005859375, "learning_rate": 5.355961706829997e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 493 }, { "completion_length": 660.2222290039062, "epoch": 29.058823529411764, "grad_norm": 0.002240302186550476, "kl": 0.1044921875, "learning_rate": 5.339807181559358e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 494 }, { "completion_length": 695.4583435058594, "epoch": 29.11764705882353, "grad_norm": 0.0020254632215315994, "kl": 0.095458984375, "learning_rate": 5.323649091872178e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 495 }, { "completion_length": 665.1111145019531, "epoch": 29.176470588235293, "grad_norm": 0.0017416176015752143, "kl": 0.0966796875, "learning_rate": 5.307487607259174e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 496 }, { "completion_length": 608.3611145019531, "epoch": 29.235294117647058, "grad_norm": 0.00216574595796518, "kl": 0.10791015625, "learning_rate": 5.291322897246668e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 497 }, { "completion_length": 659.1666564941406, "epoch": 29.294117647058822, "grad_norm": 0.0018165870424325708, "kl": 0.1064453125, "learning_rate": 5.275155131394824e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 498 }, { "completion_length": 729.513916015625, "epoch": 29.352941176470587, "grad_norm": 0.0016636911702198596, "kl": 0.08935546875, "learning_rate": 5.258984479295852e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 499 }, { "completion_length": 650.2361145019531, "epoch": 29.41176470588235, "grad_norm": 0.24569178017158996, "kl": 0.097412109375, "learning_rate": 5.242811110572242e-07, "loss": 0.0005, "reward": 0.807638980448246, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 500 }, { "completion_length": 680.9305725097656, "epoch": 29.470588235294116, "grad_norm": 0.0034246734375818176, "kl": 0.10693359375, "learning_rate": 5.226635194874977e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 501 }, { "completion_length": 624.2361145019531, "epoch": 29.529411764705884, "grad_norm": 0.0032017469837379155, "kl": 0.10986328125, "learning_rate": 5.21045690188176e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 502 }, { "completion_length": 595.2361145019531, "epoch": 29.58823529411765, "grad_norm": 0.0023016787060217885, "kl": 0.111328125, "learning_rate": 5.19427640129523e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 503 }, { "completion_length": 655.4861145019531, "epoch": 29.647058823529413, "grad_norm": 0.0026725756351397723, "kl": 0.10205078125, "learning_rate": 5.178093862841178e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 504 }, { "completion_length": 697.4166870117188, "epoch": 29.705882352941178, "grad_norm": 0.0038210205913215504, "kl": 0.09765625, "learning_rate": 5.16190945626678e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 505 }, { "completion_length": 671.5694580078125, "epoch": 29.764705882352942, "grad_norm": 0.1599138726806641, "kl": 0.10693359375, "learning_rate": 5.145723351338798e-07, "loss": 0.0005, "reward": 0.6152777448296547, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 506 }, { "completion_length": 607.5555725097656, "epoch": 29.823529411764707, "grad_norm": 0.002173180865295566, "kl": 0.107421875, "learning_rate": 5.129535717841818e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 507 }, { "completion_length": 632.2916564941406, "epoch": 29.88235294117647, "grad_norm": 0.0026249257010622566, "kl": 0.109375, "learning_rate": 5.11334672557645e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 508 }, { "completion_length": 684.3333435058594, "epoch": 29.941176470588236, "grad_norm": 0.30186491360180046, "kl": 0.092041015625, "learning_rate": 5.097156544357567e-07, "loss": 0.0001, "reward": 0.5645833387970924, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3645833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 509 }, { "completion_length": 647.8194274902344, "epoch": 30.0, "grad_norm": 0.1889393768376545, "kl": 0.095458984375, "learning_rate": 5.080965344012508e-07, "loss": 0.0005, "reward": 0.73819450289011, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.538194477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 510 }, { "completion_length": 665.5555725097656, "epoch": 30.058823529411764, "grad_norm": 0.13324965252817114, "kl": 0.0986328125, "learning_rate": 5.064773294379302e-07, "loss": 0.0007, "reward": 0.8236111402511597, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 511 }, { "completion_length": 599.3750305175781, "epoch": 30.11764705882353, "grad_norm": 0.23068585562151334, "kl": 0.10546875, "learning_rate": 5.048580565304886e-07, "loss": 0.001, "reward": 1.3979167938232422, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.1979166865348816, "rewards/thinker_reward_func": 0.10000000894069672, "step": 512 }, { "completion_length": 646.4305725097656, "epoch": 30.176470588235293, "grad_norm": 0.011929141526141301, "kl": 0.140380859375, "learning_rate": 5.03238732664333e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 513 }, { "completion_length": 619.3055725097656, "epoch": 30.235294117647058, "grad_norm": 0.002897347592911484, "kl": 0.101318359375, "learning_rate": 5.016193748254044e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 514 }, { "completion_length": 630.6805725097656, "epoch": 30.294117647058822, "grad_norm": 0.002013641183799416, "kl": 0.099853515625, "learning_rate": 5e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 515 }, { "completion_length": 622.9722290039062, "epoch": 30.352941176470587, "grad_norm": 0.002544434166809688, "kl": 0.108642578125, "learning_rate": 4.983806251745957e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 516 }, { "completion_length": 672.875, "epoch": 30.41176470588235, "grad_norm": 0.013303074029338629, "kl": 0.119384765625, "learning_rate": 4.967612673356669e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 517 }, { "completion_length": 667.0416870117188, "epoch": 30.470588235294116, "grad_norm": 0.22243187276644674, "kl": 0.102783203125, "learning_rate": 4.951419434695113e-07, "loss": 0.0005, "reward": 0.807638980448246, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 518 }, { "completion_length": 686.3888854980469, "epoch": 30.529411764705884, "grad_norm": 0.19489619287202017, "kl": 0.083984375, "learning_rate": 4.935226705620699e-07, "loss": 0.0005, "reward": 0.6152778267860413, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 519 }, { "completion_length": 688.125, "epoch": 30.58823529411765, "grad_norm": 0.20985957434755936, "kl": 0.091552734375, "learning_rate": 4.919034655987492e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 520 }, { "completion_length": 666.3055725097656, "epoch": 30.647058823529413, "grad_norm": 0.002039650913683395, "kl": 0.089111328125, "learning_rate": 4.902843455642433e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 521 }, { "completion_length": 667.8888854980469, "epoch": 30.705882352941178, "grad_norm": 0.0017192657512800436, "kl": 0.089599609375, "learning_rate": 4.88665327442355e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 522 }, { "completion_length": 633.5694580078125, "epoch": 30.764705882352942, "grad_norm": 0.0027172885095982056, "kl": 0.098876953125, "learning_rate": 4.870464282158184e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 523 }, { "completion_length": 653.2083435058594, "epoch": 30.823529411764707, "grad_norm": 0.15117181002656124, "kl": 0.087158203125, "learning_rate": 4.854276648661202e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 524 }, { "completion_length": 629.9027709960938, "epoch": 30.88235294117647, "grad_norm": 0.0020277602109898324, "kl": 0.09130859375, "learning_rate": 4.838090543733221e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 525 }, { "completion_length": 616.2222290039062, "epoch": 30.941176470588236, "grad_norm": 0.002386767401156888, "kl": 0.105712890625, "learning_rate": 4.821906137158821e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 526 }, { "completion_length": 689.5972290039062, "epoch": 31.0, "grad_norm": 0.23926575173342943, "kl": 0.093505859375, "learning_rate": 4.805723598704771e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 527 }, { "completion_length": 670.5277709960938, "epoch": 31.058823529411764, "grad_norm": 0.2051831443568084, "kl": 0.0927734375, "learning_rate": 4.789543098118241e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 528 }, { "completion_length": 619.0694580078125, "epoch": 31.11764705882353, "grad_norm": 0.00391976449402006, "kl": 0.095703125, "learning_rate": 4.773364805125024e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 529 }, { "completion_length": 630.3750305175781, "epoch": 31.176470588235293, "grad_norm": 0.1807515989142006, "kl": 0.08935546875, "learning_rate": 4.75718888942776e-07, "loss": 0.0003, "reward": 0.49513889849185944, "reward_std": 0.1072767972946167, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2951388880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 530 }, { "completion_length": 674.7778015136719, "epoch": 31.235294117647058, "grad_norm": 0.0022751103913034166, "kl": 0.088134765625, "learning_rate": 4.7410155207041476e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 531 }, { "completion_length": 579.25, "epoch": 31.294117647058822, "grad_norm": 0.003047946567417057, "kl": 0.099365234375, "learning_rate": 4.7248448686051753e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 532 }, { "completion_length": 618.2777709960938, "epoch": 31.352941176470587, "grad_norm": 0.21816526338808423, "kl": 0.107177734375, "learning_rate": 4.708677102753331e-07, "loss": 0.0007, "reward": 1.0145833492279053, "reward_std": 0.05989522486925125, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 533 }, { "completion_length": 605.0278015136719, "epoch": 31.41176470588235, "grad_norm": 0.0025557376980021493, "kl": 0.083740234375, "learning_rate": 4.692512392740826e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 534 }, { "completion_length": 653.8194274902344, "epoch": 31.470588235294116, "grad_norm": 0.0019064482914519587, "kl": 0.09716796875, "learning_rate": 4.676350908127821e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 535 }, { "completion_length": 603.0694580078125, "epoch": 31.529411764705884, "grad_norm": 0.20694022326692277, "kl": 0.096435546875, "learning_rate": 4.6601928184406407e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 536 }, { "completion_length": 615.7361145019531, "epoch": 31.58823529411765, "grad_norm": 0.0020267779216497314, "kl": 0.099365234375, "learning_rate": 4.6440382931700025e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 537 }, { "completion_length": 624.9305419921875, "epoch": 31.647058823529413, "grad_norm": 0.003155810021294478, "kl": 0.10009765625, "learning_rate": 4.6278875017692305e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 538 }, { "completion_length": 588.8055725097656, "epoch": 31.705882352941178, "grad_norm": 0.19742489440756536, "kl": 0.114013671875, "learning_rate": 4.611740613652484e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 539 }, { "completion_length": 610.8194580078125, "epoch": 31.764705882352942, "grad_norm": 0.0032728624019634084, "kl": 0.104736328125, "learning_rate": 4.595597798192979e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 540 }, { "completion_length": 602.625, "epoch": 31.823529411764707, "grad_norm": 0.3001098253810415, "kl": 0.103759765625, "learning_rate": 4.5794592247212115e-07, "loss": 0.0007, "reward": 1.1548397541046143, "reward_std": 0.1627010852098465, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.9548611342906952, "rewards/thinker_reward_func": 0.09997858107089996, "step": 541 }, { "completion_length": 641.5694580078125, "epoch": 31.88235294117647, "grad_norm": 0.001958400220476122, "kl": 0.084716796875, "learning_rate": 4.56332506252318e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 542 }, { "completion_length": 588.5, "epoch": 31.941176470588236, "grad_norm": 0.20853295865253144, "kl": 0.108154296875, "learning_rate": 4.547195480838611e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 543 }, { "completion_length": 650.8611145019531, "epoch": 32.0, "grad_norm": 0.005624258369141414, "kl": 0.089599609375, "learning_rate": 4.5310706488591854e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 544 }, { "completion_length": 635.4027709960938, "epoch": 32.05882352941177, "grad_norm": 0.0019251194114456517, "kl": 0.091064453125, "learning_rate": 4.5149507357267597e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 545 }, { "completion_length": 639.7083129882812, "epoch": 32.11764705882353, "grad_norm": 0.21392752388291783, "kl": 0.1005859375, "learning_rate": 4.498835910531595e-07, "loss": 0.0007, "reward": 0.9812500476837158, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.10000000894069672, "step": 546 }, { "completion_length": 642.4027709960938, "epoch": 32.1764705882353, "grad_norm": 0.2168764966796085, "kl": 0.090576171875, "learning_rate": 4.4827263423105815e-07, "loss": 0.0007, "reward": 1.0319445133209229, "reward_std": 0.004811250604689121, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 547 }, { "completion_length": 632.263916015625, "epoch": 32.23529411764706, "grad_norm": 0.00199180324915503, "kl": 0.094970703125, "learning_rate": 4.466622200045468e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 548 }, { "completion_length": 680.7778015136719, "epoch": 32.294117647058826, "grad_norm": 0.265267638965584, "kl": 0.09228515625, "learning_rate": 4.4505236526610856e-07, "loss": 0.0005, "reward": 0.9972222745418549, "reward_std": 0.12509256973862648, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 549 }, { "completion_length": 600.4027709960938, "epoch": 32.35294117647059, "grad_norm": 0.00209139620157151, "kl": 0.094482421875, "learning_rate": 4.434430869023579e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 550 }, { "completion_length": 626.375, "epoch": 32.411764705882355, "grad_norm": 0.0018864622735923619, "kl": 0.100830078125, "learning_rate": 4.418344017938633e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 551 }, { "completion_length": 555.4166564941406, "epoch": 32.470588235294116, "grad_norm": 0.006035161683808028, "kl": 0.108642578125, "learning_rate": 4.4022632681497056e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 552 }, { "completion_length": 660.8611145019531, "epoch": 32.529411764705884, "grad_norm": 0.013534998130227686, "kl": 0.125244140625, "learning_rate": 4.3861887883362505e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 553 }, { "completion_length": 644.5555725097656, "epoch": 32.588235294117645, "grad_norm": 0.0019870643004086647, "kl": 0.094482421875, "learning_rate": 4.370120747111955e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 554 }, { "completion_length": 599.2083435058594, "epoch": 32.64705882352941, "grad_norm": 0.34931976799975545, "kl": 0.12060546875, "learning_rate": 4.354059313022969e-07, "loss": 0.0002, "reward": 0.40694449096918106, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 555 }, { "completion_length": 639.8194580078125, "epoch": 32.705882352941174, "grad_norm": 0.0033315760457455334, "kl": 0.1015625, "learning_rate": 4.3380046545461357e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 556 }, { "completion_length": 572.3194274902344, "epoch": 32.76470588235294, "grad_norm": 0.18940697901582593, "kl": 0.10986328125, "learning_rate": 4.321956940087224e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 557 }, { "completion_length": 670.0416564941406, "epoch": 32.8235294117647, "grad_norm": 0.0023290395283508077, "kl": 0.08837890625, "learning_rate": 4.305916337979167e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 558 }, { "completion_length": 616.3055725097656, "epoch": 32.88235294117647, "grad_norm": 0.27160918955014524, "kl": 0.109130859375, "learning_rate": 4.289883016480291e-07, "loss": 0.0007, "reward": 0.8423611521720886, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.642361119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 559 }, { "completion_length": 568.1527709960938, "epoch": 32.94117647058823, "grad_norm": 0.0027362535606609514, "kl": 0.109130859375, "learning_rate": 4.2738571437725496e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 560 }, { "completion_length": 609.7361145019531, "epoch": 33.0, "grad_norm": 0.004432988631188706, "kl": 0.10205078125, "learning_rate": 4.257838887959763e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 561 }, { "completion_length": 628.9305725097656, "epoch": 33.05882352941177, "grad_norm": 0.20919133233438908, "kl": 0.1044921875, "learning_rate": 4.2418284170658595e-07, "loss": 0.0005, "reward": 0.8249889314174652, "reward_std": 3.849327549687587e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.0999888963997364, "step": 562 }, { "completion_length": 595.0416870117188, "epoch": 33.11764705882353, "grad_norm": 0.31927353395589253, "kl": 0.111572265625, "learning_rate": 4.2258258990331007e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 563 }, { "completion_length": 586.3055419921875, "epoch": 33.1764705882353, "grad_norm": 0.003671442084196311, "kl": 0.107177734375, "learning_rate": 4.209831501720328e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 564 }, { "completion_length": 640.6528015136719, "epoch": 33.23529411764706, "grad_norm": 0.003947502012303677, "kl": 0.097900390625, "learning_rate": 4.193845392901201e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 565 }, { "completion_length": 619.6527709960938, "epoch": 33.294117647058826, "grad_norm": 0.012711792419105116, "kl": 0.114501953125, "learning_rate": 4.177867740262436e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 566 }, { "completion_length": 636.7361145019531, "epoch": 33.35294117647059, "grad_norm": 0.0031029987756315013, "kl": 0.093994140625, "learning_rate": 4.1618987114020495e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 567 }, { "completion_length": 589.3194580078125, "epoch": 33.411764705882355, "grad_norm": 0.006752858817932837, "kl": 0.1103515625, "learning_rate": 4.145938473827598e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 568 }, { "completion_length": 613.9444580078125, "epoch": 33.470588235294116, "grad_norm": 0.0022640209041260738, "kl": 0.09619140625, "learning_rate": 4.129987194954421e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 569 }, { "completion_length": 612.7916870117188, "epoch": 33.529411764705884, "grad_norm": 0.001972209231063971, "kl": 0.09423828125, "learning_rate": 4.1140450421038866e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 570 }, { "completion_length": 648.3472290039062, "epoch": 33.588235294117645, "grad_norm": 0.004777812450187534, "kl": 0.0947265625, "learning_rate": 4.098112182501633e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 571 }, { "completion_length": 589.3333435058594, "epoch": 33.64705882352941, "grad_norm": 0.002676543414024782, "kl": 0.11181640625, "learning_rate": 4.0821887832758194e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 572 }, { "completion_length": 604.513916015625, "epoch": 33.705882352941174, "grad_norm": 0.003126305966525678, "kl": 0.100830078125, "learning_rate": 4.0662750114553685e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 573 }, { "completion_length": 602.7361145019531, "epoch": 33.76470588235294, "grad_norm": 0.2287417085182106, "kl": 0.1025390625, "learning_rate": 4.050371033968215e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 574 }, { "completion_length": 616.6388854980469, "epoch": 33.8235294117647, "grad_norm": 0.002120225899457506, "kl": 0.09716796875, "learning_rate": 4.0344770176395606e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 575 }, { "completion_length": 613.6805725097656, "epoch": 33.88235294117647, "grad_norm": 0.002229871639512088, "kl": 0.09033203125, "learning_rate": 4.018593129190113e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 576 }, { "completion_length": 655.9861145019531, "epoch": 33.94117647058823, "grad_norm": 0.0024283426734427777, "kl": 0.103271484375, "learning_rate": 4.0027195352343456e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 577 }, { "completion_length": 632.1805419921875, "epoch": 34.0, "grad_norm": 0.0019608466884738986, "kl": 0.092041015625, "learning_rate": 3.98685640227875e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 578 }, { "completion_length": 632.3888854980469, "epoch": 34.05882352941177, "grad_norm": 0.23082793678123312, "kl": 0.10205078125, "learning_rate": 3.971003896720082e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 579 }, { "completion_length": 593.1944580078125, "epoch": 34.11764705882353, "grad_norm": 0.24787015815373223, "kl": 0.093017578125, "learning_rate": 3.955162184843624e-07, "loss": 0.0007, "reward": 0.842361181974411, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 580 }, { "completion_length": 652.3194580078125, "epoch": 34.1764705882353, "grad_norm": 0.1986313099378057, "kl": 0.0966796875, "learning_rate": 3.93933143282144e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 581 }, { "completion_length": 611.4028015136719, "epoch": 34.23529411764706, "grad_norm": 0.2446370528404323, "kl": 0.09814453125, "learning_rate": 3.923511806710625e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 582 }, { "completion_length": 567.9027709960938, "epoch": 34.294117647058826, "grad_norm": 0.002428257951164011, "kl": 0.110595703125, "learning_rate": 3.907703472451573e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 583 }, { "completion_length": 589.9444274902344, "epoch": 34.35294117647059, "grad_norm": 0.19591986049991308, "kl": 0.09716796875, "learning_rate": 3.8919065958662295e-07, "loss": 0.0004, "reward": 0.6166174784302711, "reward_std": 0.00013619875244330615, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09995080158114433, "step": 584 }, { "completion_length": 620.7638854980469, "epoch": 34.411764705882355, "grad_norm": 0.0019079858252399981, "kl": 0.096923828125, "learning_rate": 3.8761213426563543e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 585 }, { "completion_length": 612.3194580078125, "epoch": 34.470588235294116, "grad_norm": 0.0023054955853117182, "kl": 0.096923828125, "learning_rate": 3.860347878401784e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 586 }, { "completion_length": 569.5694580078125, "epoch": 34.529411764705884, "grad_norm": 0.0025027601430914633, "kl": 0.10791015625, "learning_rate": 3.844586368558694e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 587 }, { "completion_length": 598.375, "epoch": 34.588235294117645, "grad_norm": 0.2761684290609445, "kl": 0.118408203125, "learning_rate": 3.828836978457867e-07, "loss": 0.0005, "reward": 0.6165714263916016, "reward_std": 0.00032991368789225817, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09990477189421654, "step": 588 }, { "completion_length": 577.1111450195312, "epoch": 34.64705882352941, "grad_norm": 0.32379840125115616, "kl": 0.121826171875, "learning_rate": 3.813099873302951e-07, "loss": 0.0005, "reward": 0.9986112117767334, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 589 }, { "completion_length": 634.8611145019531, "epoch": 34.705882352941174, "grad_norm": 0.272652934354164, "kl": 0.093505859375, "learning_rate": 3.7973752181687327e-07, "loss": 0.0007, "reward": 1.189583420753479, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.9895833432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 590 }, { "completion_length": 660.9861145019531, "epoch": 34.76470588235294, "grad_norm": 0.0022578281105513575, "kl": 0.09326171875, "learning_rate": 3.781663177999401e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 591 }, { "completion_length": 618.3888854980469, "epoch": 34.8235294117647, "grad_norm": 0.002194757002170407, "kl": 0.10107421875, "learning_rate": 3.765963917606828e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 592 }, { "completion_length": 682.7361145019531, "epoch": 34.88235294117647, "grad_norm": 0.18480731492219765, "kl": 0.088134765625, "learning_rate": 3.750277601668823e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 593 }, { "completion_length": 641.0694274902344, "epoch": 34.94117647058823, "grad_norm": 0.003602722393046684, "kl": 0.099365234375, "learning_rate": 3.7346043947274186e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 594 }, { "completion_length": 600.2222290039062, "epoch": 35.0, "grad_norm": 0.23857227083957477, "kl": 0.107666015625, "learning_rate": 3.718944461187138e-07, "loss": 0.0007, "reward": 1.0131945312023163, "reward_std": 0.06976316124200821, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.09861112385988235, "step": 595 }, { "completion_length": 619.0000305175781, "epoch": 35.05882352941177, "grad_norm": 0.0030576997536905814, "kl": 0.10400390625, "learning_rate": 3.7032979653132747e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 596 }, { "completion_length": 649.6944580078125, "epoch": 35.11764705882353, "grad_norm": 0.0020251134392101785, "kl": 0.095703125, "learning_rate": 3.6876650712301647e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 597 }, { "completion_length": 605.7222290039062, "epoch": 35.1764705882353, "grad_norm": 0.2959275326352605, "kl": 0.116943359375, "learning_rate": 3.6720459429194737e-07, "loss": 0.0002, "reward": 0.599278599023819, "reward_std": 0.060234132601181045, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.09997302666306496, "step": 598 }, { "completion_length": 669.5833435058594, "epoch": 35.23529411764706, "grad_norm": 0.003913594253239374, "kl": 0.101806640625, "learning_rate": 3.656440744218464e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 599 }, { "completion_length": 577.25, "epoch": 35.294117647058826, "grad_norm": 0.0022738785106000367, "kl": 0.109619140625, "learning_rate": 3.640849638818285e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 600 }, { "completion_length": 618.6944580078125, "epoch": 35.35294117647059, "grad_norm": 0.3132359846297209, "kl": 0.10791015625, "learning_rate": 3.625272790262257e-07, "loss": 0.0007, "reward": 1.206944614648819, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0069444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 601 }, { "completion_length": 684.6666564941406, "epoch": 35.411764705882355, "grad_norm": 0.24808406023578863, "kl": 0.09375, "learning_rate": 3.60971036194415e-07, "loss": 0.0005, "reward": 0.807632565498352, "reward_std": 0.060162645033415174, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.09999365732073784, "step": 602 }, { "completion_length": 627.6527709960938, "epoch": 35.470588235294116, "grad_norm": 0.15273399292450637, "kl": 0.10498046875, "learning_rate": 3.594162517106472e-07, "loss": 0.0007, "reward": 0.8236111998558044, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 603 }, { "completion_length": 680.6805725097656, "epoch": 35.529411764705884, "grad_norm": 0.0018198937573433056, "kl": 0.093994140625, "learning_rate": 3.578629418838757e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 604 }, { "completion_length": 637.9722290039062, "epoch": 35.588235294117645, "grad_norm": 0.002578814545656273, "kl": 0.098876953125, "learning_rate": 3.563111230075859e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 605 }, { "completion_length": 662.1944580078125, "epoch": 35.64705882352941, "grad_norm": 0.18532074357814848, "kl": 0.094482421875, "learning_rate": 3.547608113596233e-07, "loss": 0.0007, "reward": 0.998611181974411, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 606 }, { "completion_length": 608.8055419921875, "epoch": 35.705882352941174, "grad_norm": 0.002285577019814061, "kl": 0.11767578125, "learning_rate": 3.532120232020236e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 607 }, { "completion_length": 658.0555725097656, "epoch": 35.76470588235294, "grad_norm": 0.002262120916868422, "kl": 0.094482421875, "learning_rate": 3.516647747808417e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 608 }, { "completion_length": 614.2777709960938, "epoch": 35.8235294117647, "grad_norm": 0.0026384606322782263, "kl": 0.11279296875, "learning_rate": 3.501190823259812e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 609 }, { "completion_length": 606.2916870117188, "epoch": 35.88235294117647, "grad_norm": 0.24404000700045048, "kl": 0.104248046875, "learning_rate": 3.485749620510247e-07, "loss": 0.0007, "reward": 0.824965164065361, "reward_std": 0.00012096853606635705, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.09996508806943893, "step": 610 }, { "completion_length": 595.888916015625, "epoch": 35.94117647058823, "grad_norm": 0.001937343509427642, "kl": 0.11474609375, "learning_rate": 3.470324301530631e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 611 }, { "completion_length": 612.4722290039062, "epoch": 36.0, "grad_norm": 0.0023882640809025123, "kl": 0.11328125, "learning_rate": 3.454915028125263e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 612 }, { "completion_length": 677.125, "epoch": 36.05882352941177, "grad_norm": 0.002769727882292675, "kl": 0.104248046875, "learning_rate": 3.4395219619301285e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 613 }, { "completion_length": 627.3472290039062, "epoch": 36.11764705882353, "grad_norm": 0.17325942253586007, "kl": 0.103759765625, "learning_rate": 3.424145264411208e-07, "loss": 0.0007, "reward": 0.9986112117767334, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.798611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 614 }, { "completion_length": 635.1666564941406, "epoch": 36.1764705882353, "grad_norm": 0.22970408578198406, "kl": 0.103759765625, "learning_rate": 3.408785096862782e-07, "loss": 0.0003, "reward": 0.5993056371808052, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 615 }, { "completion_length": 614.263916015625, "epoch": 36.23529411764706, "grad_norm": 0.0024137481369346943, "kl": 0.11279296875, "learning_rate": 3.393441620405739e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 616 }, { "completion_length": 684.6944580078125, "epoch": 36.294117647058826, "grad_norm": 0.18131506320305146, "kl": 0.093505859375, "learning_rate": 3.378114995985889e-07, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 617 }, { "completion_length": 687.7638854980469, "epoch": 36.35294117647059, "grad_norm": 0.0018767209802869938, "kl": 0.093994140625, "learning_rate": 3.362805384372267e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 618 }, { "completion_length": 657.138916015625, "epoch": 36.411764705882355, "grad_norm": 0.2535356146239037, "kl": 0.11181640625, "learning_rate": 3.3475129461554566e-07, "loss": 0.0005, "reward": 0.8048611879348755, "reward_std": 0.06662814924493432, "rewards/format_reward_func": 0.0972222350537777, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 619 }, { "completion_length": 638.4305725097656, "epoch": 36.470588235294116, "grad_norm": 0.0029645261527679644, "kl": 0.121337890625, "learning_rate": 3.3322378417458977e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 620 }, { "completion_length": 629.5277709960938, "epoch": 36.529411764705884, "grad_norm": 0.19881850527719166, "kl": 0.10595703125, "learning_rate": 3.3169802313722073e-07, "loss": 0.0004, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 621 }, { "completion_length": 656.9028015136719, "epoch": 36.588235294117645, "grad_norm": 0.001885166944371476, "kl": 0.10009765625, "learning_rate": 3.301740275079497e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 622 }, { "completion_length": 602.1388854980469, "epoch": 36.64705882352941, "grad_norm": 0.0044868698789657276, "kl": 0.13525390625, "learning_rate": 3.2865181327277005e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 623 }, { "completion_length": 612.7916870117188, "epoch": 36.705882352941174, "grad_norm": 0.0023277770232076277, "kl": 0.11181640625, "learning_rate": 3.2713139639898854e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 624 }, { "completion_length": 657.3194580078125, "epoch": 36.76470588235294, "grad_norm": 0.22309208617612847, "kl": 0.099609375, "learning_rate": 3.2561279283505884e-07, "loss": 0.0002, "reward": 0.21736110746860504, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 625 }, { "completion_length": 658.1527709960938, "epoch": 36.8235294117647, "grad_norm": 0.23527255722332502, "kl": 0.1025390625, "learning_rate": 3.240960185104137e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 626 }, { "completion_length": 608.3750305175781, "epoch": 36.88235294117647, "grad_norm": 0.0033399661101758743, "kl": 0.1103515625, "learning_rate": 3.2258108933529805e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 627 }, { "completion_length": 604.5555725097656, "epoch": 36.94117647058823, "grad_norm": 0.0026070009227683482, "kl": 0.107421875, "learning_rate": 3.2106802120060194e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 628 }, { "completion_length": 663.1388854980469, "epoch": 37.0, "grad_norm": 0.3010747146119333, "kl": 0.106689453125, "learning_rate": 3.1955682997769447e-07, "loss": 0.0003, "reward": 0.6166666746139526, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 629 }, { "completion_length": 620.0277709960938, "epoch": 37.05882352941177, "grad_norm": 0.20958135415135792, "kl": 0.111083984375, "learning_rate": 3.1804753151825627e-07, "loss": 0.0004, "reward": 0.42569445818662643, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.225694440305233, "rewards/thinker_reward_func": 0.10000000894069672, "step": 630 }, { "completion_length": 593.4027709960938, "epoch": 37.11764705882353, "grad_norm": 0.0022570421858214137, "kl": 0.112060546875, "learning_rate": 3.16540141654114e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 631 }, { "completion_length": 642.5, "epoch": 37.1764705882353, "grad_norm": 0.0036044458243788256, "kl": 0.10498046875, "learning_rate": 3.15034676197074e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 632 }, { "completion_length": 667.5694580078125, "epoch": 37.23529411764706, "grad_norm": 0.2891263057919398, "kl": 0.10205078125, "learning_rate": 3.135311509387567e-07, "loss": 0.0002, "reward": 0.5819444730877876, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 633 }, { "completion_length": 706.0833435058594, "epoch": 37.294117647058826, "grad_norm": 0.0064162256979646, "kl": 0.098388671875, "learning_rate": 3.120295816504305e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 634 }, { "completion_length": 639.6111450195312, "epoch": 37.35294117647059, "grad_norm": 0.0022751727482567194, "kl": 0.11181640625, "learning_rate": 3.105299840828466e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 635 }, { "completion_length": 594.3611145019531, "epoch": 37.411764705882355, "grad_norm": 0.24719563384454468, "kl": 0.114013671875, "learning_rate": 3.090323739660742e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 636 }, { "completion_length": 624.1666870117188, "epoch": 37.470588235294116, "grad_norm": 0.0018666547519340668, "kl": 0.097900390625, "learning_rate": 3.075367670093344e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 637 }, { "completion_length": 632.6666870117188, "epoch": 37.529411764705884, "grad_norm": 0.19900518752269467, "kl": 0.115478515625, "learning_rate": 3.0604317890083674e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 638 }, { "completion_length": 599.625, "epoch": 37.588235294117645, "grad_norm": 0.002249881894424432, "kl": 0.11279296875, "learning_rate": 3.045516253076137e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 639 }, { "completion_length": 585.8888854980469, "epoch": 37.64705882352941, "grad_norm": 0.0035513091296305297, "kl": 0.122314453125, "learning_rate": 3.030621218753565e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 640 }, { "completion_length": 625.6666870117188, "epoch": 37.705882352941174, "grad_norm": 0.39931410431174796, "kl": 0.11328125, "learning_rate": 3.0157468422825147e-07, "loss": 0.0005, "reward": 0.6152778193354607, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 641 }, { "completion_length": 619.6666564941406, "epoch": 37.76470588235294, "grad_norm": 0.0019216190378345177, "kl": 0.099853515625, "learning_rate": 3.0008932796881546e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 642 }, { "completion_length": 621.7361145019531, "epoch": 37.8235294117647, "grad_norm": 0.0022244393017574887, "kl": 0.10693359375, "learning_rate": 2.9860606867773317e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 643 }, { "completion_length": 614.3333435058594, "epoch": 37.88235294117647, "grad_norm": 0.006738208922204531, "kl": 0.1162109375, "learning_rate": 2.9712492191369244e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 644 }, { "completion_length": 611.8055419921875, "epoch": 37.94117647058823, "grad_norm": 0.008411527849952907, "kl": 0.119384765625, "learning_rate": 2.95645903213222e-07, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 645 }, { "completion_length": 628.875, "epoch": 38.0, "grad_norm": 0.26469395179809024, "kl": 0.113525390625, "learning_rate": 2.9416902809052814e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 646 }, { "completion_length": 649.8333435058594, "epoch": 38.05882352941177, "grad_norm": 0.0024202460584565274, "kl": 0.09619140625, "learning_rate": 2.9269431203733206e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 647 }, { "completion_length": 652.8194274902344, "epoch": 38.11764705882353, "grad_norm": 0.2619418960062034, "kl": 0.11279296875, "learning_rate": 2.9122177052270747e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 648 }, { "completion_length": 636.4861145019531, "epoch": 38.1764705882353, "grad_norm": 0.0052401308207808825, "kl": 0.116943359375, "learning_rate": 2.897514189929177e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 649 }, { "completion_length": 676.3333435058594, "epoch": 38.23529411764706, "grad_norm": 0.001908160771517022, "kl": 0.0947265625, "learning_rate": 2.8828327287125507e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 650 }, { "completion_length": 639.9861450195312, "epoch": 38.294117647058826, "grad_norm": 0.002434993125137898, "kl": 0.10546875, "learning_rate": 2.8681734755787716e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 651 }, { "completion_length": 654.9861145019531, "epoch": 38.35294117647059, "grad_norm": 0.1787234046060118, "kl": 0.095947265625, "learning_rate": 2.853536584296471e-07, "loss": 0.0005, "reward": 0.807638980448246, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 652 }, { "completion_length": 580.375, "epoch": 38.411764705882355, "grad_norm": 0.002098398170107202, "kl": 0.111328125, "learning_rate": 2.8389222083997117e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 653 }, { "completion_length": 630.7361145019531, "epoch": 38.470588235294116, "grad_norm": 0.0026924134221773693, "kl": 0.107177734375, "learning_rate": 2.8243305011863837e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 654 }, { "completion_length": 608.75, "epoch": 38.529411764705884, "grad_norm": 0.0023225748349202727, "kl": 0.109619140625, "learning_rate": 2.8097616157165885e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 655 }, { "completion_length": 620.8611145019531, "epoch": 38.588235294117645, "grad_norm": 0.0019822371709369018, "kl": 0.10595703125, "learning_rate": 2.7952157048110406e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 656 }, { "completion_length": 612.0833435058594, "epoch": 38.64705882352941, "grad_norm": 0.003695115707672888, "kl": 0.116455078125, "learning_rate": 2.7806929210494646e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 657 }, { "completion_length": 636.1666870117188, "epoch": 38.705882352941174, "grad_norm": 0.015380330919353256, "kl": 0.138671875, "learning_rate": 2.766193416768988e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 658 }, { "completion_length": 660.2361450195312, "epoch": 38.76470588235294, "grad_norm": 0.20494418129909475, "kl": 0.090576171875, "learning_rate": 2.751717344062552e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 659 }, { "completion_length": 625.4444580078125, "epoch": 38.8235294117647, "grad_norm": 0.17195622412577904, "kl": 0.098876953125, "learning_rate": 2.7372648547773056e-07, "loss": 0.0007, "reward": 0.9812500476837158, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7812500149011612, "rewards/thinker_reward_func": 0.10000000894069672, "step": 660 }, { "completion_length": 607.3888854980469, "epoch": 38.88235294117647, "grad_norm": 0.22684826241837402, "kl": 0.10546875, "learning_rate": 2.722836100513027e-07, "loss": 0.0004, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 661 }, { "completion_length": 664.0416564941406, "epoch": 38.94117647058823, "grad_norm": 0.0022581914513899078, "kl": 0.099609375, "learning_rate": 2.708431232620516e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 662 }, { "completion_length": 633.0278015136719, "epoch": 39.0, "grad_norm": 0.002224792272369058, "kl": 0.111328125, "learning_rate": 2.6940504022000244e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 663 }, { "completion_length": 604.7083435058594, "epoch": 39.05882352941177, "grad_norm": 0.002303789497383321, "kl": 0.103271484375, "learning_rate": 2.679693760099658e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 664 }, { "completion_length": 660.1111145019531, "epoch": 39.11764705882353, "grad_norm": 0.1370547411562005, "kl": 0.106201171875, "learning_rate": 2.665361456913797e-07, "loss": 0.0005, "reward": 0.6152778267860413, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 665 }, { "completion_length": 667.5833435058594, "epoch": 39.1764705882353, "grad_norm": 0.20430398588284635, "kl": 0.093505859375, "learning_rate": 2.651053642981522e-07, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 666 }, { "completion_length": 587.375, "epoch": 39.23529411764706, "grad_norm": 0.21894486759812723, "kl": 0.1142578125, "learning_rate": 2.6367704683850287e-07, "loss": 0.0007, "reward": 0.998611181974411, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7986111044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 667 }, { "completion_length": 649.8611145019531, "epoch": 39.294117647058826, "grad_norm": 0.16020329198231462, "kl": 0.091796875, "learning_rate": 2.6225120829480627e-07, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 668 }, { "completion_length": 692.5, "epoch": 39.35294117647059, "grad_norm": 0.002050295679491381, "kl": 0.09130859375, "learning_rate": 2.6082786362343374e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 669 }, { "completion_length": 650.0138854980469, "epoch": 39.411764705882355, "grad_norm": 0.004647644184902222, "kl": 0.109619140625, "learning_rate": 2.5940702775459744e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 670 }, { "completion_length": 612.1666564941406, "epoch": 39.470588235294116, "grad_norm": 0.0029496189706835563, "kl": 0.108154296875, "learning_rate": 2.579887155921936e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 671 }, { "completion_length": 591.1944580078125, "epoch": 39.529411764705884, "grad_norm": 0.0024338800130562555, "kl": 0.112060546875, "learning_rate": 2.5657294201364523e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 672 }, { "completion_length": 647.8888854980469, "epoch": 39.588235294117645, "grad_norm": 0.001839979022064137, "kl": 0.09765625, "learning_rate": 2.551597218697475e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 673 }, { "completion_length": 662.25, "epoch": 39.64705882352941, "grad_norm": 0.0032406429348348803, "kl": 0.09326171875, "learning_rate": 2.537490699845109e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 674 }, { "completion_length": 636.1666564941406, "epoch": 39.705882352941174, "grad_norm": 0.002144512894141054, "kl": 0.09521484375, "learning_rate": 2.523410011550064e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 675 }, { "completion_length": 601.9444274902344, "epoch": 39.76470588235294, "grad_norm": 0.0022069854409465547, "kl": 0.09912109375, "learning_rate": 2.5093553015120934e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 676 }, { "completion_length": 590.7777709960938, "epoch": 39.8235294117647, "grad_norm": 0.0026042949994705966, "kl": 0.1064453125, "learning_rate": 2.495326717158457e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 677 }, { "completion_length": 651.1805725097656, "epoch": 39.88235294117647, "grad_norm": 0.0029400708342050097, "kl": 0.102294921875, "learning_rate": 2.4813244056423686e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 678 }, { "completion_length": 627.6528015136719, "epoch": 39.94117647058823, "grad_norm": 0.0029400414116070605, "kl": 0.100830078125, "learning_rate": 2.467348513841447e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 679 }, { "completion_length": 631.4305725097656, "epoch": 40.0, "grad_norm": 0.0021649704898155818, "kl": 0.104248046875, "learning_rate": 2.4533991883561867e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 680 }, { "completion_length": 655.8472290039062, "epoch": 40.05882352941177, "grad_norm": 0.15324723005420532, "kl": 0.1015625, "learning_rate": 2.439476575508408e-07, "loss": 0.0005, "reward": 0.6152777969837189, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 681 }, { "completion_length": 620.2638854980469, "epoch": 40.11764705882353, "grad_norm": 0.0022375663582715724, "kl": 0.105224609375, "learning_rate": 2.425580821339733e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 682 }, { "completion_length": 602.1388854980469, "epoch": 40.1764705882353, "grad_norm": 0.2585477323536325, "kl": 0.1982421875, "learning_rate": 2.411712071610048e-07, "loss": 0.0006, "reward": 0.8048611581325531, "reward_std": 0.06976316124200821, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.09861112385988235, "step": 683 }, { "completion_length": 603.5, "epoch": 40.23529411764706, "grad_norm": 0.00247648500187273, "kl": 0.09716796875, "learning_rate": 2.3978704717959776e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 684 }, { "completion_length": 624.5, "epoch": 40.294117647058826, "grad_norm": 0.007793392241077998, "kl": 0.099853515625, "learning_rate": 2.3840561670893495e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 685 }, { "completion_length": 667.2638854980469, "epoch": 40.35294117647059, "grad_norm": 0.16502568908923393, "kl": 0.090087890625, "learning_rate": 2.3702693023956848e-07, "loss": 0.0002, "reward": 0.39097223430871964, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1909722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 686 }, { "completion_length": 641.0, "epoch": 40.411764705882355, "grad_norm": 0.20404033532444907, "kl": 0.095458984375, "learning_rate": 2.3565100223326735e-07, "loss": 0.0003, "reward": 0.5472222343087196, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.347222238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 687 }, { "completion_length": 662.263916015625, "epoch": 40.470588235294116, "grad_norm": 0.22513339641815855, "kl": 0.0966796875, "learning_rate": 2.3427784712286475e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 688 }, { "completion_length": 625.6805725097656, "epoch": 40.529411764705884, "grad_norm": 0.001717387170534929, "kl": 0.1015625, "learning_rate": 2.3290747931210848e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 689 }, { "completion_length": 576.4722290039062, "epoch": 40.588235294117645, "grad_norm": 0.002793740972350746, "kl": 0.11376953125, "learning_rate": 2.3153991317550808e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 690 }, { "completion_length": 598.8472290039062, "epoch": 40.64705882352941, "grad_norm": 0.0022537303459848478, "kl": 0.099853515625, "learning_rate": 2.3017516305818546e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 691 }, { "completion_length": 680.1805725097656, "epoch": 40.705882352941174, "grad_norm": 0.2944472911395818, "kl": 0.093505859375, "learning_rate": 2.288132432757233e-07, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 692 }, { "completion_length": 597.2916870117188, "epoch": 40.76470588235294, "grad_norm": 0.24994982611799824, "kl": 0.102783203125, "learning_rate": 2.2745416811401584e-07, "loss": 0.0002, "reward": 0.33888890594244003, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.1388888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 693 }, { "completion_length": 638.1666870117188, "epoch": 40.8235294117647, "grad_norm": 0.0023522228960599478, "kl": 0.098876953125, "learning_rate": 2.2609795182911857e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 694 }, { "completion_length": 653.2361145019531, "epoch": 40.88235294117647, "grad_norm": 0.0024194325609846103, "kl": 0.09765625, "learning_rate": 2.247446086470982e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 695 }, { "completion_length": 696.4166870117188, "epoch": 40.94117647058823, "grad_norm": 0.13636299278784517, "kl": 0.09765625, "learning_rate": 2.2339415276388474e-07, "loss": 0.0007, "reward": 0.8236111998558044, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 696 }, { "completion_length": 602.9027709960938, "epoch": 41.0, "grad_norm": 0.002476863510704758, "kl": 0.108642578125, "learning_rate": 2.220465983451209e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 697 }, { "completion_length": 581.5278015136719, "epoch": 41.05882352941177, "grad_norm": 0.24433855959244657, "kl": 0.10498046875, "learning_rate": 2.207019595260154e-07, "loss": 0.0005, "reward": 0.7555556148290634, "reward_std": 0.10257624089717865, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5555555522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 698 }, { "completion_length": 629.4722290039062, "epoch": 41.11764705882353, "grad_norm": 0.2293950263208292, "kl": 0.1025390625, "learning_rate": 2.1936025041119265e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 699 }, { "completion_length": 646.2361145019531, "epoch": 41.1764705882353, "grad_norm": 0.002890845342748659, "kl": 0.09716796875, "learning_rate": 2.180214850745467e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 700 }, { "completion_length": 634.7222290039062, "epoch": 41.23529411764706, "grad_norm": 0.0037650397547944374, "kl": 0.09033203125, "learning_rate": 2.1668567755909257e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 701 }, { "completion_length": 633.4861145019531, "epoch": 41.294117647058826, "grad_norm": 0.2058070573292669, "kl": 0.10595703125, "learning_rate": 2.1535284187681864e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 702 }, { "completion_length": 635.2916870117188, "epoch": 41.35294117647059, "grad_norm": 0.0021762106812245386, "kl": 0.108642578125, "learning_rate": 2.1402299200854085e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 703 }, { "completion_length": 648.9166870117188, "epoch": 41.411764705882355, "grad_norm": 0.001866422009831787, "kl": 0.090087890625, "learning_rate": 2.1269614190375474e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 704 }, { "completion_length": 624.9166564941406, "epoch": 41.470588235294116, "grad_norm": 0.001996130294407124, "kl": 0.09619140625, "learning_rate": 2.113723054804904e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 705 }, { "completion_length": 611.1944580078125, "epoch": 41.529411764705884, "grad_norm": 0.21217624018729767, "kl": 0.091796875, "learning_rate": 2.1005149662516514e-07, "loss": 0.0005, "reward": 0.7727437019348145, "reward_std": 0.09411981701850891, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5729166716337204, "rewards/thinker_reward_func": 0.09982699528336525, "step": 706 }, { "completion_length": 645.375, "epoch": 41.588235294117645, "grad_norm": 0.00212590440414918, "kl": 0.093994140625, "learning_rate": 2.0873372919243897e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 707 }, { "completion_length": 640.875, "epoch": 41.64705882352941, "grad_norm": 0.0019577823516360358, "kl": 0.10400390625, "learning_rate": 2.0741901700506898e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 708 }, { "completion_length": 616.5416564941406, "epoch": 41.705882352941174, "grad_norm": 0.0019006640195535047, "kl": 0.09228515625, "learning_rate": 2.0610737385376348e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 709 }, { "completion_length": 580.9027709960938, "epoch": 41.76470588235294, "grad_norm": 0.0027481477232315556, "kl": 0.10498046875, "learning_rate": 2.0479881349703882e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 710 }, { "completion_length": 603.7638854980469, "epoch": 41.8235294117647, "grad_norm": 0.0041019326824366965, "kl": 0.105224609375, "learning_rate": 2.0349334966107362e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 711 }, { "completion_length": 604.8888854980469, "epoch": 41.88235294117647, "grad_norm": 0.0028522732160019626, "kl": 0.100341796875, "learning_rate": 2.0219099603956607e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 712 }, { "completion_length": 598.375, "epoch": 41.94117647058823, "grad_norm": 0.0025055453963798726, "kl": 0.11083984375, "learning_rate": 2.00891766293589e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 713 }, { "completion_length": 658.5833435058594, "epoch": 42.0, "grad_norm": 0.014531253971099995, "kl": 0.140625, "learning_rate": 1.9959567405144822e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 714 }, { "completion_length": 580.3472290039062, "epoch": 42.05882352941177, "grad_norm": 0.25104907326328824, "kl": 0.1103515625, "learning_rate": 1.9830273290853766e-07, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 715 }, { "completion_length": 657.9166564941406, "epoch": 42.11764705882353, "grad_norm": 0.003407519906287343, "kl": 0.095947265625, "learning_rate": 1.9701295642719833e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 716 }, { "completion_length": 603.2916870117188, "epoch": 42.1764705882353, "grad_norm": 0.2365018716760629, "kl": 0.0927734375, "learning_rate": 1.957263581365749e-07, "loss": 0.0003, "reward": 0.58194450289011, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 717 }, { "completion_length": 636.4166870117188, "epoch": 42.23529411764706, "grad_norm": 0.0017994368476216404, "kl": 0.08154296875, "learning_rate": 1.9444295153247487e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 718 }, { "completion_length": 617.625, "epoch": 42.294117647058826, "grad_norm": 0.0030400856035548247, "kl": 0.09619140625, "learning_rate": 1.9316275007722626e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 719 }, { "completion_length": 582.25, "epoch": 42.35294117647059, "grad_norm": 0.15851546194085, "kl": 0.095947265625, "learning_rate": 1.918857671995363e-07, "loss": 0.0009, "reward": 1.224305659532547, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0243055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 720 }, { "completion_length": 588.5138854980469, "epoch": 42.411764705882355, "grad_norm": 0.0022173216874681197, "kl": 0.1015625, "learning_rate": 1.9061201629435148e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 721 }, { "completion_length": 597.125, "epoch": 42.470588235294116, "grad_norm": 0.002201884658361648, "kl": 0.0966796875, "learning_rate": 1.893415107227157e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 722 }, { "completion_length": 627.8750305175781, "epoch": 42.529411764705884, "grad_norm": 0.001807167558606127, "kl": 0.090087890625, "learning_rate": 1.880742638116315e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 723 }, { "completion_length": 610.4305725097656, "epoch": 42.588235294117645, "grad_norm": 0.1802708015524608, "kl": 0.09521484375, "learning_rate": 1.8681028885391904e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 724 }, { "completion_length": 592.4722290039062, "epoch": 42.64705882352941, "grad_norm": 0.0027037912992572714, "kl": 0.1103515625, "learning_rate": 1.8554959910807772e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 725 }, { "completion_length": 576.4722290039062, "epoch": 42.705882352941174, "grad_norm": 0.2540211328212326, "kl": 0.1025390625, "learning_rate": 1.8429220779814652e-07, "loss": 0.0007, "reward": 0.9812500476837158, "reward_std": 0.094222292304039, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.10000000894069672, "step": 726 }, { "completion_length": 593.4305725097656, "epoch": 42.76470588235294, "grad_norm": 0.2530799379634833, "kl": 0.1044921875, "learning_rate": 1.83038128113565e-07, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 727 }, { "completion_length": 631.5833435058594, "epoch": 42.8235294117647, "grad_norm": 0.00309935725902381, "kl": 0.091064453125, "learning_rate": 1.8178737320903597e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 728 }, { "completion_length": 574.2638854980469, "epoch": 42.88235294117647, "grad_norm": 0.2213851246613247, "kl": 0.114013671875, "learning_rate": 1.8053995620438622e-07, "loss": 0.0004, "reward": 0.6165357604622841, "reward_std": 0.00045363104436546564, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09986905753612518, "step": 729 }, { "completion_length": 588.9444580078125, "epoch": 42.94117647058823, "grad_norm": 0.001972208695981724, "kl": 0.09423828125, "learning_rate": 1.7929589018443014e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 730 }, { "completion_length": 706.0833435058594, "epoch": 43.0, "grad_norm": 0.28346847720896895, "kl": 0.081298828125, "learning_rate": 1.780551881988313e-07, "loss": 0.0003, "reward": 0.9625000655651093, "reward_std": 0.2061862125992775, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.7638888955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 731 }, { "completion_length": 608.8611145019531, "epoch": 43.05882352941177, "grad_norm": 0.0025752478822620743, "kl": 0.095947265625, "learning_rate": 1.7681786326196664e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 732 }, { "completion_length": 613.6805725097656, "epoch": 43.11764705882353, "grad_norm": 0.002278269799713724, "kl": 0.105224609375, "learning_rate": 1.755839283527893e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 733 }, { "completion_length": 625.0138854980469, "epoch": 43.1764705882353, "grad_norm": 0.0035182435067623963, "kl": 0.104736328125, "learning_rate": 1.7435339641469238e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 734 }, { "completion_length": 612.1805725097656, "epoch": 43.23529411764706, "grad_norm": 0.0018483191108469763, "kl": 0.092529296875, "learning_rate": 1.7312628035537386e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 735 }, { "completion_length": 629.263916015625, "epoch": 43.294117647058826, "grad_norm": 0.002600420466345068, "kl": 0.095947265625, "learning_rate": 1.7190259304670035e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 736 }, { "completion_length": 638.388916015625, "epoch": 43.35294117647059, "grad_norm": 0.0020405152696449476, "kl": 0.09521484375, "learning_rate": 1.7068234732457287e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 737 }, { "completion_length": 572.3194580078125, "epoch": 43.411764705882355, "grad_norm": 0.002044942210927235, "kl": 0.107421875, "learning_rate": 1.6946555598879137e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 738 }, { "completion_length": 642.0, "epoch": 43.470588235294116, "grad_norm": 0.0018609787604296603, "kl": 0.091552734375, "learning_rate": 1.6825223180292136e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 739 }, { "completion_length": 632.4444580078125, "epoch": 43.529411764705884, "grad_norm": 0.0024329170028942687, "kl": 0.09326171875, "learning_rate": 1.6704238749415955e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 740 }, { "completion_length": 608.3472290039062, "epoch": 43.588235294117645, "grad_norm": 0.006637454788780438, "kl": 0.112060546875, "learning_rate": 1.6583603575319999e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 741 }, { "completion_length": 615.5833129882812, "epoch": 43.64705882352941, "grad_norm": 0.25844543540819287, "kl": 0.10791015625, "learning_rate": 1.646331892341018e-07, "loss": 0.0004, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 742 }, { "completion_length": 617.875, "epoch": 43.705882352941174, "grad_norm": 0.2521314161780168, "kl": 0.095458984375, "learning_rate": 1.6343386055415543e-07, "loss": 0.0007, "reward": 0.8423611521720886, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.642361119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 743 }, { "completion_length": 604.1388854980469, "epoch": 43.76470588235294, "grad_norm": 0.002228823708479495, "kl": 0.104248046875, "learning_rate": 1.622380622937518e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 744 }, { "completion_length": 623.8888854980469, "epoch": 43.8235294117647, "grad_norm": 0.002203924560595351, "kl": 0.091064453125, "learning_rate": 1.6104580699624837e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 745 }, { "completion_length": 596.1944580078125, "epoch": 43.88235294117647, "grad_norm": 0.1695829982591805, "kl": 0.09326171875, "learning_rate": 1.5985710716783933e-07, "loss": 0.0007, "reward": 0.9638881087303162, "reward_std": 0.1025789939178594, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7638888955116272, "rewards/thinker_reward_func": 0.09999921545386314, "step": 746 }, { "completion_length": 624.0972290039062, "epoch": 43.94117647058823, "grad_norm": 0.19067893102526953, "kl": 0.091552734375, "learning_rate": 1.586719752774231e-07, "loss": 0.0003, "reward": 0.40830952674150467, "reward_std": 8.247890946222469e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09997620061039925, "step": 747 }, { "completion_length": 585.7777709960938, "epoch": 44.0, "grad_norm": 0.0022434368678033276, "kl": 0.105224609375, "learning_rate": 1.574904237564726e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 748 }, { "completion_length": 617.2222290039062, "epoch": 44.05882352941177, "grad_norm": 0.002680349261401592, "kl": 0.094970703125, "learning_rate": 1.5631246499890428e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 749 }, { "completion_length": 565.6388854980469, "epoch": 44.11764705882353, "grad_norm": 0.002529682103243587, "kl": 0.10888671875, "learning_rate": 1.5513811136094785e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 750 }, { "completion_length": 581.6666870117188, "epoch": 44.1764705882353, "grad_norm": 0.0019761074306734764, "kl": 0.097412109375, "learning_rate": 1.5396737516101755e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 751 }, { "completion_length": 616.2361145019531, "epoch": 44.23529411764706, "grad_norm": 0.18365989698670515, "kl": 0.103759765625, "learning_rate": 1.5280026867958185e-07, "loss": 0.0005, "reward": 0.6152777969837189, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 752 }, { "completion_length": 609.75, "epoch": 44.294117647058826, "grad_norm": 0.002147909644481664, "kl": 0.093505859375, "learning_rate": 1.5163680415903578e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 753 }, { "completion_length": 620.2916564941406, "epoch": 44.35294117647059, "grad_norm": 0.0021071252927442156, "kl": 0.0966796875, "learning_rate": 1.5047699380357132e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 754 }, { "completion_length": 600.4722290039062, "epoch": 44.411764705882355, "grad_norm": 0.0019612421769359266, "kl": 0.10791015625, "learning_rate": 1.493208497790504e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 755 }, { "completion_length": 584.3055725097656, "epoch": 44.470588235294116, "grad_norm": 0.3652957624685727, "kl": 0.1005859375, "learning_rate": 1.4816838421287693e-07, "loss": 0.0001, "reward": 0.581944465637207, "reward_std": 0.12028130888938904, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 756 }, { "completion_length": 613.4166564941406, "epoch": 44.529411764705884, "grad_norm": 0.0025114380780229435, "kl": 0.091552734375, "learning_rate": 1.4701960919386907e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 757 }, { "completion_length": 648.7916564941406, "epoch": 44.588235294117645, "grad_norm": 0.0025101643826317987, "kl": 0.094482421875, "learning_rate": 1.4587453677213346e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 758 }, { "completion_length": 679.8333435058594, "epoch": 44.64705882352941, "grad_norm": 0.002047125364787484, "kl": 0.0859375, "learning_rate": 1.447331789589377e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 759 }, { "completion_length": 592.1944580078125, "epoch": 44.705882352941174, "grad_norm": 0.0019873559327538396, "kl": 0.095947265625, "learning_rate": 1.435955477265855e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 760 }, { "completion_length": 664.1528015136719, "epoch": 44.76470588235294, "grad_norm": 0.0035551578555427657, "kl": 0.092041015625, "learning_rate": 1.4246165500828971e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 761 }, { "completion_length": 620.6111145019531, "epoch": 44.8235294117647, "grad_norm": 0.001867612870683223, "kl": 0.08935546875, "learning_rate": 1.413315126980487e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 762 }, { "completion_length": 584.8333435058594, "epoch": 44.88235294117647, "grad_norm": 0.002316565243548902, "kl": 0.104736328125, "learning_rate": 1.402051326505207e-07, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 763 }, { "completion_length": 599.6805725097656, "epoch": 44.94117647058823, "grad_norm": 0.0018280524014176556, "kl": 0.1064453125, "learning_rate": 1.3908252668089898e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 764 }, { "completion_length": 646.2916564941406, "epoch": 45.0, "grad_norm": 0.26242480842421, "kl": 0.088134765625, "learning_rate": 1.3796370656478934e-07, "loss": 0.0001, "reward": 0.19996508210897446, "reward_std": 0.00012096918726456352, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.09996509179472923, "step": 765 }, { "completion_length": 579.6805725097656, "epoch": 45.05882352941177, "grad_norm": 0.004407116435565167, "kl": 0.10107421875, "learning_rate": 1.368486840380851e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 766 }, { "completion_length": 653.4722290039062, "epoch": 45.11764705882353, "grad_norm": 0.14618217534775813, "kl": 0.094482421875, "learning_rate": 1.3573747079684516e-07, "loss": 0.0003, "reward": 0.6166515722870827, "reward_std": 5.223857078817673e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09998492896556854, "step": 767 }, { "completion_length": 598.9583435058594, "epoch": 45.1764705882353, "grad_norm": 0.1901125501566773, "kl": 0.095947265625, "learning_rate": 1.3463007849717033e-07, "loss": 0.001, "reward": 1.2416072487831116, "reward_std": 0.00020619668066501617, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.09994048625230789, "step": 768 }, { "completion_length": 630.9583435058594, "epoch": 45.23529411764706, "grad_norm": 0.002104158973254316, "kl": 0.0927734375, "learning_rate": 1.3352651875508204e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 769 }, { "completion_length": 628.1805725097656, "epoch": 45.294117647058826, "grad_norm": 0.19726611945451988, "kl": 0.1005859375, "learning_rate": 1.3242680314639993e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 770 }, { "completion_length": 596.4166564941406, "epoch": 45.35294117647059, "grad_norm": 0.004825269713710835, "kl": 0.103515625, "learning_rate": 1.3133094320661998e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 771 }, { "completion_length": 619.8750305175781, "epoch": 45.411764705882355, "grad_norm": 0.001883564311594303, "kl": 0.091552734375, "learning_rate": 1.3023895043079475e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 772 }, { "completion_length": 591.0000305175781, "epoch": 45.470588235294116, "grad_norm": 0.19859401695295856, "kl": 0.100341796875, "learning_rate": 1.2915083627341128e-07, "loss": 0.0007, "reward": 0.8236111998558044, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 773 }, { "completion_length": 619.9722290039062, "epoch": 45.529411764705884, "grad_norm": 0.0024713650945580564, "kl": 0.092041015625, "learning_rate": 1.2806661214827285e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 774 }, { "completion_length": 635.0694580078125, "epoch": 45.588235294117645, "grad_norm": 0.0019299509900107688, "kl": 0.0927734375, "learning_rate": 1.2698628942837697e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 775 }, { "completion_length": 617.4444580078125, "epoch": 45.64705882352941, "grad_norm": 0.002295282562923045, "kl": 0.10498046875, "learning_rate": 1.2590987944579805e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 776 }, { "completion_length": 620.6527709960938, "epoch": 45.705882352941174, "grad_norm": 0.0016477382889029852, "kl": 0.100341796875, "learning_rate": 1.2483739349156725e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 777 }, { "completion_length": 628.5555725097656, "epoch": 45.76470588235294, "grad_norm": 0.001964757790832419, "kl": 0.0966796875, "learning_rate": 1.2376884281555483e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 778 }, { "completion_length": 576.6111145019531, "epoch": 45.8235294117647, "grad_norm": 0.0023229551472366016, "kl": 0.106689453125, "learning_rate": 1.2270423862635188e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 779 }, { "completion_length": 614.375, "epoch": 45.88235294117647, "grad_norm": 0.004783158785764202, "kl": 0.10205078125, "learning_rate": 1.2164359209115232e-07, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 780 }, { "completion_length": 582.1666870117188, "epoch": 45.94117647058823, "grad_norm": 0.0030173537513098884, "kl": 0.112060546875, "learning_rate": 1.2058691433563672e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 781 }, { "completion_length": 576.9583435058594, "epoch": 46.0, "grad_norm": 0.25714277018952986, "kl": 0.107666015625, "learning_rate": 1.1953421644385443e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 782 }, { "completion_length": 697.3333435058594, "epoch": 46.05882352941177, "grad_norm": 0.2402229219151842, "kl": 0.08447265625, "learning_rate": 1.1848550945810848e-07, "loss": 0.0001, "reward": 0.5819444730877876, "reward_std": 0.12028131633996964, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 783 }, { "completion_length": 587.9028015136719, "epoch": 46.11764705882353, "grad_norm": 0.002158192460111868, "kl": 0.09521484375, "learning_rate": 1.1744080437883857e-07, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 784 }, { "completion_length": 609.0, "epoch": 46.1764705882353, "grad_norm": 0.00790720629630125, "kl": 0.107666015625, "learning_rate": 1.1640011216450691e-07, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 785 }, { "completion_length": 665.2777709960938, "epoch": 46.23529411764706, "grad_norm": 0.017730154710735215, "kl": 0.1025390625, "learning_rate": 1.1536344373148244e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 786 }, { "completion_length": 621.7222290039062, "epoch": 46.294117647058826, "grad_norm": 0.18487119728810594, "kl": 0.100830078125, "learning_rate": 1.1433080995392614e-07, "loss": 0.0005, "reward": 0.7902778387069702, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 787 }, { "completion_length": 596.8194580078125, "epoch": 46.35294117647059, "grad_norm": 0.0021699200402085545, "kl": 0.103271484375, "learning_rate": 1.1330222166367809e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 788 }, { "completion_length": 605.7083435058594, "epoch": 46.411764705882355, "grad_norm": 0.16674733341563674, "kl": 0.093994140625, "learning_rate": 1.1227768965014245e-07, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 789 }, { "completion_length": 596.3472290039062, "epoch": 46.470588235294116, "grad_norm": 0.0027061184151870674, "kl": 0.098388671875, "learning_rate": 1.1125722466017545e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 790 }, { "completion_length": 592.6528015136719, "epoch": 46.529411764705884, "grad_norm": 0.23725419738846623, "kl": 0.110595703125, "learning_rate": 1.1024083739797169e-07, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 791 }, { "completion_length": 624.3611145019531, "epoch": 46.588235294117645, "grad_norm": 0.0018170116380979272, "kl": 0.09423828125, "learning_rate": 1.0922853852495278e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 792 }, { "completion_length": 637.6805725097656, "epoch": 46.64705882352941, "grad_norm": 0.0016733734513935743, "kl": 0.09619140625, "learning_rate": 1.0822033865965503e-07, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 793 }, { "completion_length": 637.6805419921875, "epoch": 46.705882352941174, "grad_norm": 0.002177570835142978, "kl": 0.095458984375, "learning_rate": 1.0721624837761766e-07, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 794 }, { "completion_length": 614.625, "epoch": 46.76470588235294, "grad_norm": 0.00535938018998337, "kl": 0.115966796875, "learning_rate": 1.0621627821127288e-07, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 795 }, { "completion_length": 569.5416564941406, "epoch": 46.8235294117647, "grad_norm": 0.00629140582401368, "kl": 0.114501953125, "learning_rate": 1.0522043864983427e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 796 }, { "completion_length": 602.25, "epoch": 46.88235294117647, "grad_norm": 0.1443152825744735, "kl": 0.105712890625, "learning_rate": 1.0422874013918792e-07, "loss": 0.0005, "reward": 0.6340278089046478, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4340277761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 797 }, { "completion_length": 602.7083435058594, "epoch": 46.94117647058823, "grad_norm": 0.002145817499337329, "kl": 0.1083984375, "learning_rate": 1.0324119308178164e-07, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 798 }, { "completion_length": 597.3750305175781, "epoch": 47.0, "grad_norm": 0.2054575000882298, "kl": 0.11328125, "learning_rate": 1.0225780783651689e-07, "loss": 0.0009, "reward": 1.224305659532547, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0243055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 799 }, { "completion_length": 669.5, "epoch": 47.05882352941177, "grad_norm": 0.004311579497981754, "kl": 0.08935546875, "learning_rate": 1.0127859471863969e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 800 }, { "completion_length": 624.2778015136719, "epoch": 47.11764705882353, "grad_norm": 0.0016711418360582215, "kl": 0.096923828125, "learning_rate": 1.0030356399963202e-07, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 801 }, { "completion_length": 547.0277709960938, "epoch": 47.1764705882353, "grad_norm": 0.24048856862030124, "kl": 0.113525390625, "learning_rate": 9.933272590710507e-08, "loss": 0.0009, "reward": 1.2243057191371918, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0243055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 802 }, { "completion_length": 570.1527709960938, "epoch": 47.23529411764706, "grad_norm": 0.002578165291587765, "kl": 0.109375, "learning_rate": 9.836609062469064e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 803 }, { "completion_length": 628.7916870117188, "epoch": 47.294117647058826, "grad_norm": 0.0025604385556009285, "kl": 0.093017578125, "learning_rate": 9.740366829193586e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 804 }, { "completion_length": 599.2222290039062, "epoch": 47.35294117647059, "grad_norm": 0.18131970995100646, "kl": 0.100830078125, "learning_rate": 9.644546900419531e-08, "loss": 0.0003, "reward": 0.5993056371808052, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 805 }, { "completion_length": 586.8194580078125, "epoch": 47.411764705882355, "grad_norm": 0.0035233544724012997, "kl": 0.12109375, "learning_rate": 9.549150281252632e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 806 }, { "completion_length": 647.6250305175781, "epoch": 47.470588235294116, "grad_norm": 0.0022167759931249645, "kl": 0.097412109375, "learning_rate": 9.454177972358257e-08, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 807 }, { "completion_length": 635.388916015625, "epoch": 47.529411764705884, "grad_norm": 0.008684598771165786, "kl": 0.125, "learning_rate": 9.35963096995101e-08, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 808 }, { "completion_length": 600.3333435058594, "epoch": 47.588235294117645, "grad_norm": 0.0026545005299467746, "kl": 0.10693359375, "learning_rate": 9.265510265784188e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 809 }, { "completion_length": 584.7777709960938, "epoch": 47.64705882352941, "grad_norm": 0.24544562556762337, "kl": 0.097900390625, "learning_rate": 9.171816847139447e-08, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 810 }, { "completion_length": 606.8333435058594, "epoch": 47.705882352941174, "grad_norm": 0.001708866644289014, "kl": 0.0947265625, "learning_rate": 9.078551696816433e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 811 }, { "completion_length": 587.5000305175781, "epoch": 47.76470588235294, "grad_norm": 0.0028541842915566907, "kl": 0.11474609375, "learning_rate": 8.985715793122406e-08, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 812 }, { "completion_length": 632.6388854980469, "epoch": 47.8235294117647, "grad_norm": 0.0020434435747296873, "kl": 0.107666015625, "learning_rate": 8.893310109862101e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 813 }, { "completion_length": 660.6111145019531, "epoch": 47.88235294117647, "grad_norm": 0.0022314028215896205, "kl": 0.086669921875, "learning_rate": 8.801335616327377e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 814 }, { "completion_length": 590.3888854980469, "epoch": 47.94117647058823, "grad_norm": 0.002341511773101906, "kl": 0.1083984375, "learning_rate": 8.70979327728718e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 815 }, { "completion_length": 602.3194580078125, "epoch": 48.0, "grad_norm": 0.0025563026026453528, "kl": 0.10400390625, "learning_rate": 8.618684052977304e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 816 }, { "completion_length": 660.625, "epoch": 48.05882352941177, "grad_norm": 0.1505585466915935, "kl": 0.09423828125, "learning_rate": 8.528008899090411e-08, "loss": 0.0005, "reward": 0.6152778267860413, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 817 }, { "completion_length": 586.6527709960938, "epoch": 48.11764705882353, "grad_norm": 0.19429974088959054, "kl": 0.1162109375, "learning_rate": 8.437768766765974e-08, "loss": 0.0003, "reward": 0.6166420876979828, "reward_std": 8.522997086402029e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09997540712356567, "step": 818 }, { "completion_length": 689.0833435058594, "epoch": 48.1764705882353, "grad_norm": 0.0019612557388931353, "kl": 0.092529296875, "learning_rate": 8.347964602580243e-08, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 819 }, { "completion_length": 581.6805419921875, "epoch": 48.23529411764706, "grad_norm": 0.002362741011605635, "kl": 0.105712890625, "learning_rate": 8.258597348536451e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 820 }, { "completion_length": 581.0000305175781, "epoch": 48.294117647058826, "grad_norm": 0.0024501974691150904, "kl": 0.11083984375, "learning_rate": 8.169667942054759e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 821 }, { "completion_length": 653.0833435058594, "epoch": 48.35294117647059, "grad_norm": 0.0026342123733109415, "kl": 0.09619140625, "learning_rate": 8.081177315962601e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 822 }, { "completion_length": 592.4722290039062, "epoch": 48.411764705882355, "grad_norm": 0.0023002143481546163, "kl": 0.10986328125, "learning_rate": 7.99312639848474e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 823 }, { "completion_length": 628.2916564941406, "epoch": 48.470588235294116, "grad_norm": 0.0026877688745779445, "kl": 0.111572265625, "learning_rate": 7.905516113233651e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 824 }, { "completion_length": 643.5277709960938, "epoch": 48.529411764705884, "grad_norm": 0.0021380692481277625, "kl": 0.0986328125, "learning_rate": 7.81834737919978e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 825 }, { "completion_length": 586.5277709960938, "epoch": 48.588235294117645, "grad_norm": 0.25635519473767854, "kl": 0.110107421875, "learning_rate": 7.731621110741871e-08, "loss": 0.0007, "reward": 1.0159723162651062, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722238779068, "rewards/thinker_reward_func": 0.10000000894069672, "step": 826 }, { "completion_length": 579.75, "epoch": 48.64705882352941, "grad_norm": 0.3227386151475049, "kl": 0.099609375, "learning_rate": 7.645338217577474e-08, "loss": 0.0007, "reward": 1.0333334505558014, "reward_std": 0.12028132006525993, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333432674408, "rewards/thinker_reward_func": 0.10000000894069672, "step": 827 }, { "completion_length": 618.6805725097656, "epoch": 48.705882352941174, "grad_norm": 0.0021994289237972924, "kl": 0.1015625, "learning_rate": 7.559499604773279e-08, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 828 }, { "completion_length": 660.4027709960938, "epoch": 48.76470588235294, "grad_norm": 0.1657129148017453, "kl": 0.093994140625, "learning_rate": 7.474106172735745e-08, "loss": 0.0005, "reward": 0.7902778759598732, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777910232544, "rewards/thinker_reward_func": 0.10000000894069672, "step": 829 }, { "completion_length": 640.1528015136719, "epoch": 48.8235294117647, "grad_norm": 0.003849516085467909, "kl": 0.10693359375, "learning_rate": 7.389158817201541e-08, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 830 }, { "completion_length": 619.2361145019531, "epoch": 48.88235294117647, "grad_norm": 0.0024808491568688363, "kl": 0.09765625, "learning_rate": 7.304658429228245e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 831 }, { "completion_length": 653.9027709960938, "epoch": 48.94117647058823, "grad_norm": 0.2023681428812567, "kl": 0.09228515625, "learning_rate": 7.220605895184945e-08, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 832 }, { "completion_length": 634.6805725097656, "epoch": 49.0, "grad_norm": 0.0017666692725524695, "kl": 0.090087890625, "learning_rate": 7.137002096742939e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 833 }, { "completion_length": 636.513916015625, "epoch": 49.05882352941177, "grad_norm": 0.0023995711011694817, "kl": 0.095458984375, "learning_rate": 7.053847910866511e-08, "loss": 0.0012, "reward": 1.4500001668930054, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.25, "rewards/thinker_reward_func": 0.10000000894069672, "step": 834 }, { "completion_length": 584.4444580078125, "epoch": 49.11764705882353, "grad_norm": 0.002175400698337513, "kl": 0.1044921875, "learning_rate": 6.971144209803736e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 835 }, { "completion_length": 587.875, "epoch": 49.1764705882353, "grad_norm": 0.0018790899266451833, "kl": 0.099365234375, "learning_rate": 6.888891861077301e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 836 }, { "completion_length": 603.2916564941406, "epoch": 49.23529411764706, "grad_norm": 0.21771342310641323, "kl": 0.105224609375, "learning_rate": 6.807091727475412e-08, "loss": 0.0004, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 837 }, { "completion_length": 595.4583129882812, "epoch": 49.294117647058826, "grad_norm": 0.0023938784769300513, "kl": 0.11181640625, "learning_rate": 6.725744667042776e-08, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 838 }, { "completion_length": 657.9583129882812, "epoch": 49.35294117647059, "grad_norm": 0.14985912856987527, "kl": 0.085693359375, "learning_rate": 6.644851533071555e-08, "loss": 0.0005, "reward": 0.6152777969837189, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 839 }, { "completion_length": 675.8194580078125, "epoch": 49.411764705882355, "grad_norm": 0.15353398597026424, "kl": 0.10986328125, "learning_rate": 6.564413174092443e-08, "loss": 0.0007, "reward": 0.8236111402511597, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 840 }, { "completion_length": 652.013916015625, "epoch": 49.470588235294116, "grad_norm": 0.002186440719839726, "kl": 0.0859375, "learning_rate": 6.484430433865784e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 841 }, { "completion_length": 617.3333435058594, "epoch": 49.529411764705884, "grad_norm": 0.004916473856641647, "kl": 0.107666015625, "learning_rate": 6.404904151372647e-08, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 842 }, { "completion_length": 655.0555419921875, "epoch": 49.588235294117645, "grad_norm": 0.0018828370243739081, "kl": 0.09130859375, "learning_rate": 6.325835160806131e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 843 }, { "completion_length": 546.4583435058594, "epoch": 49.64705882352941, "grad_norm": 0.002028853434468954, "kl": 0.10888671875, "learning_rate": 6.247224291562509e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 844 }, { "completion_length": 624.375, "epoch": 49.705882352941174, "grad_norm": 0.003213977743241682, "kl": 0.10400390625, "learning_rate": 6.16907236823262e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 845 }, { "completion_length": 628.2777709960938, "epoch": 49.76470588235294, "grad_norm": 0.2483386627810907, "kl": 0.09423828125, "learning_rate": 6.091380210593144e-08, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 846 }, { "completion_length": 628.2777709960938, "epoch": 49.8235294117647, "grad_norm": 0.0024103461362001495, "kl": 0.091064453125, "learning_rate": 6.014148633598054e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 847 }, { "completion_length": 650.3472290039062, "epoch": 49.88235294117647, "grad_norm": 0.002116914019348955, "kl": 0.099365234375, "learning_rate": 5.937378447370067e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 848 }, { "completion_length": 603.7222290039062, "epoch": 49.94117647058823, "grad_norm": 0.002470236105100124, "kl": 0.099609375, "learning_rate": 5.86107045719208e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 849 }, { "completion_length": 587.5555725097656, "epoch": 50.0, "grad_norm": 0.25916779216231484, "kl": 0.117919921875, "learning_rate": 5.785225463498828e-08, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 850 }, { "completion_length": 608.6805725097656, "epoch": 50.05882352941177, "grad_norm": 0.0026290973049297645, "kl": 0.106201171875, "learning_rate": 5.70984426186838e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 851 }, { "completion_length": 626.4861145019531, "epoch": 50.11764705882353, "grad_norm": 0.002466197768679532, "kl": 0.10400390625, "learning_rate": 5.634927643013898e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 852 }, { "completion_length": 607.3333435058594, "epoch": 50.1764705882353, "grad_norm": 0.0018217276547340424, "kl": 0.104248046875, "learning_rate": 5.5604763927752376e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 853 }, { "completion_length": 663.8194580078125, "epoch": 50.23529411764706, "grad_norm": 0.19210735381264346, "kl": 0.102294921875, "learning_rate": 5.486491292110795e-08, "loss": 0.0003, "reward": 0.5979167446494102, "reward_std": 0.0649519027210772, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 854 }, { "completion_length": 598.1944580078125, "epoch": 50.294117647058826, "grad_norm": 0.2719939270959717, "kl": 0.109375, "learning_rate": 5.412973117089287e-08, "loss": 0.0005, "reward": 0.8076214790344238, "reward_std": 0.06013518571853638, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.09998254850506783, "step": 855 }, { "completion_length": 605.5972290039062, "epoch": 50.35294117647059, "grad_norm": 0.002908723969040003, "kl": 0.101806640625, "learning_rate": 5.3399226388815446e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 856 }, { "completion_length": 591.1250305175781, "epoch": 50.411764705882355, "grad_norm": 0.0021227720721027246, "kl": 0.106201171875, "learning_rate": 5.267340623752553e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 857 }, { "completion_length": 626.875, "epoch": 50.470588235294116, "grad_norm": 0.1987945310654739, "kl": 0.09765625, "learning_rate": 5.195227833053273e-08, "loss": 0.0003, "reward": 0.42569447308778763, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2256944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 858 }, { "completion_length": 590.2083435058594, "epoch": 50.529411764705884, "grad_norm": 0.22046045506771822, "kl": 0.10302734375, "learning_rate": 5.123585023212784e-08, "loss": 0.0007, "reward": 1.0159723460674286, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 859 }, { "completion_length": 606.8472290039062, "epoch": 50.588235294117645, "grad_norm": 0.33213000460980724, "kl": 0.102294921875, "learning_rate": 5.052412945730239e-08, "loss": 0.0002, "reward": 0.5819444730877876, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3819444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 860 }, { "completion_length": 686.2638854980469, "epoch": 50.64705882352941, "grad_norm": 0.0030468846659928426, "kl": 0.085693359375, "learning_rate": 4.9817123471670606e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 861 }, { "completion_length": 602.8333129882812, "epoch": 50.705882352941174, "grad_norm": 0.0020425866260726415, "kl": 0.0927734375, "learning_rate": 4.9114839691390853e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 862 }, { "completion_length": 618.375, "epoch": 50.76470588235294, "grad_norm": 0.002156920839230316, "kl": 0.100830078125, "learning_rate": 4.841728548308743e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 863 }, { "completion_length": 619.9305419921875, "epoch": 50.8235294117647, "grad_norm": 0.0021464086066589087, "kl": 0.105224609375, "learning_rate": 4.7724468163774067e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 864 }, { "completion_length": 587.1805419921875, "epoch": 50.88235294117647, "grad_norm": 0.002574198825996098, "kl": 0.101318359375, "learning_rate": 4.703639500077655e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 865 }, { "completion_length": 581.5833435058594, "epoch": 50.94117647058823, "grad_norm": 0.002743541110728178, "kl": 0.109619140625, "learning_rate": 4.6353073211656876e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 866 }, { "completion_length": 599.3055725097656, "epoch": 51.0, "grad_norm": 0.0017133418721925922, "kl": 0.102783203125, "learning_rate": 4.5674509964137134e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 867 }, { "completion_length": 616.8055725097656, "epoch": 51.05882352941177, "grad_norm": 0.0017669091069084531, "kl": 0.096435546875, "learning_rate": 4.500071237602482e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 868 }, { "completion_length": 614.8333435058594, "epoch": 51.11764705882353, "grad_norm": 0.20663618255868105, "kl": 0.094482421875, "learning_rate": 4.4331687515137605e-08, "loss": 0.0005, "reward": 0.8249699175357819, "reward_std": 0.00010447400563862175, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.09996984899044037, "step": 869 }, { "completion_length": 609.9166564941406, "epoch": 51.1764705882353, "grad_norm": 0.002552211035823391, "kl": 0.10205078125, "learning_rate": 4.366744239922998e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 870 }, { "completion_length": 556.4583435058594, "epoch": 51.23529411764706, "grad_norm": 0.0031475840944503886, "kl": 0.117431640625, "learning_rate": 4.30079839959186e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 871 }, { "completion_length": 628.2638854980469, "epoch": 51.294117647058826, "grad_norm": 0.0028195555320165923, "kl": 0.102783203125, "learning_rate": 4.235331922261026e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 872 }, { "completion_length": 634.9027709960938, "epoch": 51.35294117647059, "grad_norm": 0.0018045621714610528, "kl": 0.085693359375, "learning_rate": 4.170345494642863e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 873 }, { "completion_length": 648.9722290039062, "epoch": 51.411764705882355, "grad_norm": 0.0016276551134003928, "kl": 0.095947265625, "learning_rate": 4.10583979841424e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 874 }, { "completion_length": 636.5555725097656, "epoch": 51.470588235294116, "grad_norm": 0.0019196614617128299, "kl": 0.091796875, "learning_rate": 4.041815510209395e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 875 }, { "completion_length": 601.2083435058594, "epoch": 51.529411764705884, "grad_norm": 0.001999983385224045, "kl": 0.099365234375, "learning_rate": 3.9782733016128e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 876 }, { "completion_length": 645.1666870117188, "epoch": 51.588235294117645, "grad_norm": 0.0019565782910828926, "kl": 0.102294921875, "learning_rate": 3.915213839152176e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 877 }, { "completion_length": 620.125, "epoch": 51.64705882352941, "grad_norm": 0.002972074787894227, "kl": 0.103759765625, "learning_rate": 3.852637784291424e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 878 }, { "completion_length": 643.6944274902344, "epoch": 51.705882352941174, "grad_norm": 0.0018663321538961368, "kl": 0.095458984375, "learning_rate": 3.790545793423761e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 879 }, { "completion_length": 634.9722290039062, "epoch": 51.76470588235294, "grad_norm": 0.2297608161832218, "kl": 0.093017578125, "learning_rate": 3.7289385178647935e-08, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 880 }, { "completion_length": 591.1666564941406, "epoch": 51.8235294117647, "grad_norm": 0.0028829190033842823, "kl": 0.1064453125, "learning_rate": 3.6678166038456804e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 881 }, { "completion_length": 622.8333435058594, "epoch": 51.88235294117647, "grad_norm": 0.0021976671423162243, "kl": 0.097412109375, "learning_rate": 3.60718069250639e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 882 }, { "completion_length": 559.0833435058594, "epoch": 51.94117647058823, "grad_norm": 0.0027650586373662335, "kl": 0.115478515625, "learning_rate": 3.5470314198889186e-08, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 883 }, { "completion_length": 602.8472290039062, "epoch": 52.0, "grad_norm": 0.002230147892890665, "kl": 0.09814453125, "learning_rate": 3.487369416930691e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 884 }, { "completion_length": 632.8611145019531, "epoch": 52.05882352941177, "grad_norm": 0.19851907363734764, "kl": 0.085693359375, "learning_rate": 3.4281953094578875e-08, "loss": 0.0003, "reward": 0.5993055775761604, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 885 }, { "completion_length": 576.5278015136719, "epoch": 52.11764705882353, "grad_norm": 0.005548199358486985, "kl": 0.111083984375, "learning_rate": 3.369509718178887e-08, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 886 }, { "completion_length": 595.9166564941406, "epoch": 52.1764705882353, "grad_norm": 0.2308892142066783, "kl": 0.092041015625, "learning_rate": 3.311313258677778e-08, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 887 }, { "completion_length": 632.1528015136719, "epoch": 52.23529411764706, "grad_norm": 0.015213086630244461, "kl": 0.116943359375, "learning_rate": 3.253606541407872e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 888 }, { "completion_length": 589.7083435058594, "epoch": 52.294117647058826, "grad_norm": 0.0037361130580794138, "kl": 0.12060546875, "learning_rate": 3.1963901716853425e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 889 }, { "completion_length": 577.9305725097656, "epoch": 52.35294117647059, "grad_norm": 0.317123422680254, "kl": 0.112060546875, "learning_rate": 3.1396647496828244e-08, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.14123429730534554, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.78125, "rewards/thinker_reward_func": 0.10000000894069672, "step": 890 }, { "completion_length": 642.8194580078125, "epoch": 52.411764705882355, "grad_norm": 0.002282719633563645, "kl": 0.1064453125, "learning_rate": 3.083430870423148e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 891 }, { "completion_length": 631.75, "epoch": 52.470588235294116, "grad_norm": 0.0048178973882237915, "kl": 0.095947265625, "learning_rate": 3.027689123773108e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 892 }, { "completion_length": 634.7916870117188, "epoch": 52.529411764705884, "grad_norm": 0.0022071866397880904, "kl": 0.09716796875, "learning_rate": 2.9724400944372396e-08, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 893 }, { "completion_length": 607.1527709960938, "epoch": 52.588235294117645, "grad_norm": 0.25405122862623924, "kl": 0.103271484375, "learning_rate": 2.9176843619517277e-08, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 894 }, { "completion_length": 623.7083129882812, "epoch": 52.64705882352941, "grad_norm": 0.2425007357496233, "kl": 0.09814453125, "learning_rate": 2.8634225006782864e-08, "loss": 0.0005, "reward": 0.6340278089046478, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4340277761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 895 }, { "completion_length": 575.0138854980469, "epoch": 52.705882352941174, "grad_norm": 0.28077316206950914, "kl": 0.10888671875, "learning_rate": 2.8096550797981788e-08, "loss": 0.0007, "reward": 1.2242786884307861, "reward_std": 0.060234132601181045, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0243055522441864, "rewards/thinker_reward_func": 0.09997302666306496, "step": 896 }, { "completion_length": 618.0694580078125, "epoch": 52.76470588235294, "grad_norm": 0.003227144068664985, "kl": 0.103759765625, "learning_rate": 2.7563826633061892e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 897 }, { "completion_length": 647.0694274902344, "epoch": 52.8235294117647, "grad_norm": 0.061409539816753775, "kl": 0.128662109375, "learning_rate": 2.703605810004772e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 898 }, { "completion_length": 619.3888854980469, "epoch": 52.88235294117647, "grad_norm": 0.17112859262755473, "kl": 0.100830078125, "learning_rate": 2.6513250734981395e-08, "loss": 0.0002, "reward": 0.21736112236976624, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0173611119389534, "rewards/thinker_reward_func": 0.10000000894069672, "step": 899 }, { "completion_length": 706.125, "epoch": 52.94117647058823, "grad_norm": 0.003239651473125946, "kl": 0.0908203125, "learning_rate": 2.5995410021864783e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 900 }, { "completion_length": 635.0694580078125, "epoch": 53.0, "grad_norm": 0.20454129202695223, "kl": 0.098876953125, "learning_rate": 2.5482541392601918e-08, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 901 }, { "completion_length": 701.7083435058594, "epoch": 53.05882352941177, "grad_norm": 0.001756585964956592, "kl": 0.082763671875, "learning_rate": 2.497465022694206e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 902 }, { "completion_length": 662.9861145019531, "epoch": 53.11764705882353, "grad_norm": 0.24028812760204432, "kl": 0.091552734375, "learning_rate": 2.4471741852423233e-08, "loss": 0.0005, "reward": 0.8076389729976654, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 903 }, { "completion_length": 685.0, "epoch": 53.1764705882353, "grad_norm": 0.003707722566271541, "kl": 0.10107421875, "learning_rate": 2.397382154431621e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 904 }, { "completion_length": 581.3333435058594, "epoch": 53.23529411764706, "grad_norm": 0.2068375626226311, "kl": 0.111328125, "learning_rate": 2.348089452556956e-08, "loss": 0.0007, "reward": 1.0332976579666138, "reward_std": 0.00012372220226097852, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.09996429458260536, "step": 905 }, { "completion_length": 611.4305419921875, "epoch": 53.294117647058826, "grad_norm": 0.24032324520700457, "kl": 0.093505859375, "learning_rate": 2.2992965966754376e-08, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 906 }, { "completion_length": 601.875, "epoch": 53.35294117647059, "grad_norm": 0.0026034310302394106, "kl": 0.102783203125, "learning_rate": 2.2510040986010335e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 907 }, { "completion_length": 615.2361145019531, "epoch": 53.411764705882355, "grad_norm": 0.1320588196907937, "kl": 0.09912109375, "learning_rate": 2.2032124648992013e-08, "loss": 0.0003, "reward": 0.4083261862397194, "reward_std": 2.4744076654314995e-05, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.09999286755919456, "step": 908 }, { "completion_length": 581.8888854980469, "epoch": 53.470588235294116, "grad_norm": 0.001794207008894464, "kl": 0.093505859375, "learning_rate": 2.1559221968815545e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 909 }, { "completion_length": 573.9028015136719, "epoch": 53.529411764705884, "grad_norm": 0.0018100618269696245, "kl": 0.102294921875, "learning_rate": 2.109133790600648e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 910 }, { "completion_length": 600.2778015136719, "epoch": 53.588235294117645, "grad_norm": 0.002130650342664254, "kl": 0.101318359375, "learning_rate": 2.0628477368447028e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 911 }, { "completion_length": 631.9444580078125, "epoch": 53.64705882352941, "grad_norm": 0.0017511164452310311, "kl": 0.096923828125, "learning_rate": 2.0170645211325332e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 912 }, { "completion_length": 616.2222290039062, "epoch": 53.705882352941174, "grad_norm": 0.00429142246892664, "kl": 0.10498046875, "learning_rate": 1.9717846237084e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 913 }, { "completion_length": 592.4583129882812, "epoch": 53.76470588235294, "grad_norm": 0.00214907867633882, "kl": 0.10498046875, "learning_rate": 1.9270085195370046e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 914 }, { "completion_length": 606.9722290039062, "epoch": 53.8235294117647, "grad_norm": 0.0024072107290789746, "kl": 0.10400390625, "learning_rate": 1.882736678298491e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 915 }, { "completion_length": 607.7777709960938, "epoch": 53.88235294117647, "grad_norm": 0.002110495399740595, "kl": 0.09765625, "learning_rate": 1.8389695643835246e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 916 }, { "completion_length": 662.0972290039062, "epoch": 53.94117647058823, "grad_norm": 0.0020701287687981307, "kl": 0.095947265625, "learning_rate": 1.7957076368884272e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 917 }, { "completion_length": 611.4861145019531, "epoch": 54.0, "grad_norm": 0.0018801897481089222, "kl": 0.0927734375, "learning_rate": 1.7529513496103322e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 918 }, { "completion_length": 664.013916015625, "epoch": 54.05882352941177, "grad_norm": 0.0025640486607643427, "kl": 0.092529296875, "learning_rate": 1.7107011510424763e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 919 }, { "completion_length": 592.4722290039062, "epoch": 54.11764705882353, "grad_norm": 0.1849809569856275, "kl": 0.10595703125, "learning_rate": 1.6689574843694432e-08, "loss": 0.0007, "reward": 0.842361181974411, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 920 }, { "completion_length": 591.2916870117188, "epoch": 54.1764705882353, "grad_norm": 0.0021670300842798317, "kl": 0.09619140625, "learning_rate": 1.6277207874625443e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 921 }, { "completion_length": 611.5555725097656, "epoch": 54.23529411764706, "grad_norm": 0.19375320764063336, "kl": 0.094970703125, "learning_rate": 1.5869914928752117e-08, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 922 }, { "completion_length": 641.7361145019531, "epoch": 54.294117647058826, "grad_norm": 0.14278137039642758, "kl": 0.10595703125, "learning_rate": 1.5467700278384787e-08, "loss": 0.0007, "reward": 0.8236111402511597, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 923 }, { "completion_length": 650.0833435058594, "epoch": 54.35294117647059, "grad_norm": 0.001689960379211232, "kl": 0.089599609375, "learning_rate": 1.507056814256491e-08, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 924 }, { "completion_length": 621.7778015136719, "epoch": 54.411764705882355, "grad_norm": 0.002055355269456252, "kl": 0.096435546875, "learning_rate": 1.4678522687020412e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 925 }, { "completion_length": 630.875, "epoch": 54.470588235294116, "grad_norm": 0.0018501522219515126, "kl": 0.092529296875, "learning_rate": 1.4291568024122846e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 926 }, { "completion_length": 621.0833129882812, "epoch": 54.529411764705884, "grad_norm": 0.0022258927892546475, "kl": 0.1025390625, "learning_rate": 1.390970821284343e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 927 }, { "completion_length": 603.5694274902344, "epoch": 54.588235294117645, "grad_norm": 0.3025335013939656, "kl": 0.109130859375, "learning_rate": 1.3532947258710903e-08, "loss": 0.0005, "reward": 0.8423612117767334, "reward_std": 0.14123430475592613, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6423611044883728, "rewards/thinker_reward_func": 0.10000000894069672, "step": 928 }, { "completion_length": 616.7361145019531, "epoch": 54.64705882352941, "grad_norm": 0.0018318279173200184, "kl": 0.103515625, "learning_rate": 1.3161289113769403e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 929 }, { "completion_length": 607.6528015136719, "epoch": 54.705882352941174, "grad_norm": 0.002008128823330834, "kl": 0.096923828125, "learning_rate": 1.2794737676536993e-08, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 930 }, { "completion_length": 620.2083435058594, "epoch": 54.76470588235294, "grad_norm": 0.0021936936979812115, "kl": 0.09716796875, "learning_rate": 1.2433296791964754e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 931 }, { "completion_length": 600.3611450195312, "epoch": 54.8235294117647, "grad_norm": 0.27295326916797924, "kl": 0.102294921875, "learning_rate": 1.207697025139659e-08, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 932 }, { "completion_length": 641.2638854980469, "epoch": 54.88235294117647, "grad_norm": 0.0028202080579395284, "kl": 0.098388671875, "learning_rate": 1.1725761792529377e-08, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 933 }, { "completion_length": 614.7361450195312, "epoch": 54.94117647058823, "grad_norm": 0.001795194183051758, "kl": 0.10498046875, "learning_rate": 1.1379675099373487e-08, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 934 }, { "completion_length": 572.6666564941406, "epoch": 55.0, "grad_norm": 0.23803277040259235, "kl": 0.110107421875, "learning_rate": 1.1038713802214717e-08, "loss": 0.0005, "reward": 0.7902778089046478, "reward_std": 0.08109364658594131, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.5902777761220932, "rewards/thinker_reward_func": 0.10000000894069672, "step": 935 }, { "completion_length": 640.9583435058594, "epoch": 55.05882352941177, "grad_norm": 0.014141339644774483, "kl": 0.1083984375, "learning_rate": 1.0702881477575587e-08, "loss": 0.0006, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 936 }, { "completion_length": 619.7083435058594, "epoch": 55.11764705882353, "grad_norm": 0.002057964540339226, "kl": 0.09716796875, "learning_rate": 1.0372181648178435e-08, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 937 }, { "completion_length": 632.3888854980469, "epoch": 55.1764705882353, "grad_norm": 0.0017544519254519366, "kl": 0.09130859375, "learning_rate": 1.004661778290783e-08, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 938 }, { "completion_length": 631.5416870117188, "epoch": 55.23529411764706, "grad_norm": 0.002665619266498779, "kl": 0.099365234375, "learning_rate": 9.726193296774766e-09, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 939 }, { "completion_length": 600.6388854980469, "epoch": 55.294117647058826, "grad_norm": 0.002807039355009999, "kl": 0.100830078125, "learning_rate": 9.410911550880474e-09, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 940 }, { "completion_length": 603.75, "epoch": 55.35294117647059, "grad_norm": 0.0020511428954038415, "kl": 0.104248046875, "learning_rate": 9.100775852381227e-09, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 941 }, { "completion_length": 661.625, "epoch": 55.411764705882355, "grad_norm": 0.005009161167888163, "kl": 0.10302734375, "learning_rate": 8.795789454453861e-09, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 942 }, { "completion_length": 611.8055725097656, "epoch": 55.470588235294116, "grad_norm": 0.0017941388721251087, "kl": 0.095458984375, "learning_rate": 8.495955556261203e-09, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 943 }, { "completion_length": 647.0138854980469, "epoch": 55.529411764705884, "grad_norm": 0.0018862703162135504, "kl": 0.089111328125, "learning_rate": 8.201277302919085e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 944 }, { "completion_length": 581.5278015136719, "epoch": 55.588235294117645, "grad_norm": 0.00205695784358306, "kl": 0.107666015625, "learning_rate": 7.91175778546288e-09, "loss": 0.0006, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 945 }, { "completion_length": 572.2638854980469, "epoch": 55.64705882352941, "grad_norm": 0.32432743107591483, "kl": 0.1162109375, "learning_rate": 7.627400040815412e-09, "loss": 0.0007, "reward": 1.2069445848464966, "reward_std": 0.12028130143880844, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0069444477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 946 }, { "completion_length": 625.4861145019531, "epoch": 55.705882352941174, "grad_norm": 0.20073632595822757, "kl": 0.09521484375, "learning_rate": 7.348207051754818e-09, "loss": 0.0003, "reward": 0.6166103482246399, "reward_std": 0.00019519856141414493, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.09994366019964218, "step": 947 }, { "completion_length": 637.6944580078125, "epoch": 55.76470588235294, "grad_norm": 0.19769406024099026, "kl": 0.095947265625, "learning_rate": 7.074181746883401e-09, "loss": 0.0005, "reward": 0.6340278163552284, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4340277910232544, "rewards/thinker_reward_func": 0.10000000894069672, "step": 948 }, { "completion_length": 633.6666870117188, "epoch": 55.8235294117647, "grad_norm": 0.0019000055593388818, "kl": 0.092529296875, "learning_rate": 6.805327000596994e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 949 }, { "completion_length": 677.875, "epoch": 55.88235294117647, "grad_norm": 0.13852069696542818, "kl": 0.089599609375, "learning_rate": 6.541645633054649e-09, "loss": 0.0007, "reward": 0.8236111700534821, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 950 }, { "completion_length": 644.4444274902344, "epoch": 55.94117647058823, "grad_norm": 0.202666273892625, "kl": 0.103515625, "learning_rate": 6.2831404101492125e-09, "loss": 0.0003, "reward": 0.6166357398033142, "reward_std": 0.00010722703154897317, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.0999690555036068, "step": 951 }, { "completion_length": 647.0694580078125, "epoch": 56.0, "grad_norm": 0.373153397032659, "kl": 0.08837890625, "learning_rate": 6.029814043478021e-09, "loss": 0.0005, "reward": 1.015872299671173, "reward_std": 0.060487065464258194, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.0999000109732151, "step": 952 }, { "completion_length": 621.5972290039062, "epoch": 56.05882352941177, "grad_norm": 0.356287327687913, "kl": 0.09228515625, "learning_rate": 5.781669190314808e-09, "loss": 0.0003, "reward": 0.8076389729976654, "reward_std": 0.22232794016599655, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6076388955116272, "rewards/thinker_reward_func": 0.10000000894069672, "step": 953 }, { "completion_length": 639.6944580078125, "epoch": 56.11764705882353, "grad_norm": 0.0020173755883324293, "kl": 0.084716796875, "learning_rate": 5.538708453581786e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 954 }, { "completion_length": 596.8611145019531, "epoch": 56.1764705882353, "grad_norm": 0.0018023777264869606, "kl": 0.100341796875, "learning_rate": 5.3009343818219975e-09, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 955 }, { "completion_length": 651.3888854980469, "epoch": 56.23529411764706, "grad_norm": 0.002857597827885179, "kl": 0.102294921875, "learning_rate": 5.068349469173005e-09, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 956 }, { "completion_length": 571.9305725097656, "epoch": 56.294117647058826, "grad_norm": 0.24939544648654122, "kl": 0.1357421875, "learning_rate": 4.840956155340415e-09, "loss": 0.0004, "reward": 0.5993055775761604, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 957 }, { "completion_length": 648.6111450195312, "epoch": 56.35294117647059, "grad_norm": 0.14469172307407077, "kl": 0.094970703125, "learning_rate": 4.618756825572611e-09, "loss": 0.0003, "reward": 0.42569444328546524, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.225694440305233, "rewards/thinker_reward_func": 0.10000000894069672, "step": 958 }, { "completion_length": 598.6388854980469, "epoch": 56.411764705882355, "grad_norm": 0.0021677120649398612, "kl": 0.1064453125, "learning_rate": 4.401753810635289e-09, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 959 }, { "completion_length": 678.4722290039062, "epoch": 56.470588235294116, "grad_norm": 0.0018203865152098394, "kl": 0.08251953125, "learning_rate": 4.189949386787462e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 960 }, { "completion_length": 608.5, "epoch": 56.529411764705884, "grad_norm": 0.00624562417867537, "kl": 0.115234375, "learning_rate": 3.983345775757263e-09, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 961 }, { "completion_length": 649.4444274902344, "epoch": 56.588235294117645, "grad_norm": 0.0020118090083802186, "kl": 0.089111328125, "learning_rate": 3.781945144718912e-09, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 962 }, { "completion_length": 635.625, "epoch": 56.64705882352941, "grad_norm": 0.1866004239842811, "kl": 0.093994140625, "learning_rate": 3.5857496062695615e-09, "loss": 0.0005, "reward": 0.8076389133930206, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 963 }, { "completion_length": 578.3888854980469, "epoch": 56.705882352941174, "grad_norm": 0.0022283786396554315, "kl": 0.10888671875, "learning_rate": 3.3947612184077046e-09, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 964 }, { "completion_length": 662.388916015625, "epoch": 56.76470588235294, "grad_norm": 0.18603069332757732, "kl": 0.094970703125, "learning_rate": 3.2089819845111944e-09, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 965 }, { "completion_length": 583.8055725097656, "epoch": 56.8235294117647, "grad_norm": 0.0031964437540406227, "kl": 0.106201171875, "learning_rate": 3.028413853316092e-09, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 966 }, { "completion_length": 645.2361145019531, "epoch": 56.88235294117647, "grad_norm": 0.001971552679023735, "kl": 0.08984375, "learning_rate": 2.85305871889685e-09, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 967 }, { "completion_length": 599.763916015625, "epoch": 56.94117647058823, "grad_norm": 0.0019390963053994028, "kl": 0.1083984375, "learning_rate": 2.6829184206457188e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 968 }, { "completion_length": 604.3888854980469, "epoch": 57.0, "grad_norm": 0.004071151763699602, "kl": 0.12109375, "learning_rate": 2.517994743254037e-09, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 969 }, { "completion_length": 581.8333435058594, "epoch": 57.05882352941177, "grad_norm": 0.007142118444368923, "kl": 0.11279296875, "learning_rate": 2.3582894166930267e-09, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 970 }, { "completion_length": 634.8055725097656, "epoch": 57.11764705882353, "grad_norm": 0.0032924292254727138, "kl": 0.0947265625, "learning_rate": 2.2038041161960287e-09, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 971 }, { "completion_length": 604.9305725097656, "epoch": 57.1764705882353, "grad_norm": 0.0028828699539646917, "kl": 0.107421875, "learning_rate": 2.054540462240739e-09, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 972 }, { "completion_length": 634.1805419921875, "epoch": 57.23529411764706, "grad_norm": 0.00338362556121485, "kl": 0.10546875, "learning_rate": 1.9105000205322796e-09, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 973 }, { "completion_length": 648.3333435058594, "epoch": 57.294117647058826, "grad_norm": 0.002008338323809387, "kl": 0.092041015625, "learning_rate": 1.7716843019867645e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 974 }, { "completion_length": 621.6527709960938, "epoch": 57.35294117647059, "grad_norm": 0.002198324432374229, "kl": 0.09521484375, "learning_rate": 1.638094762715314e-09, "loss": 0.0011, "reward": 1.2416667938232422, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 1.0416666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 975 }, { "completion_length": 585.9166564941406, "epoch": 57.411764705882355, "grad_norm": 0.0024821077749193387, "kl": 0.103271484375, "learning_rate": 1.5097328040090119e-09, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 976 }, { "completion_length": 677.8472290039062, "epoch": 57.470588235294116, "grad_norm": 0.0020040370126848833, "kl": 0.100830078125, "learning_rate": 1.386599772324082e-09, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 977 }, { "completion_length": 615.2638854980469, "epoch": 57.529411764705884, "grad_norm": 0.0018148788382351254, "kl": 0.1015625, "learning_rate": 1.2686969592676789e-09, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 978 }, { "completion_length": 640.9166870117188, "epoch": 57.588235294117645, "grad_norm": 0.0021940418881188474, "kl": 0.10107421875, "learning_rate": 1.1560256015846758e-09, "loss": 0.0005, "reward": 0.6166666746139526, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 979 }, { "completion_length": 590.1111145019531, "epoch": 57.64705882352941, "grad_norm": 0.0024266409149902935, "kl": 0.114501953125, "learning_rate": 1.0485868811441756e-09, "loss": 0.0002, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.0, "rewards/thinker_reward_func": 0.10000000894069672, "step": 980 }, { "completion_length": 602.0, "epoch": 57.705882352941174, "grad_norm": 0.2352559230664401, "kl": 0.09326171875, "learning_rate": 9.463819249275752e-10, "loss": 0.0009, "reward": 1.0506945550441742, "reward_std": 0.060140661895275116, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8506944477558136, "rewards/thinker_reward_func": 0.10000000894069672, "step": 981 }, { "completion_length": 592.8333435058594, "epoch": 57.76470588235294, "grad_norm": 0.0021183300193563116, "kl": 0.097412109375, "learning_rate": 8.494118050164645e-10, "loss": 0.0004, "reward": 0.40833333879709244, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 982 }, { "completion_length": 613.138916015625, "epoch": 57.8235294117647, "grad_norm": 0.0021923503864732575, "kl": 0.09423828125, "learning_rate": 7.576775385815249e-10, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 983 }, { "completion_length": 628.2638854980469, "epoch": 57.88235294117647, "grad_norm": 0.002094047949587591, "kl": 0.095458984375, "learning_rate": 6.711800878718144e-10, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 984 }, { "completion_length": 601.1250305175781, "epoch": 57.94117647058823, "grad_norm": 0.37631584136680507, "kl": 0.101318359375, "learning_rate": 5.899203602046654e-10, "loss": 0.0005, "reward": 0.807638943195343, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 985 }, { "completion_length": 672.5972290039062, "epoch": 58.0, "grad_norm": 0.1461520369545127, "kl": 0.10009765625, "learning_rate": 5.138992079561366e-10, "loss": 0.0003, "reward": 0.40694449096918106, "reward_std": 0.004811252001672983, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.2083333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 986 }, { "completion_length": 615.2222290039062, "epoch": 58.05882352941177, "grad_norm": 0.0020521656545110967, "kl": 0.10546875, "learning_rate": 4.431174285521866e-10, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 987 }, { "completion_length": 628.013916015625, "epoch": 58.11764705882353, "grad_norm": 0.0023334025692873105, "kl": 0.10107421875, "learning_rate": 3.775757644601807e-10, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 988 }, { "completion_length": 627.9861145019531, "epoch": 58.1764705882353, "grad_norm": 0.002434267178941488, "kl": 0.099365234375, "learning_rate": 3.172749031811195e-10, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 989 }, { "completion_length": 600.6805419921875, "epoch": 58.23529411764706, "grad_norm": 0.00320325642174361, "kl": 0.11376953125, "learning_rate": 2.6221547724253333e-10, "loss": 0.0007, "reward": 0.8250000849366188, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.625, "rewards/thinker_reward_func": 0.10000000894069672, "step": 990 }, { "completion_length": 602.5, "epoch": 58.294117647058826, "grad_norm": 0.002409262877411222, "kl": 0.1064453125, "learning_rate": 2.1239806419176554e-10, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333283662796, "rewards/thinker_reward_func": 0.10000000894069672, "step": 991 }, { "completion_length": 674.3472290039062, "epoch": 58.35294117647059, "grad_norm": 0.2894252555876314, "kl": 0.097412109375, "learning_rate": 1.6782318658992157e-10, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.14123429358005524, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.7812500149011612, "rewards/thinker_reward_func": 0.10000000894069672, "step": 992 }, { "completion_length": 589.2778015136719, "epoch": 58.411764705882355, "grad_norm": 0.17944759355400225, "kl": 0.110107421875, "learning_rate": 1.2849131200631803e-10, "loss": 0.0005, "reward": 0.8249540328979492, "reward_std": 0.00015945517225190997, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.09995397925376892, "step": 993 }, { "completion_length": 614.4444580078125, "epoch": 58.470588235294116, "grad_norm": 0.1956864821472297, "kl": 0.096923828125, "learning_rate": 9.440285301370865e-11, "loss": 0.0003, "reward": 0.5993055701255798, "reward_std": 0.06014065071940422, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.3993055522441864, "rewards/thinker_reward_func": 0.10000000894069672, "step": 994 }, { "completion_length": 622.8888854980469, "epoch": 58.529411764705884, "grad_norm": 0.0019797362605306003, "kl": 0.093994140625, "learning_rate": 6.555816718389895e-11, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 995 }, { "completion_length": 618.6111145019531, "epoch": 58.588235294117645, "grad_norm": 0.34627818903146707, "kl": 0.098876953125, "learning_rate": 4.195755708408244e-11, "loss": 0.0005, "reward": 0.8062500357627869, "reward_std": 0.06495190411806107, "rewards/format_reward_func": 0.09861112385988235, "rewards/solution_reward_func": 0.607638880610466, "rewards/thinker_reward_func": 0.10000000894069672, "step": 996 }, { "completion_length": 585.7916870117188, "epoch": 58.64705882352941, "grad_norm": 0.0019807956306383354, "kl": 0.09814453125, "learning_rate": 2.3601270273398978e-11, "loss": 0.0005, "reward": 0.6166667118668556, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.4166666567325592, "rewards/thinker_reward_func": 0.10000000894069672, "step": 997 }, { "completion_length": 629.4861145019531, "epoch": 58.705882352941174, "grad_norm": 0.0019450736448499769, "kl": 0.091064453125, "learning_rate": 1.0489499300603277e-11, "loss": 0.0007, "reward": 0.8250000476837158, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.6249999850988388, "rewards/thinker_reward_func": 0.10000000894069672, "step": 998 }, { "completion_length": 633.6944274902344, "epoch": 58.76470588235294, "grad_norm": 0.17804794219453765, "kl": 0.091064453125, "learning_rate": 2.6223817020665227e-12, "loss": 0.0007, "reward": 1.0159722864627838, "reward_std": 0.06014065816998482, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8159722089767456, "rewards/thinker_reward_func": 0.10000000894069672, "step": 999 }, { "completion_length": 566.5000305175781, "epoch": 58.8235294117647, "grad_norm": 0.0026944558514564623, "kl": 0.1083984375, "learning_rate": 0.0, "loss": 0.0009, "reward": 1.033333420753479, "reward_std": 0.0, "rewards/format_reward_func": 0.10000000894069672, "rewards/solution_reward_func": 0.8333333134651184, "rewards/thinker_reward_func": 0.10000000894069672, "step": 1000 }, { "epoch": 58.8235294117647, "step": 1000, "total_flos": 0.0, "train_loss": 0.0004573261429768181, "train_runtime": 34727.9126, "train_samples_per_second": 0.173, "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 59, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }