checkpointS-100 / trainer_state.json
C10X's picture
Upload trainer_state.json with huggingface_hub
ef627a1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7142857142857143,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 200.0,
"epoch": 0.007142857142857143,
"grad_norm": 0.00043654805631376803,
"kl": 1.1190734767296817e-05,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.125,
"step": 1
},
{
"completion_length": 292.875,
"epoch": 0.014285714285714285,
"grad_norm": 0.1742347776889801,
"kl": 5.753119239670923e-06,
"learning_rate": 7.142857142857143e-06,
"loss": 0.0,
"reward": -0.3997499942779541,
"reward_std": 0.15906040370464325,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3997499942779541,
"step": 2
},
{
"completion_length": 445.5,
"epoch": 0.02142857142857143,
"grad_norm": 0.0913790687918663,
"kl": 5.179101663088659e-06,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.0,
"reward": -0.7059999704360962,
"reward_std": 0.729467511177063,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9559999704360962,
"step": 3
},
{
"completion_length": 397.875,
"epoch": 0.02857142857142857,
"grad_norm": 0.18434225022792816,
"kl": 1.612883534107823e-05,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0,
"reward": -0.7477500438690186,
"reward_std": 0.5131211876869202,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7477500438690186,
"step": 4
},
{
"completion_length": 374.25,
"epoch": 0.03571428571428571,
"grad_norm": 0.15434053540229797,
"kl": 3.564520739018917e-05,
"learning_rate": 1.785714285714286e-05,
"loss": 0.0,
"reward": -0.4762499928474426,
"reward_std": 0.7022607922554016,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7262499928474426,
"step": 5
},
{
"completion_length": 264.25,
"epoch": 0.04285714285714286,
"grad_norm": 0.14609746634960175,
"kl": 0.0002129438507836312,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.0,
"reward": -0.1522500216960907,
"reward_std": 0.6662349700927734,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.4022500216960907,
"step": 6
},
{
"completion_length": 221.5,
"epoch": 0.05,
"grad_norm": 0.22101597487926483,
"kl": 0.0012037234846502542,
"learning_rate": 2.5e-05,
"loss": 0.0,
"reward": -0.11937500536441803,
"reward_std": 0.2714921534061432,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11937499791383743,
"step": 7
},
{
"completion_length": 247.875,
"epoch": 0.05714285714285714,
"grad_norm": 0.22249825298786163,
"kl": 0.0021207458339631557,
"learning_rate": 2.857142857142857e-05,
"loss": 0.0001,
"reward": -0.19712500274181366,
"reward_std": 0.36591196060180664,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19712500274181366,
"step": 8
},
{
"completion_length": 511.75,
"epoch": 0.06428571428571428,
"grad_norm": 0.11614850163459778,
"kl": 0.0037874511908739805,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.0002,
"reward": -0.7689999938011169,
"reward_std": 1.3408536911010742,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0189999341964722,
"step": 9
},
{
"completion_length": 221.625,
"epoch": 0.07142857142857142,
"grad_norm": 0.21436981856822968,
"kl": 0.012746547348797321,
"learning_rate": 3.571428571428572e-05,
"loss": 0.0005,
"reward": -0.23862498998641968,
"reward_std": 0.11265870183706284,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.23862498998641968,
"step": 10
},
{
"completion_length": 292.75,
"epoch": 0.07857142857142857,
"grad_norm": 0.19347621500492096,
"kl": 0.011033562943339348,
"learning_rate": 3.928571428571429e-05,
"loss": 0.0004,
"reward": -0.4468750059604645,
"reward_std": 0.16547630727291107,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.4468749761581421,
"step": 11
},
{
"completion_length": 110.625,
"epoch": 0.08571428571428572,
"grad_norm": 0.44592323899269104,
"kl": 0.060783885419368744,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.0024,
"reward": 1.424625039100647,
"reward_std": 1.0278345346450806,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17462500929832458,
"step": 12
},
{
"completion_length": 678.75,
"epoch": 0.09285714285714286,
"grad_norm": 0.1104876920580864,
"kl": 0.013322519138455391,
"learning_rate": 4.642857142857143e-05,
"loss": 0.0005,
"reward": 0.468500018119812,
"reward_std": 0.9677569270133972,
"rewards/correctness_reward_func": 1.0,
"rewards/length_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.656499981880188,
"step": 13
},
{
"completion_length": 495.25,
"epoch": 0.1,
"grad_norm": 0.13062289357185364,
"kl": 0.021864598616957664,
"learning_rate": 5e-05,
"loss": 0.0009,
"reward": -0.5831249952316284,
"reward_std": 0.676399827003479,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5831249952316284,
"step": 14
},
{
"completion_length": 781.375,
"epoch": 0.10714285714285714,
"grad_norm": 0.09523279964923859,
"kl": 0.013723693788051605,
"learning_rate": 4.999222955002041e-05,
"loss": 0.0005,
"reward": -0.13512498140335083,
"reward_std": 0.9264968633651733,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13512498140335083,
"step": 15
},
{
"completion_length": 405.5,
"epoch": 0.11428571428571428,
"grad_norm": 0.15296146273612976,
"kl": 0.03523610904812813,
"learning_rate": 4.996892303047306e-05,
"loss": 0.0014,
"reward": 2.193000078201294,
"reward_std": 0.43767958879470825,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19300000369548798,
"step": 16
},
{
"completion_length": 214.75,
"epoch": 0.12142857142857143,
"grad_norm": 0.2985721230506897,
"kl": 0.0879557877779007,
"learning_rate": 4.9930094929529506e-05,
"loss": 0.0035,
"reward": -0.011874988675117493,
"reward_std": 0.6198051571846008,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.011875003576278687,
"step": 17
},
{
"completion_length": 121.25,
"epoch": 0.12857142857142856,
"grad_norm": 0.417566180229187,
"kl": 0.22521352767944336,
"learning_rate": 4.987576938413504e-05,
"loss": 0.009,
"reward": 2.4171249866485596,
"reward_std": 0.1779337227344513,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.41712498664855957,
"step": 18
},
{
"completion_length": 179.0,
"epoch": 0.1357142857142857,
"grad_norm": 0.285080224275589,
"kl": 0.10962098836898804,
"learning_rate": 4.9805980165004304e-05,
"loss": 0.0044,
"reward": 1.75,
"reward_std": 1.0350983142852783,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 19
},
{
"completion_length": 282.875,
"epoch": 0.14285714285714285,
"grad_norm": 0.27518561482429504,
"kl": 0.06473983079195023,
"learning_rate": 4.972077065562821e-05,
"loss": 0.0026,
"reward": 1.2120000123977661,
"reward_std": 1.0715053081512451,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4620000123977661,
"step": 20
},
{
"completion_length": 146.0,
"epoch": 0.15,
"grad_norm": 0.01954607106745243,
"kl": 0.1001419946551323,
"learning_rate": 4.962019382530521e-05,
"loss": 0.004,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 21
},
{
"completion_length": 233.125,
"epoch": 0.15714285714285714,
"grad_norm": 0.019903944805264473,
"kl": 0.0685255229473114,
"learning_rate": 4.9504312196213596e-05,
"loss": 0.0027,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 22
},
{
"completion_length": 199.125,
"epoch": 0.16428571428571428,
"grad_norm": 0.022628186270594597,
"kl": 0.08269491791725159,
"learning_rate": 4.937319780454559e-05,
"loss": 0.0033,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 23
},
{
"completion_length": 388.25,
"epoch": 0.17142857142857143,
"grad_norm": 0.12237099558115005,
"kl": 0.031046129763126373,
"learning_rate": 4.922693215572695e-05,
"loss": 0.0012,
"reward": 1.5,
"reward_std": 1.0690449476242065,
"rewards/correctness_reward_func": 1.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 24
},
{
"completion_length": 262.875,
"epoch": 0.17857142857142858,
"grad_norm": 0.026575006544589996,
"kl": 0.08844916522502899,
"learning_rate": 4.90656061737503e-05,
"loss": 0.0035,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 25
},
{
"completion_length": 290.125,
"epoch": 0.18571428571428572,
"grad_norm": 0.5788246989250183,
"kl": 0.06681957095861435,
"learning_rate": 4.888932014465352e-05,
"loss": 0.0027,
"reward": 0.375,
"reward_std": 0.2314550280570984,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.375,
"step": 26
},
{
"completion_length": 224.375,
"epoch": 0.19285714285714287,
"grad_norm": 0.4455007314682007,
"kl": 0.2504517138004303,
"learning_rate": 4.86981836541783e-05,
"loss": 0.01,
"reward": 1.625,
"reward_std": 1.2174328565597534,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.375,
"step": 27
},
{
"completion_length": 232.125,
"epoch": 0.2,
"grad_norm": 0.23774927854537964,
"kl": 0.1069759875535965,
"learning_rate": 4.849231551964771e-05,
"loss": 0.0043,
"reward": 1.921875,
"reward_std": 0.8937718272209167,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.421875,
"step": 28
},
{
"completion_length": 458.625,
"epoch": 0.20714285714285716,
"grad_norm": 0.09729224443435669,
"kl": 0.03021763078868389,
"learning_rate": 4.827184371610511e-05,
"loss": 0.0012,
"reward": 1.75,
"reward_std": 1.0350983142852783,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 29
},
{
"completion_length": 422.375,
"epoch": 0.21428571428571427,
"grad_norm": 0.12195795774459839,
"kl": 0.04913492873311043,
"learning_rate": 4.803690529676019e-05,
"loss": 0.002,
"reward": 2.25,
"reward_std": 0.5345224738121033,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4375,
"step": 30
},
{
"completion_length": 270.875,
"epoch": 0.22142857142857142,
"grad_norm": 0.21031659841537476,
"kl": 0.0698024183511734,
"learning_rate": 4.778764630779183e-05,
"loss": 0.0028,
"reward": 2.21875,
"reward_std": 0.6999680995941162,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.46875,
"step": 31
},
{
"completion_length": 377.5,
"epoch": 0.22857142857142856,
"grad_norm": 0.010702410712838173,
"kl": 0.037429843097925186,
"learning_rate": 4.752422169756048e-05,
"loss": 0.0015,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 32
},
{
"completion_length": 360.125,
"epoch": 0.2357142857142857,
"grad_norm": 0.14157415926456451,
"kl": 0.08244010806083679,
"learning_rate": 4.724679522028672e-05,
"loss": 0.0033,
"reward": 1.6486248970031738,
"reward_std": 1.1074291467666626,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3986250162124634,
"step": 33
},
{
"completion_length": 116.125,
"epoch": 0.24285714285714285,
"grad_norm": 1.3088772296905518,
"kl": 0.7027589082717896,
"learning_rate": 4.6955539334255716e-05,
"loss": 0.0281,
"reward": 1.75,
"reward_std": 1.0350983142852783,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 34
},
{
"completion_length": 311.0,
"epoch": 0.25,
"grad_norm": 0.7312092781066895,
"kl": 0.07371841371059418,
"learning_rate": 4.665063509461097e-05,
"loss": 0.0029,
"reward": 0.6698750257492065,
"reward_std": 0.7472349405288696,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.41987499594688416,
"step": 35
},
{
"completion_length": 329.625,
"epoch": 0.2571428571428571,
"grad_norm": 0.006386851891875267,
"kl": 0.039320699870586395,
"learning_rate": 4.6332272040803895e-05,
"loss": 0.0016,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 36
},
{
"completion_length": 365.875,
"epoch": 0.2642857142857143,
"grad_norm": 0.20290029048919678,
"kl": 0.05483713746070862,
"learning_rate": 4.600064807876929e-05,
"loss": 0.0022,
"reward": 1.6407499313354492,
"reward_std": 1.0948845148086548,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.390749990940094,
"step": 37
},
{
"completion_length": 287.0,
"epoch": 0.2714285714285714,
"grad_norm": 0.19799265265464783,
"kl": 0.13818374276161194,
"learning_rate": 4.5655969357899874e-05,
"loss": 0.0055,
"reward": 1.875999927520752,
"reward_std": 0.9384979605674744,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.37599998712539673,
"step": 38
},
{
"completion_length": 289.875,
"epoch": 0.2785714285714286,
"grad_norm": 0.37082406878471375,
"kl": 0.14701411128044128,
"learning_rate": 4.529845014289642e-05,
"loss": 0.0059,
"reward": 2.0653750896453857,
"reward_std": 0.6956271529197693,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3153750002384186,
"step": 39
},
{
"completion_length": 725.125,
"epoch": 0.2857142857142857,
"grad_norm": 0.11310902237892151,
"kl": 0.05320233479142189,
"learning_rate": 4.4928312680573064e-05,
"loss": 0.0021,
"reward": 0.5361250042915344,
"reward_std": 0.2760908007621765,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3486250042915344,
"step": 40
},
{
"completion_length": 373.625,
"epoch": 0.29285714285714287,
"grad_norm": 0.005955233704298735,
"kl": 0.09833689779043198,
"learning_rate": 4.454578706170075e-05,
"loss": 0.0039,
"reward": 0.36500000953674316,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36500000953674316,
"step": 41
},
{
"completion_length": 337.75,
"epoch": 0.3,
"grad_norm": 0.18655544519424438,
"kl": 0.13665920495986938,
"learning_rate": 4.415111107797445e-05,
"loss": 0.0055,
"reward": 1.3464999198913574,
"reward_std": 1.0498690605163574,
"rewards/correctness_reward_func": 1.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3465000092983246,
"step": 42
},
{
"completion_length": 463.75,
"epoch": 0.30714285714285716,
"grad_norm": 0.18999366462230682,
"kl": 0.1055900901556015,
"learning_rate": 4.374453007419336e-05,
"loss": 0.0042,
"reward": 0.3206250071525574,
"reward_std": 0.08869842439889908,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3206250071525574,
"step": 43
},
{
"completion_length": 350.25,
"epoch": 0.3142857142857143,
"grad_norm": 0.16436553001403809,
"kl": 0.12820430099964142,
"learning_rate": 4.332629679574566e-05,
"loss": 0.0051,
"reward": 2.3486249446868896,
"reward_std": 0.04361177235841751,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3486250042915344,
"step": 44
},
{
"completion_length": 396.625,
"epoch": 0.32142857142857145,
"grad_norm": 0.12755045294761658,
"kl": 0.10351184010505676,
"learning_rate": 4.2896671231492966e-05,
"loss": 0.0041,
"reward": 2.1152501106262207,
"reward_std": 0.707207977771759,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36524999141693115,
"step": 45
},
{
"completion_length": 323.0,
"epoch": 0.32857142857142857,
"grad_norm": 0.1433708518743515,
"kl": 0.12643718719482422,
"learning_rate": 4.245592045215182e-05,
"loss": 0.0051,
"reward": 2.085624933242798,
"reward_std": 0.6997721791267395,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3356249928474426,
"step": 46
},
{
"completion_length": 465.875,
"epoch": 0.3357142857142857,
"grad_norm": 0.14527598023414612,
"kl": 0.09910832345485687,
"learning_rate": 4.2004318444272985e-05,
"loss": 0.004,
"reward": 1.0987499952316284,
"reward_std": 1.0445955991744995,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3487499952316284,
"step": 47
},
{
"completion_length": 443.875,
"epoch": 0.34285714285714286,
"grad_norm": 0.11678742617368698,
"kl": 0.09625618904829025,
"learning_rate": 4.154214593992149e-05,
"loss": 0.0039,
"reward": 2.1332499980926514,
"reward_std": 0.716006875038147,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.38324999809265137,
"step": 48
},
{
"completion_length": 298.875,
"epoch": 0.35,
"grad_norm": 0.2462267130613327,
"kl": 0.14918765425682068,
"learning_rate": 4.1069690242163484e-05,
"loss": 0.006,
"reward": 1.3640000820159912,
"reward_std": 1.0677127838134766,
"rewards/correctness_reward_func": 1.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36400002241134644,
"step": 49
},
{
"completion_length": 604.125,
"epoch": 0.35714285714285715,
"grad_norm": 0.11957182735204697,
"kl": 0.06581288576126099,
"learning_rate": 4.058724504646834e-05,
"loss": 0.0026,
"reward": 0.42887499928474426,
"reward_std": 0.1762550324201584,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36637499928474426,
"step": 50
},
{
"completion_length": 630.75,
"epoch": 0.36428571428571427,
"grad_norm": 0.20287089049816132,
"kl": 0.15163259208202362,
"learning_rate": 4.009511025813694e-05,
"loss": 0.0061,
"reward": 0.3474999964237213,
"reward_std": 0.10380475223064423,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3475000262260437,
"step": 51
},
{
"completion_length": 409.875,
"epoch": 0.37142857142857144,
"grad_norm": 0.15633811056613922,
"kl": 0.12077239155769348,
"learning_rate": 3.959359180586975e-05,
"loss": 0.0048,
"reward": 1.6152499914169312,
"reward_std": 1.0375232696533203,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36524999141693115,
"step": 52
},
{
"completion_length": 473.875,
"epoch": 0.37857142857142856,
"grad_norm": 0.1574457734823227,
"kl": 0.08710946887731552,
"learning_rate": 3.908300145159055e-05,
"loss": 0.0035,
"reward": 0.8807500004768372,
"reward_std": 0.9173192381858826,
"rewards/correctness_reward_func": 0.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.38075000047683716,
"step": 53
},
{
"completion_length": 321.25,
"epoch": 0.38571428571428573,
"grad_norm": 0.19621425867080688,
"kl": 0.17770174145698547,
"learning_rate": 3.856365659664399e-05,
"loss": 0.0071,
"reward": 2.1021249294281006,
"reward_std": 0.7031054496765137,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.35212498903274536,
"step": 54
},
{
"completion_length": 273.75,
"epoch": 0.39285714285714285,
"grad_norm": 0.19145628809928894,
"kl": 0.16191110014915466,
"learning_rate": 3.803588008448745e-05,
"loss": 0.0065,
"reward": 0.6318750381469727,
"reward_std": 0.7018797993659973,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.38187500834465027,
"step": 55
},
{
"completion_length": 281.25,
"epoch": 0.4,
"grad_norm": 0.30190473794937134,
"kl": 0.1685158759355545,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0067,
"reward": 0.8665000200271606,
"reward_std": 0.9249003529548645,
"rewards/correctness_reward_func": 0.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36650002002716064,
"step": 56
},
{
"completion_length": 340.75,
"epoch": 0.40714285714285714,
"grad_norm": 0.15627902746200562,
"kl": 0.12346489727497101,
"learning_rate": 3.695634946553296e-05,
"loss": 0.0049,
"reward": 0.32987499237060547,
"reward_std": 0.06595548242330551,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.32987502217292786,
"step": 57
},
{
"completion_length": 993.25,
"epoch": 0.4142857142857143,
"grad_norm": 0.0931173712015152,
"kl": 0.018140610307455063,
"learning_rate": 3.6405266433829075e-05,
"loss": 0.0007,
"reward": 0.4088750183582306,
"reward_std": 0.2790803611278534,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1588750034570694,
"step": 58
},
{
"completion_length": 474.875,
"epoch": 0.42142857142857143,
"grad_norm": 0.10973533242940903,
"kl": 0.09137643128633499,
"learning_rate": 3.5847093477938956e-05,
"loss": 0.0037,
"reward": 0.42787498235702515,
"reward_std": 0.17703060805797577,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36537498235702515,
"step": 59
},
{
"completion_length": 326.0,
"epoch": 0.42857142857142855,
"grad_norm": 0.19174934923648834,
"kl": 0.13062816858291626,
"learning_rate": 3.5282177578265296e-05,
"loss": 0.0052,
"reward": 0.6150000095367432,
"reward_std": 0.7071067690849304,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36500000953674316,
"step": 60
},
{
"completion_length": 180.125,
"epoch": 0.4357142857142857,
"grad_norm": 0.48899680376052856,
"kl": 0.4511120319366455,
"learning_rate": 3.471086990686737e-05,
"loss": 0.018,
"reward": 2.324000120162964,
"reward_std": 0.11677447706460953,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3240000009536743,
"step": 61
},
{
"completion_length": 442.0,
"epoch": 0.44285714285714284,
"grad_norm": 0.14630813896656036,
"kl": 0.11019708961248398,
"learning_rate": 3.413352560915988e-05,
"loss": 0.0044,
"reward": 0.17625001072883606,
"reward_std": 0.5338656306266785,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17625001072883606,
"step": 62
},
{
"completion_length": 337.625,
"epoch": 0.45,
"grad_norm": 0.15681755542755127,
"kl": 0.1342012584209442,
"learning_rate": 3.355050358314172e-05,
"loss": 0.0054,
"reward": 1.1151249408721924,
"reward_std": 1.0349948406219482,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36512500047683716,
"step": 63
},
{
"completion_length": 255.375,
"epoch": 0.45714285714285713,
"grad_norm": 0.15988513827323914,
"kl": 0.14537671208381653,
"learning_rate": 3.2962166256292113e-05,
"loss": 0.0059,
"reward": 2.36537504196167,
"reward_std": 0.0005174623220227659,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36537498235702515,
"step": 64
},
{
"completion_length": 303.875,
"epoch": 0.4642857142857143,
"grad_norm": 0.6239961981773376,
"kl": 0.10595186054706573,
"learning_rate": 3.2368879360272606e-05,
"loss": 0.0042,
"reward": 2.1156249046325684,
"reward_std": 0.7073594331741333,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3656249940395355,
"step": 65
},
{
"completion_length": 554.625,
"epoch": 0.4714285714285714,
"grad_norm": 0.0981917455792427,
"kl": 0.06483708322048187,
"learning_rate": 3.177101170357513e-05,
"loss": 0.0026,
"reward": 0.33550000190734863,
"reward_std": 0.08505627512931824,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.33550000190734863,
"step": 66
},
{
"completion_length": 164.0,
"epoch": 0.4785714285714286,
"grad_norm": 0.2880488634109497,
"kl": 0.24433362483978271,
"learning_rate": 3.116893494225734e-05,
"loss": 0.0098,
"reward": 0.41474997997283936,
"reward_std": 0.3112049102783203,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.35224997997283936,
"step": 67
},
{
"completion_length": 327.375,
"epoch": 0.4857142857142857,
"grad_norm": 0.15951593220233917,
"kl": 0.09316325187683105,
"learning_rate": 3.056302334890786e-05,
"loss": 0.0037,
"reward": 1.013374924659729,
"reward_std": 0.8874584436416626,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2633749842643738,
"step": 68
},
{
"completion_length": 183.625,
"epoch": 0.4928571428571429,
"grad_norm": 0.19407892227172852,
"kl": 0.2086183726787567,
"learning_rate": 2.9953653579984942e-05,
"loss": 0.0083,
"reward": 2.365499973297119,
"reward_std": 0.0005344200180843472,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36549997329711914,
"step": 69
},
{
"completion_length": 260.375,
"epoch": 0.5,
"grad_norm": 0.2512143850326538,
"kl": 0.14164677262306213,
"learning_rate": 2.9341204441673266e-05,
"loss": 0.0057,
"reward": 2.020250082015991,
"reward_std": 0.7143486142158508,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.27024999260902405,
"step": 70
},
{
"completion_length": 191.125,
"epoch": 0.5071428571428571,
"grad_norm": 0.3029208779335022,
"kl": 0.11700859665870667,
"learning_rate": 2.872605665440436e-05,
"loss": 0.0047,
"reward": 2.012125015258789,
"reward_std": 0.7928214073181152,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.44962501525878906,
"step": 71
},
{
"completion_length": 500.5,
"epoch": 0.5142857142857142,
"grad_norm": 0.12518590688705444,
"kl": 0.02341982163488865,
"learning_rate": 2.8108592616187133e-05,
"loss": 0.0009,
"reward": -0.19187498092651367,
"reward_std": 0.7031666040420532,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19187499582767487,
"step": 72
},
{
"completion_length": 710.25,
"epoch": 0.5214285714285715,
"grad_norm": 0.11030412465333939,
"kl": 0.01801430992782116,
"learning_rate": 2.748919616489542e-05,
"loss": 0.0007,
"reward": -0.19962501525878906,
"reward_std": 1.141786813735962,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3246249854564667,
"step": 73
},
{
"completion_length": 395.0,
"epoch": 0.5285714285714286,
"grad_norm": 0.08873239159584045,
"kl": 0.02234470844268799,
"learning_rate": 2.686825233966061e-05,
"loss": 0.0009,
"reward": 1.4856250286102295,
"reward_std": 0.9701153039932251,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2643750011920929,
"step": 74
},
{
"completion_length": 386.125,
"epoch": 0.5357142857142857,
"grad_norm": 0.09125195443630219,
"kl": 0.029970454052090645,
"learning_rate": 2.624614714151743e-05,
"loss": 0.0012,
"reward": 1.2940000295639038,
"reward_std": 0.5996603965759277,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.45600003004074097,
"step": 75
},
{
"completion_length": 239.375,
"epoch": 0.5428571428571428,
"grad_norm": 0.1535108983516693,
"kl": 0.05824340879917145,
"learning_rate": 2.5623267293451826e-05,
"loss": 0.0023,
"reward": 1.394374966621399,
"reward_std": 0.8498432040214539,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10562500357627869,
"step": 76
},
{
"completion_length": 351.625,
"epoch": 0.55,
"grad_norm": 0.15809431672096252,
"kl": 0.02210851013660431,
"learning_rate": 2.5e-05,
"loss": 0.0009,
"reward": 1.4718749523162842,
"reward_std": 1.3246859312057495,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.02812500298023224,
"step": 77
},
{
"completion_length": 199.375,
"epoch": 0.5571428571428572,
"grad_norm": 0.20733776688575745,
"kl": 0.06919591128826141,
"learning_rate": 2.4376732706548183e-05,
"loss": 0.0028,
"reward": 2.0334999561309814,
"reward_std": 0.9354116320610046,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.22100000083446503,
"step": 78
},
{
"completion_length": 490.625,
"epoch": 0.5642857142857143,
"grad_norm": 0.3020256459712982,
"kl": 0.020964641124010086,
"learning_rate": 2.375385285848257e-05,
"loss": 0.0008,
"reward": 0.2606250047683716,
"reward_std": 1.5538617372512817,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6143749952316284,
"step": 79
},
{
"completion_length": 257.875,
"epoch": 0.5714285714285714,
"grad_norm": 0.17333918809890747,
"kl": 0.040599275380373,
"learning_rate": 2.3131747660339394e-05,
"loss": 0.0016,
"reward": 1.8927500247955322,
"reward_std": 0.6703715324401855,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14274999499320984,
"step": 80
},
{
"completion_length": 292.625,
"epoch": 0.5785714285714286,
"grad_norm": 0.14176879823207855,
"kl": 0.03724910691380501,
"learning_rate": 2.251080383510459e-05,
"loss": 0.0015,
"reward": 2.164875030517578,
"reward_std": 0.35948193073272705,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16487500071525574,
"step": 81
},
{
"completion_length": 500.25,
"epoch": 0.5857142857142857,
"grad_norm": 0.1182783842086792,
"kl": 0.023212479427456856,
"learning_rate": 2.189140738381288e-05,
"loss": 0.0009,
"reward": 0.7456250190734863,
"reward_std": 1.171677589416504,
"rewards/correctness_reward_func": 0.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24562500417232513,
"step": 82
},
{
"completion_length": 487.125,
"epoch": 0.5928571428571429,
"grad_norm": 0.12226809561252594,
"kl": 0.019420940428972244,
"learning_rate": 2.1273943345595637e-05,
"loss": 0.0008,
"reward": 0.3148750066757202,
"reward_std": 1.1947816610336304,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06487500667572021,
"step": 83
},
{
"completion_length": 152.875,
"epoch": 0.6,
"grad_norm": 0.27960774302482605,
"kl": 0.07769262790679932,
"learning_rate": 2.0658795558326743e-05,
"loss": 0.0031,
"reward": 2.5625,
"reward_std": 0.1767766922712326,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5,
"step": 84
},
{
"completion_length": 235.75,
"epoch": 0.6071428571428571,
"grad_norm": 0.2650700807571411,
"kl": 0.0834224745631218,
"learning_rate": 2.0046346420015067e-05,
"loss": 0.0033,
"reward": 2.243499994277954,
"reward_std": 0.5382997393608093,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.4309999942779541,
"step": 85
},
{
"completion_length": 324.0,
"epoch": 0.6142857142857143,
"grad_norm": 0.00965797994285822,
"kl": 0.04011977091431618,
"learning_rate": 1.9436976651092144e-05,
"loss": 0.0016,
"reward": 0.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 86
},
{
"completion_length": 227.0,
"epoch": 0.6214285714285714,
"grad_norm": 0.19126032292842865,
"kl": 0.12643316388130188,
"learning_rate": 1.8831065057742657e-05,
"loss": 0.0051,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 87
},
{
"completion_length": 510.75,
"epoch": 0.6285714285714286,
"grad_norm": 0.003844304708763957,
"kl": 0.016257166862487793,
"learning_rate": 1.8228988296424877e-05,
"loss": 0.0007,
"reward": 0.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 88
},
{
"completion_length": 243.0,
"epoch": 0.6357142857142857,
"grad_norm": 0.008268176577985287,
"kl": 0.045144014060497284,
"learning_rate": 1.7631120639727393e-05,
"loss": 0.0018,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 89
},
{
"completion_length": 235.25,
"epoch": 0.6428571428571429,
"grad_norm": 0.14653581380844116,
"kl": 0.038456518203020096,
"learning_rate": 1.7037833743707892e-05,
"loss": 0.0015,
"reward": 2.375,
"reward_std": 0.5824823379516602,
"rewards/correctness_reward_func": 1.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.5,
"step": 90
},
{
"completion_length": 220.125,
"epoch": 0.65,
"grad_norm": 0.14805400371551514,
"kl": 0.07326260954141617,
"learning_rate": 1.6449496416858284e-05,
"loss": 0.0029,
"reward": 1.3125,
"reward_std": 1.1319231986999512,
"rewards/correctness_reward_func": 0.75,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5,
"step": 91
},
{
"completion_length": 449.0,
"epoch": 0.6571428571428571,
"grad_norm": 0.11858955770730972,
"kl": 0.027846258133649826,
"learning_rate": 1.5866474390840125e-05,
"loss": 0.0011,
"reward": 0.75,
"reward_std": 0.7071067690849304,
"rewards/correctness_reward_func": 0.25,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 92
},
{
"completion_length": 219.5,
"epoch": 0.6642857142857143,
"grad_norm": 0.21257686614990234,
"kl": 0.058622974902391434,
"learning_rate": 1.5289130093132632e-05,
"loss": 0.0023,
"reward": 1.046875,
"reward_std": 0.9159731268882751,
"rewards/correctness_reward_func": 0.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.484375,
"step": 93
},
{
"completion_length": 191.125,
"epoch": 0.6714285714285714,
"grad_norm": 0.017704889178276062,
"kl": 0.06602377444505692,
"learning_rate": 1.4717822421734718e-05,
"loss": 0.0026,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 94
},
{
"completion_length": 727.75,
"epoch": 0.6785714285714286,
"grad_norm": 0.07640424370765686,
"kl": 0.015565132722258568,
"learning_rate": 1.4152906522061048e-05,
"loss": 0.0006,
"reward": 1.875,
"reward_std": 1.0264363288879395,
"rewards/correctness_reward_func": 1.25,
"rewards/length_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 95
},
{
"completion_length": 333.625,
"epoch": 0.6857142857142857,
"grad_norm": 0.011164901778101921,
"kl": 0.03891594707965851,
"learning_rate": 1.3594733566170926e-05,
"loss": 0.0016,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 96
},
{
"completion_length": 795.5,
"epoch": 0.6928571428571428,
"grad_norm": 0.07001640647649765,
"kl": 0.014953548088669777,
"learning_rate": 1.3043650534467053e-05,
"loss": 0.0006,
"reward": 1.203125,
"reward_std": 0.9863747954368591,
"rewards/correctness_reward_func": 0.5,
"rewards/length_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.453125,
"step": 97
},
{
"completion_length": 342.5,
"epoch": 0.7,
"grad_norm": 0.015015755780041218,
"kl": 0.03935668244957924,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.0016,
"reward": 2.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 2.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 98
},
{
"completion_length": 550.25,
"epoch": 0.7071428571428572,
"grad_norm": 0.10194458067417145,
"kl": 0.022631347179412842,
"learning_rate": 1.196411991551255e-05,
"loss": 0.0009,
"reward": 1.5,
"reward_std": 1.0690449476242065,
"rewards/correctness_reward_func": 1.0,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 99
},
{
"completion_length": 247.375,
"epoch": 0.7142857142857143,
"grad_norm": 0.15261337161064148,
"kl": 0.08087805658578873,
"learning_rate": 1.1436343403356017e-05,
"loss": 0.0032,
"reward": 2.0625,
"reward_std": 0.979704737663269,
"rewards/correctness_reward_func": 1.5,
"rewards/length_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 140,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}