|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7142857142857143, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 200.0, |
|
"epoch": 0.007142857142857143, |
|
"grad_norm": 0.00043654805631376803, |
|
"kl": 1.1190734767296817e-05, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.125, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 292.875, |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.1742347776889801, |
|
"kl": 5.753119239670923e-06, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 0.0, |
|
"reward": -0.3997499942779541, |
|
"reward_std": 0.15906040370464325, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.3997499942779541, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 445.5, |
|
"epoch": 0.02142857142857143, |
|
"grad_norm": 0.0913790687918663, |
|
"kl": 5.179101663088659e-06, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.0, |
|
"reward": -0.7059999704360962, |
|
"reward_std": 0.729467511177063, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.9559999704360962, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 397.875, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.18434225022792816, |
|
"kl": 1.612883534107823e-05, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 0.0, |
|
"reward": -0.7477500438690186, |
|
"reward_std": 0.5131211876869202, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.7477500438690186, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 374.25, |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 0.15434053540229797, |
|
"kl": 3.564520739018917e-05, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.0, |
|
"reward": -0.4762499928474426, |
|
"reward_std": 0.7022607922554016, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.7262499928474426, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 264.25, |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.14609746634960175, |
|
"kl": 0.0002129438507836312, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.0, |
|
"reward": -0.1522500216960907, |
|
"reward_std": 0.6662349700927734, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.4022500216960907, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 221.5, |
|
"epoch": 0.05, |
|
"grad_norm": 0.22101597487926483, |
|
"kl": 0.0012037234846502542, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0, |
|
"reward": -0.11937500536441803, |
|
"reward_std": 0.2714921534061432, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11937499791383743, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 247.875, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.22249825298786163, |
|
"kl": 0.0021207458339631557, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.0001, |
|
"reward": -0.19712500274181366, |
|
"reward_std": 0.36591196060180664, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.19712500274181366, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 511.75, |
|
"epoch": 0.06428571428571428, |
|
"grad_norm": 0.11614850163459778, |
|
"kl": 0.0037874511908739805, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 0.0002, |
|
"reward": -0.7689999938011169, |
|
"reward_std": 1.3408536911010742, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -1.0189999341964722, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 221.625, |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.21436981856822968, |
|
"kl": 0.012746547348797321, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.0005, |
|
"reward": -0.23862498998641968, |
|
"reward_std": 0.11265870183706284, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.23862498998641968, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 292.75, |
|
"epoch": 0.07857142857142857, |
|
"grad_norm": 0.19347621500492096, |
|
"kl": 0.011033562943339348, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 0.0004, |
|
"reward": -0.4468750059604645, |
|
"reward_std": 0.16547630727291107, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.4468749761581421, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 110.625, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.44592323899269104, |
|
"kl": 0.060783885419368744, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.0024, |
|
"reward": 1.424625039100647, |
|
"reward_std": 1.0278345346450806, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.17462500929832458, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 678.75, |
|
"epoch": 0.09285714285714286, |
|
"grad_norm": 0.1104876920580864, |
|
"kl": 0.013322519138455391, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 0.0005, |
|
"reward": 0.468500018119812, |
|
"reward_std": 0.9677569270133972, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/length_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.656499981880188, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 495.25, |
|
"epoch": 0.1, |
|
"grad_norm": 0.13062289357185364, |
|
"kl": 0.021864598616957664, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0009, |
|
"reward": -0.5831249952316284, |
|
"reward_std": 0.676399827003479, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.5831249952316284, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 781.375, |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 0.09523279964923859, |
|
"kl": 0.013723693788051605, |
|
"learning_rate": 4.999222955002041e-05, |
|
"loss": 0.0005, |
|
"reward": -0.13512498140335083, |
|
"reward_std": 0.9264968633651733, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13512498140335083, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 405.5, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.15296146273612976, |
|
"kl": 0.03523610904812813, |
|
"learning_rate": 4.996892303047306e-05, |
|
"loss": 0.0014, |
|
"reward": 2.193000078201294, |
|
"reward_std": 0.43767958879470825, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.19300000369548798, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 214.75, |
|
"epoch": 0.12142857142857143, |
|
"grad_norm": 0.2985721230506897, |
|
"kl": 0.0879557877779007, |
|
"learning_rate": 4.9930094929529506e-05, |
|
"loss": 0.0035, |
|
"reward": -0.011874988675117493, |
|
"reward_std": 0.6198051571846008, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.011875003576278687, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 121.25, |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.417566180229187, |
|
"kl": 0.22521352767944336, |
|
"learning_rate": 4.987576938413504e-05, |
|
"loss": 0.009, |
|
"reward": 2.4171249866485596, |
|
"reward_std": 0.1779337227344513, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.41712498664855957, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 179.0, |
|
"epoch": 0.1357142857142857, |
|
"grad_norm": 0.285080224275589, |
|
"kl": 0.10962098836898804, |
|
"learning_rate": 4.9805980165004304e-05, |
|
"loss": 0.0044, |
|
"reward": 1.75, |
|
"reward_std": 1.0350983142852783, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 282.875, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.27518561482429504, |
|
"kl": 0.06473983079195023, |
|
"learning_rate": 4.972077065562821e-05, |
|
"loss": 0.0026, |
|
"reward": 1.2120000123977661, |
|
"reward_std": 1.0715053081512451, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4620000123977661, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 146.0, |
|
"epoch": 0.15, |
|
"grad_norm": 0.01954607106745243, |
|
"kl": 0.1001419946551323, |
|
"learning_rate": 4.962019382530521e-05, |
|
"loss": 0.004, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 233.125, |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.019903944805264473, |
|
"kl": 0.0685255229473114, |
|
"learning_rate": 4.9504312196213596e-05, |
|
"loss": 0.0027, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 199.125, |
|
"epoch": 0.16428571428571428, |
|
"grad_norm": 0.022628186270594597, |
|
"kl": 0.08269491791725159, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 0.0033, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 388.25, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.12237099558115005, |
|
"kl": 0.031046129763126373, |
|
"learning_rate": 4.922693215572695e-05, |
|
"loss": 0.0012, |
|
"reward": 1.5, |
|
"reward_std": 1.0690449476242065, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 262.875, |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 0.026575006544589996, |
|
"kl": 0.08844916522502899, |
|
"learning_rate": 4.90656061737503e-05, |
|
"loss": 0.0035, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 290.125, |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.5788246989250183, |
|
"kl": 0.06681957095861435, |
|
"learning_rate": 4.888932014465352e-05, |
|
"loss": 0.0027, |
|
"reward": 0.375, |
|
"reward_std": 0.2314550280570984, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.375, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 224.375, |
|
"epoch": 0.19285714285714287, |
|
"grad_norm": 0.4455007314682007, |
|
"kl": 0.2504517138004303, |
|
"learning_rate": 4.86981836541783e-05, |
|
"loss": 0.01, |
|
"reward": 1.625, |
|
"reward_std": 1.2174328565597534, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.375, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 232.125, |
|
"epoch": 0.2, |
|
"grad_norm": 0.23774927854537964, |
|
"kl": 0.1069759875535965, |
|
"learning_rate": 4.849231551964771e-05, |
|
"loss": 0.0043, |
|
"reward": 1.921875, |
|
"reward_std": 0.8937718272209167, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.421875, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 458.625, |
|
"epoch": 0.20714285714285716, |
|
"grad_norm": 0.09729224443435669, |
|
"kl": 0.03021763078868389, |
|
"learning_rate": 4.827184371610511e-05, |
|
"loss": 0.0012, |
|
"reward": 1.75, |
|
"reward_std": 1.0350983142852783, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 422.375, |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.12195795774459839, |
|
"kl": 0.04913492873311043, |
|
"learning_rate": 4.803690529676019e-05, |
|
"loss": 0.002, |
|
"reward": 2.25, |
|
"reward_std": 0.5345224738121033, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 270.875, |
|
"epoch": 0.22142857142857142, |
|
"grad_norm": 0.21031659841537476, |
|
"kl": 0.0698024183511734, |
|
"learning_rate": 4.778764630779183e-05, |
|
"loss": 0.0028, |
|
"reward": 2.21875, |
|
"reward_std": 0.6999680995941162, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 377.5, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.010702410712838173, |
|
"kl": 0.037429843097925186, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 0.0015, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 360.125, |
|
"epoch": 0.2357142857142857, |
|
"grad_norm": 0.14157415926456451, |
|
"kl": 0.08244010806083679, |
|
"learning_rate": 4.724679522028672e-05, |
|
"loss": 0.0033, |
|
"reward": 1.6486248970031738, |
|
"reward_std": 1.1074291467666626, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3986250162124634, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 116.125, |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 1.3088772296905518, |
|
"kl": 0.7027589082717896, |
|
"learning_rate": 4.6955539334255716e-05, |
|
"loss": 0.0281, |
|
"reward": 1.75, |
|
"reward_std": 1.0350983142852783, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 311.0, |
|
"epoch": 0.25, |
|
"grad_norm": 0.7312092781066895, |
|
"kl": 0.07371841371059418, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 0.0029, |
|
"reward": 0.6698750257492065, |
|
"reward_std": 0.7472349405288696, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.41987499594688416, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 329.625, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.006386851891875267, |
|
"kl": 0.039320699870586395, |
|
"learning_rate": 4.6332272040803895e-05, |
|
"loss": 0.0016, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 365.875, |
|
"epoch": 0.2642857142857143, |
|
"grad_norm": 0.20290029048919678, |
|
"kl": 0.05483713746070862, |
|
"learning_rate": 4.600064807876929e-05, |
|
"loss": 0.0022, |
|
"reward": 1.6407499313354492, |
|
"reward_std": 1.0948845148086548, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.390749990940094, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 287.0, |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.19799265265464783, |
|
"kl": 0.13818374276161194, |
|
"learning_rate": 4.5655969357899874e-05, |
|
"loss": 0.0055, |
|
"reward": 1.875999927520752, |
|
"reward_std": 0.9384979605674744, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.37599998712539673, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 289.875, |
|
"epoch": 0.2785714285714286, |
|
"grad_norm": 0.37082406878471375, |
|
"kl": 0.14701411128044128, |
|
"learning_rate": 4.529845014289642e-05, |
|
"loss": 0.0059, |
|
"reward": 2.0653750896453857, |
|
"reward_std": 0.6956271529197693, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3153750002384186, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 725.125, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.11310902237892151, |
|
"kl": 0.05320233479142189, |
|
"learning_rate": 4.4928312680573064e-05, |
|
"loss": 0.0021, |
|
"reward": 0.5361250042915344, |
|
"reward_std": 0.2760908007621765, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.1875, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3486250042915344, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 373.625, |
|
"epoch": 0.29285714285714287, |
|
"grad_norm": 0.005955233704298735, |
|
"kl": 0.09833689779043198, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 0.0039, |
|
"reward": 0.36500000953674316, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36500000953674316, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 337.75, |
|
"epoch": 0.3, |
|
"grad_norm": 0.18655544519424438, |
|
"kl": 0.13665920495986938, |
|
"learning_rate": 4.415111107797445e-05, |
|
"loss": 0.0055, |
|
"reward": 1.3464999198913574, |
|
"reward_std": 1.0498690605163574, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3465000092983246, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 463.75, |
|
"epoch": 0.30714285714285716, |
|
"grad_norm": 0.18999366462230682, |
|
"kl": 0.1055900901556015, |
|
"learning_rate": 4.374453007419336e-05, |
|
"loss": 0.0042, |
|
"reward": 0.3206250071525574, |
|
"reward_std": 0.08869842439889908, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3206250071525574, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 350.25, |
|
"epoch": 0.3142857142857143, |
|
"grad_norm": 0.16436553001403809, |
|
"kl": 0.12820430099964142, |
|
"learning_rate": 4.332629679574566e-05, |
|
"loss": 0.0051, |
|
"reward": 2.3486249446868896, |
|
"reward_std": 0.04361177235841751, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3486250042915344, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 396.625, |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 0.12755045294761658, |
|
"kl": 0.10351184010505676, |
|
"learning_rate": 4.2896671231492966e-05, |
|
"loss": 0.0041, |
|
"reward": 2.1152501106262207, |
|
"reward_std": 0.707207977771759, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36524999141693115, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 323.0, |
|
"epoch": 0.32857142857142857, |
|
"grad_norm": 0.1433708518743515, |
|
"kl": 0.12643718719482422, |
|
"learning_rate": 4.245592045215182e-05, |
|
"loss": 0.0051, |
|
"reward": 2.085624933242798, |
|
"reward_std": 0.6997721791267395, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3356249928474426, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 465.875, |
|
"epoch": 0.3357142857142857, |
|
"grad_norm": 0.14527598023414612, |
|
"kl": 0.09910832345485687, |
|
"learning_rate": 4.2004318444272985e-05, |
|
"loss": 0.004, |
|
"reward": 1.0987499952316284, |
|
"reward_std": 1.0445955991744995, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3487499952316284, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 443.875, |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.11678742617368698, |
|
"kl": 0.09625618904829025, |
|
"learning_rate": 4.154214593992149e-05, |
|
"loss": 0.0039, |
|
"reward": 2.1332499980926514, |
|
"reward_std": 0.716006875038147, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.38324999809265137, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 298.875, |
|
"epoch": 0.35, |
|
"grad_norm": 0.2462267130613327, |
|
"kl": 0.14918765425682068, |
|
"learning_rate": 4.1069690242163484e-05, |
|
"loss": 0.006, |
|
"reward": 1.3640000820159912, |
|
"reward_std": 1.0677127838134766, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36400002241134644, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 604.125, |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.11957182735204697, |
|
"kl": 0.06581288576126099, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 0.0026, |
|
"reward": 0.42887499928474426, |
|
"reward_std": 0.1762550324201584, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36637499928474426, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 630.75, |
|
"epoch": 0.36428571428571427, |
|
"grad_norm": 0.20287089049816132, |
|
"kl": 0.15163259208202362, |
|
"learning_rate": 4.009511025813694e-05, |
|
"loss": 0.0061, |
|
"reward": 0.3474999964237213, |
|
"reward_std": 0.10380475223064423, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3475000262260437, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 409.875, |
|
"epoch": 0.37142857142857144, |
|
"grad_norm": 0.15633811056613922, |
|
"kl": 0.12077239155769348, |
|
"learning_rate": 3.959359180586975e-05, |
|
"loss": 0.0048, |
|
"reward": 1.6152499914169312, |
|
"reward_std": 1.0375232696533203, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36524999141693115, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 473.875, |
|
"epoch": 0.37857142857142856, |
|
"grad_norm": 0.1574457734823227, |
|
"kl": 0.08710946887731552, |
|
"learning_rate": 3.908300145159055e-05, |
|
"loss": 0.0035, |
|
"reward": 0.8807500004768372, |
|
"reward_std": 0.9173192381858826, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.38075000047683716, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 321.25, |
|
"epoch": 0.38571428571428573, |
|
"grad_norm": 0.19621425867080688, |
|
"kl": 0.17770174145698547, |
|
"learning_rate": 3.856365659664399e-05, |
|
"loss": 0.0071, |
|
"reward": 2.1021249294281006, |
|
"reward_std": 0.7031054496765137, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.35212498903274536, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 273.75, |
|
"epoch": 0.39285714285714285, |
|
"grad_norm": 0.19145628809928894, |
|
"kl": 0.16191110014915466, |
|
"learning_rate": 3.803588008448745e-05, |
|
"loss": 0.0065, |
|
"reward": 0.6318750381469727, |
|
"reward_std": 0.7018797993659973, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.38187500834465027, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 281.25, |
|
"epoch": 0.4, |
|
"grad_norm": 0.30190473794937134, |
|
"kl": 0.1685158759355545, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0067, |
|
"reward": 0.8665000200271606, |
|
"reward_std": 0.9249003529548645, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36650002002716064, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 340.75, |
|
"epoch": 0.40714285714285714, |
|
"grad_norm": 0.15627902746200562, |
|
"kl": 0.12346489727497101, |
|
"learning_rate": 3.695634946553296e-05, |
|
"loss": 0.0049, |
|
"reward": 0.32987499237060547, |
|
"reward_std": 0.06595548242330551, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.32987502217292786, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 993.25, |
|
"epoch": 0.4142857142857143, |
|
"grad_norm": 0.0931173712015152, |
|
"kl": 0.018140610307455063, |
|
"learning_rate": 3.6405266433829075e-05, |
|
"loss": 0.0007, |
|
"reward": 0.4088750183582306, |
|
"reward_std": 0.2790803611278534, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1588750034570694, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 474.875, |
|
"epoch": 0.42142857142857143, |
|
"grad_norm": 0.10973533242940903, |
|
"kl": 0.09137643128633499, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 0.0037, |
|
"reward": 0.42787498235702515, |
|
"reward_std": 0.17703060805797577, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36537498235702515, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 326.0, |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.19174934923648834, |
|
"kl": 0.13062816858291626, |
|
"learning_rate": 3.5282177578265296e-05, |
|
"loss": 0.0052, |
|
"reward": 0.6150000095367432, |
|
"reward_std": 0.7071067690849304, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36500000953674316, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 180.125, |
|
"epoch": 0.4357142857142857, |
|
"grad_norm": 0.48899680376052856, |
|
"kl": 0.4511120319366455, |
|
"learning_rate": 3.471086990686737e-05, |
|
"loss": 0.018, |
|
"reward": 2.324000120162964, |
|
"reward_std": 0.11677447706460953, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3240000009536743, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 442.0, |
|
"epoch": 0.44285714285714284, |
|
"grad_norm": 0.14630813896656036, |
|
"kl": 0.11019708961248398, |
|
"learning_rate": 3.413352560915988e-05, |
|
"loss": 0.0044, |
|
"reward": 0.17625001072883606, |
|
"reward_std": 0.5338656306266785, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.17625001072883606, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 337.625, |
|
"epoch": 0.45, |
|
"grad_norm": 0.15681755542755127, |
|
"kl": 0.1342012584209442, |
|
"learning_rate": 3.355050358314172e-05, |
|
"loss": 0.0054, |
|
"reward": 1.1151249408721924, |
|
"reward_std": 1.0349948406219482, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36512500047683716, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 255.375, |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.15988513827323914, |
|
"kl": 0.14537671208381653, |
|
"learning_rate": 3.2962166256292113e-05, |
|
"loss": 0.0059, |
|
"reward": 2.36537504196167, |
|
"reward_std": 0.0005174623220227659, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36537498235702515, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 303.875, |
|
"epoch": 0.4642857142857143, |
|
"grad_norm": 0.6239961981773376, |
|
"kl": 0.10595186054706573, |
|
"learning_rate": 3.2368879360272606e-05, |
|
"loss": 0.0042, |
|
"reward": 2.1156249046325684, |
|
"reward_std": 0.7073594331741333, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3656249940395355, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 554.625, |
|
"epoch": 0.4714285714285714, |
|
"grad_norm": 0.0981917455792427, |
|
"kl": 0.06483708322048187, |
|
"learning_rate": 3.177101170357513e-05, |
|
"loss": 0.0026, |
|
"reward": 0.33550000190734863, |
|
"reward_std": 0.08505627512931824, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.33550000190734863, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 164.0, |
|
"epoch": 0.4785714285714286, |
|
"grad_norm": 0.2880488634109497, |
|
"kl": 0.24433362483978271, |
|
"learning_rate": 3.116893494225734e-05, |
|
"loss": 0.0098, |
|
"reward": 0.41474997997283936, |
|
"reward_std": 0.3112049102783203, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.35224997997283936, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 327.375, |
|
"epoch": 0.4857142857142857, |
|
"grad_norm": 0.15951593220233917, |
|
"kl": 0.09316325187683105, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 0.0037, |
|
"reward": 1.013374924659729, |
|
"reward_std": 0.8874584436416626, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.2633749842643738, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 183.625, |
|
"epoch": 0.4928571428571429, |
|
"grad_norm": 0.19407892227172852, |
|
"kl": 0.2086183726787567, |
|
"learning_rate": 2.9953653579984942e-05, |
|
"loss": 0.0083, |
|
"reward": 2.365499973297119, |
|
"reward_std": 0.0005344200180843472, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36549997329711914, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 260.375, |
|
"epoch": 0.5, |
|
"grad_norm": 0.2512143850326538, |
|
"kl": 0.14164677262306213, |
|
"learning_rate": 2.9341204441673266e-05, |
|
"loss": 0.0057, |
|
"reward": 2.020250082015991, |
|
"reward_std": 0.7143486142158508, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.27024999260902405, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 191.125, |
|
"epoch": 0.5071428571428571, |
|
"grad_norm": 0.3029208779335022, |
|
"kl": 0.11700859665870667, |
|
"learning_rate": 2.872605665440436e-05, |
|
"loss": 0.0047, |
|
"reward": 2.012125015258789, |
|
"reward_std": 0.7928214073181152, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.44962501525878906, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 500.5, |
|
"epoch": 0.5142857142857142, |
|
"grad_norm": 0.12518590688705444, |
|
"kl": 0.02341982163488865, |
|
"learning_rate": 2.8108592616187133e-05, |
|
"loss": 0.0009, |
|
"reward": -0.19187498092651367, |
|
"reward_std": 0.7031666040420532, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.19187499582767487, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 710.25, |
|
"epoch": 0.5214285714285715, |
|
"grad_norm": 0.11030412465333939, |
|
"kl": 0.01801430992782116, |
|
"learning_rate": 2.748919616489542e-05, |
|
"loss": 0.0007, |
|
"reward": -0.19962501525878906, |
|
"reward_std": 1.141786813735962, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.3246249854564667, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 395.0, |
|
"epoch": 0.5285714285714286, |
|
"grad_norm": 0.08873239159584045, |
|
"kl": 0.02234470844268799, |
|
"learning_rate": 2.686825233966061e-05, |
|
"loss": 0.0009, |
|
"reward": 1.4856250286102295, |
|
"reward_std": 0.9701153039932251, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.2643750011920929, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 386.125, |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 0.09125195443630219, |
|
"kl": 0.029970454052090645, |
|
"learning_rate": 2.624614714151743e-05, |
|
"loss": 0.0012, |
|
"reward": 1.2940000295639038, |
|
"reward_std": 0.5996603965759277, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.45600003004074097, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 239.375, |
|
"epoch": 0.5428571428571428, |
|
"grad_norm": 0.1535108983516693, |
|
"kl": 0.05824340879917145, |
|
"learning_rate": 2.5623267293451826e-05, |
|
"loss": 0.0023, |
|
"reward": 1.394374966621399, |
|
"reward_std": 0.8498432040214539, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.10562500357627869, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 351.625, |
|
"epoch": 0.55, |
|
"grad_norm": 0.15809431672096252, |
|
"kl": 0.02210851013660431, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0009, |
|
"reward": 1.4718749523162842, |
|
"reward_std": 1.3246859312057495, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.02812500298023224, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 199.375, |
|
"epoch": 0.5571428571428572, |
|
"grad_norm": 0.20733776688575745, |
|
"kl": 0.06919591128826141, |
|
"learning_rate": 2.4376732706548183e-05, |
|
"loss": 0.0028, |
|
"reward": 2.0334999561309814, |
|
"reward_std": 0.9354116320610046, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.22100000083446503, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 490.625, |
|
"epoch": 0.5642857142857143, |
|
"grad_norm": 0.3020256459712982, |
|
"kl": 0.020964641124010086, |
|
"learning_rate": 2.375385285848257e-05, |
|
"loss": 0.0008, |
|
"reward": 0.2606250047683716, |
|
"reward_std": 1.5538617372512817, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.6143749952316284, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 257.875, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.17333918809890747, |
|
"kl": 0.040599275380373, |
|
"learning_rate": 2.3131747660339394e-05, |
|
"loss": 0.0016, |
|
"reward": 1.8927500247955322, |
|
"reward_std": 0.6703715324401855, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.14274999499320984, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 292.625, |
|
"epoch": 0.5785714285714286, |
|
"grad_norm": 0.14176879823207855, |
|
"kl": 0.03724910691380501, |
|
"learning_rate": 2.251080383510459e-05, |
|
"loss": 0.0015, |
|
"reward": 2.164875030517578, |
|
"reward_std": 0.35948193073272705, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.16487500071525574, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 500.25, |
|
"epoch": 0.5857142857142857, |
|
"grad_norm": 0.1182783842086792, |
|
"kl": 0.023212479427456856, |
|
"learning_rate": 2.189140738381288e-05, |
|
"loss": 0.0009, |
|
"reward": 0.7456250190734863, |
|
"reward_std": 1.171677589416504, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.24562500417232513, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 487.125, |
|
"epoch": 0.5928571428571429, |
|
"grad_norm": 0.12226809561252594, |
|
"kl": 0.019420940428972244, |
|
"learning_rate": 2.1273943345595637e-05, |
|
"loss": 0.0008, |
|
"reward": 0.3148750066757202, |
|
"reward_std": 1.1947816610336304, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.06487500667572021, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 152.875, |
|
"epoch": 0.6, |
|
"grad_norm": 0.27960774302482605, |
|
"kl": 0.07769262790679932, |
|
"learning_rate": 2.0658795558326743e-05, |
|
"loss": 0.0031, |
|
"reward": 2.5625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 235.75, |
|
"epoch": 0.6071428571428571, |
|
"grad_norm": 0.2650700807571411, |
|
"kl": 0.0834224745631218, |
|
"learning_rate": 2.0046346420015067e-05, |
|
"loss": 0.0033, |
|
"reward": 2.243499994277954, |
|
"reward_std": 0.5382997393608093, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.4309999942779541, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 324.0, |
|
"epoch": 0.6142857142857143, |
|
"grad_norm": 0.00965797994285822, |
|
"kl": 0.04011977091431618, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.0016, |
|
"reward": 0.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 227.0, |
|
"epoch": 0.6214285714285714, |
|
"grad_norm": 0.19126032292842865, |
|
"kl": 0.12643316388130188, |
|
"learning_rate": 1.8831065057742657e-05, |
|
"loss": 0.0051, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 510.75, |
|
"epoch": 0.6285714285714286, |
|
"grad_norm": 0.003844304708763957, |
|
"kl": 0.016257166862487793, |
|
"learning_rate": 1.8228988296424877e-05, |
|
"loss": 0.0007, |
|
"reward": 0.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 243.0, |
|
"epoch": 0.6357142857142857, |
|
"grad_norm": 0.008268176577985287, |
|
"kl": 0.045144014060497284, |
|
"learning_rate": 1.7631120639727393e-05, |
|
"loss": 0.0018, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 235.25, |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 0.14653581380844116, |
|
"kl": 0.038456518203020096, |
|
"learning_rate": 1.7037833743707892e-05, |
|
"loss": 0.0015, |
|
"reward": 2.375, |
|
"reward_std": 0.5824823379516602, |
|
"rewards/correctness_reward_func": 1.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 220.125, |
|
"epoch": 0.65, |
|
"grad_norm": 0.14805400371551514, |
|
"kl": 0.07326260954141617, |
|
"learning_rate": 1.6449496416858284e-05, |
|
"loss": 0.0029, |
|
"reward": 1.3125, |
|
"reward_std": 1.1319231986999512, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 449.0, |
|
"epoch": 0.6571428571428571, |
|
"grad_norm": 0.11858955770730972, |
|
"kl": 0.027846258133649826, |
|
"learning_rate": 1.5866474390840125e-05, |
|
"loss": 0.0011, |
|
"reward": 0.75, |
|
"reward_std": 0.7071067690849304, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 219.5, |
|
"epoch": 0.6642857142857143, |
|
"grad_norm": 0.21257686614990234, |
|
"kl": 0.058622974902391434, |
|
"learning_rate": 1.5289130093132632e-05, |
|
"loss": 0.0023, |
|
"reward": 1.046875, |
|
"reward_std": 0.9159731268882751, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 191.125, |
|
"epoch": 0.6714285714285714, |
|
"grad_norm": 0.017704889178276062, |
|
"kl": 0.06602377444505692, |
|
"learning_rate": 1.4717822421734718e-05, |
|
"loss": 0.0026, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 727.75, |
|
"epoch": 0.6785714285714286, |
|
"grad_norm": 0.07640424370765686, |
|
"kl": 0.015565132722258568, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 0.0006, |
|
"reward": 1.875, |
|
"reward_std": 1.0264363288879395, |
|
"rewards/correctness_reward_func": 1.25, |
|
"rewards/length_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 333.625, |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.011164901778101921, |
|
"kl": 0.03891594707965851, |
|
"learning_rate": 1.3594733566170926e-05, |
|
"loss": 0.0016, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 795.5, |
|
"epoch": 0.6928571428571428, |
|
"grad_norm": 0.07001640647649765, |
|
"kl": 0.014953548088669777, |
|
"learning_rate": 1.3043650534467053e-05, |
|
"loss": 0.0006, |
|
"reward": 1.203125, |
|
"reward_std": 0.9863747954368591, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/length_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.453125, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 342.5, |
|
"epoch": 0.7, |
|
"grad_norm": 0.015015755780041218, |
|
"kl": 0.03935668244957924, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 0.0016, |
|
"reward": 2.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 2.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 550.25, |
|
"epoch": 0.7071428571428572, |
|
"grad_norm": 0.10194458067417145, |
|
"kl": 0.022631347179412842, |
|
"learning_rate": 1.196411991551255e-05, |
|
"loss": 0.0009, |
|
"reward": 1.5, |
|
"reward_std": 1.0690449476242065, |
|
"rewards/correctness_reward_func": 1.0, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 247.375, |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.15261337161064148, |
|
"kl": 0.08087805658578873, |
|
"learning_rate": 1.1436343403356017e-05, |
|
"loss": 0.0032, |
|
"reward": 2.0625, |
|
"reward_std": 0.979704737663269, |
|
"rewards/correctness_reward_func": 1.5, |
|
"rewards/length_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 140, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|