|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3189326556543837, |
|
"eval_steps": 500, |
|
"global_step": 502, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.0, |
|
"epoch": 0.0006353240152477764, |
|
"grad_norm": 3.4243216514587402, |
|
"kl": 0.0, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.0, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 337.14288330078125, |
|
"epoch": 0.0012706480304955528, |
|
"grad_norm": 5.606422424316406, |
|
"kl": 0.0, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.09759000688791275, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.2857666015625, |
|
"epoch": 0.0019059720457433292, |
|
"grad_norm": 3.686295986175537, |
|
"kl": 0.0004425048828125, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0, |
|
"reward": 0.26031747460365295, |
|
"reward_std": 0.19519509375095367, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0317460335791111, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.4285888671875, |
|
"epoch": 0.0025412960609911056, |
|
"grad_norm": 3.40989089012146, |
|
"kl": 0.0004863739013671875, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0, |
|
"reward": 0.1714285910129547, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.4285714626312256, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.4285714626312256, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 376.4285888671875, |
|
"epoch": 0.0031766200762388818, |
|
"grad_norm": 4.474236011505127, |
|
"kl": 0.0008697509765625, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.40000003576278687, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.71429443359375, |
|
"epoch": 0.0038119440914866584, |
|
"grad_norm": 4.529554843902588, |
|
"kl": 0.0179443359375, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0002, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.0000305175781, |
|
"epoch": 0.0044472681067344345, |
|
"grad_norm": 3.3631019592285156, |
|
"kl": 0.00128936767578125, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.0, |
|
"reward": 0.8025974631309509, |
|
"reward_std": 0.4480050206184387, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.40259745717048645, |
|
"rewards/format_reward": 1.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 765.0000610351562, |
|
"epoch": 0.005082592121982211, |
|
"grad_norm": 3.205476999282837, |
|
"kl": 0.000675201416015625, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 0.1714285910129547, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.4285714626312256, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.4285714626312256, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.5714721679688, |
|
"epoch": 0.005717916137229987, |
|
"grad_norm": 0.008113108575344086, |
|
"kl": 0.001678466796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 934.1428833007812, |
|
"epoch": 0.0063532401524777635, |
|
"grad_norm": 2.805612087249756, |
|
"kl": 0.00390625, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 733.4285888671875, |
|
"epoch": 0.00698856416772554, |
|
"grad_norm": 0.014334071427583694, |
|
"kl": 0.0016326904296875, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.0003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 545.5714721679688, |
|
"epoch": 0.007623888182973317, |
|
"grad_norm": 3.0349507331848145, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.7142944335938, |
|
"epoch": 0.008259212198221092, |
|
"grad_norm": 2.7579803466796875, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.5714416503906, |
|
"epoch": 0.008894536213468869, |
|
"grad_norm": 0.06331096589565277, |
|
"kl": 0.0101318359375, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0004, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.5714721679688, |
|
"epoch": 0.009529860228716646, |
|
"grad_norm": 1.8358005285263062, |
|
"kl": 0.0576171875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0009, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 528.4285888671875, |
|
"epoch": 0.010165184243964422, |
|
"grad_norm": 5.376899242401123, |
|
"kl": 0.173828125, |
|
"learning_rate": 4.999952797253148e-06, |
|
"loss": 0.0017, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 390.5714416503906, |
|
"epoch": 0.010800508259212199, |
|
"grad_norm": 5.1895365715026855, |
|
"kl": 0.1767578125, |
|
"learning_rate": 4.9998111909931225e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5571428537368774, |
|
"reward_std": 0.12051477283239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.2142857313156128, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.5714416503906, |
|
"epoch": 0.011435832274459974, |
|
"grad_norm": 3.5275230407714844, |
|
"kl": 0.05126953125, |
|
"learning_rate": 4.999575187161439e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6000000238418579, |
|
"reward_std": 0.432049423456192, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.1428833007812, |
|
"epoch": 0.01207115628970775, |
|
"grad_norm": 3.147331953048706, |
|
"kl": 0.04541015625, |
|
"learning_rate": 4.9992447956603455e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 330.14288330078125, |
|
"epoch": 0.012706480304955527, |
|
"grad_norm": 108.5281753540039, |
|
"kl": 3.046875, |
|
"learning_rate": 4.998820030352409e-06, |
|
"loss": 0.0305, |
|
"reward": 0.6884711980819702, |
|
"reward_std": 0.4107622504234314, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.31704264879226685, |
|
"rewards/format_reward": 1.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 359.0000305175781, |
|
"epoch": 0.013341804320203304, |
|
"grad_norm": 5.423805236816406, |
|
"kl": 0.1591796875, |
|
"learning_rate": 4.998300909059929e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5269841551780701, |
|
"reward_std": 0.1777612417936325, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1269841343164444, |
|
"rewards/format_reward": 1.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 491.2857360839844, |
|
"epoch": 0.01397712833545108, |
|
"grad_norm": 1545.3717041015625, |
|
"kl": 23.0, |
|
"learning_rate": 4.997687453564198e-06, |
|
"loss": 0.2309, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.6078847646713257, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.5714416503906, |
|
"epoch": 0.014612452350698857, |
|
"grad_norm": 8.096309661865234, |
|
"kl": 0.69140625, |
|
"learning_rate": 4.9969796896045775e-06, |
|
"loss": 0.0069, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.10690449923276901, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.1428833007812, |
|
"epoch": 0.015247776365946633, |
|
"grad_norm": 45758.625, |
|
"kl": 608.0, |
|
"learning_rate": 4.996177646877426e-06, |
|
"loss": 6.0704, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 605.1428833007812, |
|
"epoch": 0.01588310038119441, |
|
"grad_norm": 3.5074126720428467, |
|
"kl": 0.1083984375, |
|
"learning_rate": 4.995281359034851e-06, |
|
"loss": 0.0011, |
|
"reward": 0.05714286118745804, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.1428571492433548, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.1428571492433548, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.2857360839844, |
|
"epoch": 0.016518424396442185, |
|
"grad_norm": 9.928363800048828, |
|
"kl": 0.421875, |
|
"learning_rate": 4.994290863683296e-06, |
|
"loss": 0.0042, |
|
"reward": 0.11428572237491608, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.2857142984867096, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.2857142984867096, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 420.8571472167969, |
|
"epoch": 0.017153748411689963, |
|
"grad_norm": 30.34365463256836, |
|
"kl": 1.2890625, |
|
"learning_rate": 4.99320620238196e-06, |
|
"loss": 0.0129, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 618.7142944335938, |
|
"epoch": 0.017789072426937738, |
|
"grad_norm": 3.0826454162597656, |
|
"kl": 0.051513671875, |
|
"learning_rate": 4.99202742064106e-06, |
|
"loss": 0.0005, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.5714721679688, |
|
"epoch": 0.018424396442185513, |
|
"grad_norm": 3.330746650695801, |
|
"kl": 0.07666015625, |
|
"learning_rate": 4.990754567919917e-06, |
|
"loss": 0.0008, |
|
"reward": 0.4571428894996643, |
|
"reward_std": 0.35989415645599365, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.1714285910129547, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.2857360839844, |
|
"epoch": 0.01905972045743329, |
|
"grad_norm": 3.658856153488159, |
|
"kl": 0.1318359375, |
|
"learning_rate": 4.989387697624881e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3086913228034973, |
|
"reward_std": 0.20198491215705872, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.02297702245414257, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 764.0000610351562, |
|
"epoch": 0.019695044472681066, |
|
"grad_norm": 2.7840328216552734, |
|
"kl": 0.078125, |
|
"learning_rate": 4.987926867107095e-06, |
|
"loss": 0.0008, |
|
"reward": 0.31462588906288147, |
|
"reward_std": 0.19902175664901733, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.028911566361784935, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.4285888671875, |
|
"epoch": 0.020330368487928845, |
|
"grad_norm": 3.031372308731079, |
|
"kl": 0.078125, |
|
"learning_rate": 4.986372137660078e-06, |
|
"loss": 0.0008, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.22677868604660034, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.14288330078125, |
|
"epoch": 0.02096569250317662, |
|
"grad_norm": 3.5374386310577393, |
|
"kl": 0.107421875, |
|
"learning_rate": 4.984723574517165e-06, |
|
"loss": 0.0011, |
|
"reward": 0.3362637758255005, |
|
"reward_std": 0.17483238875865936, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.021978024393320084, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 510.71429443359375, |
|
"epoch": 0.021601016518424398, |
|
"grad_norm": 3.355870246887207, |
|
"kl": 0.20703125, |
|
"learning_rate": 4.9829812468487655e-06, |
|
"loss": 0.0021, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 617.7142944335938, |
|
"epoch": 0.022236340533672173, |
|
"grad_norm": 2.325124979019165, |
|
"kl": 0.0849609375, |
|
"learning_rate": 4.981145227759457e-06, |
|
"loss": 0.0008, |
|
"reward": 0.5142857432365417, |
|
"reward_std": 0.39761197566986084, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 329.4285888671875, |
|
"epoch": 0.022871664548919948, |
|
"grad_norm": 4.9638495445251465, |
|
"kl": 0.224609375, |
|
"learning_rate": 4.979215594284924e-06, |
|
"loss": 0.0022, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 734.5714721679688, |
|
"epoch": 0.023506988564167726, |
|
"grad_norm": 0.06015148386359215, |
|
"kl": 0.08154296875, |
|
"learning_rate": 4.977192427388722e-06, |
|
"loss": 0.0011, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.5714416503906, |
|
"epoch": 0.0241423125794155, |
|
"grad_norm": 0.20115888118743896, |
|
"kl": 0.12109375, |
|
"learning_rate": 4.9750758119588824e-06, |
|
"loss": 0.0015, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 817.0000610351562, |
|
"epoch": 0.02477763659466328, |
|
"grad_norm": 2.3490183353424072, |
|
"kl": 0.0888671875, |
|
"learning_rate": 4.972865836804349e-06, |
|
"loss": 0.0009, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 622.1428833007812, |
|
"epoch": 0.025412960609911054, |
|
"grad_norm": 40.009159088134766, |
|
"kl": 2.34375, |
|
"learning_rate": 4.970562594651254e-06, |
|
"loss": 0.0234, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.71429443359375, |
|
"epoch": 0.026048284625158832, |
|
"grad_norm": 3.2044177055358887, |
|
"kl": 0.12109375, |
|
"learning_rate": 4.968166182139026e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 480.4285888671875, |
|
"epoch": 0.026683608640406607, |
|
"grad_norm": 0.08217895030975342, |
|
"kl": 0.1484375, |
|
"learning_rate": 4.9656766998163306e-06, |
|
"loss": 0.0018, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.5714721679688, |
|
"epoch": 0.027318932655654382, |
|
"grad_norm": 3.2091763019561768, |
|
"kl": 0.0966796875, |
|
"learning_rate": 4.963094252136865e-06, |
|
"loss": 0.001, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 595.857177734375, |
|
"epoch": 0.02795425667090216, |
|
"grad_norm": 0.07719128578901291, |
|
"kl": 0.115234375, |
|
"learning_rate": 4.960418947454958e-06, |
|
"loss": 0.0014, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.0, |
|
"epoch": 0.028589580686149935, |
|
"grad_norm": 0.10212292522192001, |
|
"kl": 0.1298828125, |
|
"learning_rate": 4.957650898021038e-06, |
|
"loss": 0.0016, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.857177734375, |
|
"epoch": 0.029224904701397714, |
|
"grad_norm": 2.6977856159210205, |
|
"kl": 0.1337890625, |
|
"learning_rate": 4.954790219976915e-06, |
|
"loss": 0.0013, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.5714721679688, |
|
"epoch": 0.02986022871664549, |
|
"grad_norm": 0.09081219881772995, |
|
"kl": 0.10888671875, |
|
"learning_rate": 4.95183703335091e-06, |
|
"loss": 0.0014, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 375.5714416503906, |
|
"epoch": 0.030495552731893267, |
|
"grad_norm": 3.1593081951141357, |
|
"kl": 0.162109375, |
|
"learning_rate": 4.948791462052819e-06, |
|
"loss": 0.0016, |
|
"reward": 0.9428572654724121, |
|
"reward_std": 0.5740416646003723, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 605.5714721679688, |
|
"epoch": 0.031130876747141042, |
|
"grad_norm": 0.2238318771123886, |
|
"kl": 0.1396484375, |
|
"learning_rate": 4.945653633868716e-06, |
|
"loss": 0.0017, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.0, |
|
"epoch": 0.03176620076238882, |
|
"grad_norm": 3.9522361755371094, |
|
"kl": 0.1796875, |
|
"learning_rate": 4.942423680455584e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.0, |
|
"epoch": 0.032401524777636595, |
|
"grad_norm": 3.095000743865967, |
|
"kl": 0.15234375, |
|
"learning_rate": 4.939101737335802e-06, |
|
"loss": 0.0015, |
|
"reward": 0.11428572237491608, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.2857142984867096, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.2857142984867096, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.2857666015625, |
|
"epoch": 0.03303684879288437, |
|
"grad_norm": 2.880178451538086, |
|
"kl": 0.14453125, |
|
"learning_rate": 4.935687943891447e-06, |
|
"loss": 0.0015, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 570.4285888671875, |
|
"epoch": 0.033672172808132145, |
|
"grad_norm": 6.103281021118164, |
|
"kl": 0.421875, |
|
"learning_rate": 4.932182443358458e-06, |
|
"loss": 0.0042, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 461.857177734375, |
|
"epoch": 0.03430749682337993, |
|
"grad_norm": 5.561570644378662, |
|
"kl": 0.283203125, |
|
"learning_rate": 4.928585382820616e-06, |
|
"loss": 0.0028, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 710.0000610351562, |
|
"epoch": 0.0349428208386277, |
|
"grad_norm": 0.06716505438089371, |
|
"kl": 0.1875, |
|
"learning_rate": 4.924896913203376e-06, |
|
"loss": 0.0022, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 725.5714721679688, |
|
"epoch": 0.035578144853875476, |
|
"grad_norm": 4.731691837310791, |
|
"kl": 0.130859375, |
|
"learning_rate": 4.921117189267535e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3588235676288605, |
|
"reward_std": 0.43374723196029663, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.13025210797786713, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 665.2857666015625, |
|
"epoch": 0.03621346886912325, |
|
"grad_norm": 2.7444961071014404, |
|
"kl": 0.18359375, |
|
"learning_rate": 4.917246369602742e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4285714626312256, |
|
"reward_std": 0.4680252969264984, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 648.0, |
|
"epoch": 0.036848792884371026, |
|
"grad_norm": 3.6189987659454346, |
|
"kl": 0.20703125, |
|
"learning_rate": 4.9132846166208355e-06, |
|
"loss": 0.0021, |
|
"reward": 0.5142857432365417, |
|
"reward_std": 0.6309479475021362, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 483.14288330078125, |
|
"epoch": 0.03748411689961881, |
|
"grad_norm": 0.15579625964164734, |
|
"kl": 0.26953125, |
|
"learning_rate": 4.9092320965490365e-06, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.857177734375, |
|
"epoch": 0.03811944091486658, |
|
"grad_norm": 3.2243692874908447, |
|
"kl": 0.154296875, |
|
"learning_rate": 4.905088979422971e-06, |
|
"loss": 0.0015, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.0000305175781, |
|
"epoch": 0.03875476493011436, |
|
"grad_norm": 3.187040090560913, |
|
"kl": 0.2392578125, |
|
"learning_rate": 4.900855439079536e-06, |
|
"loss": 0.0024, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 619.7142944335938, |
|
"epoch": 0.03939008894536213, |
|
"grad_norm": 3.501086473464966, |
|
"kl": 0.205078125, |
|
"learning_rate": 4.8965316531496055e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.09759001433849335, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.0, |
|
"epoch": 0.040025412960609914, |
|
"grad_norm": 4.5344557762146, |
|
"kl": 0.1484375, |
|
"learning_rate": 4.892117803050578e-06, |
|
"loss": 0.0015, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.0000305175781, |
|
"epoch": 0.04066073697585769, |
|
"grad_norm": 6.242559432983398, |
|
"kl": 0.392578125, |
|
"learning_rate": 4.887614073978761e-06, |
|
"loss": 0.0039, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 757.1428833007812, |
|
"epoch": 0.041296060991105464, |
|
"grad_norm": 10.64748764038086, |
|
"kl": 0.53515625, |
|
"learning_rate": 4.883020654901609e-06, |
|
"loss": 0.0054, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.4285888671875, |
|
"epoch": 0.04193138500635324, |
|
"grad_norm": 3.1377880573272705, |
|
"kl": 0.2119140625, |
|
"learning_rate": 4.878337738549785e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.2857666015625, |
|
"epoch": 0.042566709021601014, |
|
"grad_norm": 0.17088104784488678, |
|
"kl": 0.2490234375, |
|
"learning_rate": 4.873565521409082e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 842.5714721679688, |
|
"epoch": 0.043202033036848796, |
|
"grad_norm": 3.577969551086426, |
|
"kl": 0.384765625, |
|
"learning_rate": 4.868704203712173e-06, |
|
"loss": 0.0038, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 841.0000610351562, |
|
"epoch": 0.04383735705209657, |
|
"grad_norm": 3.18711256980896, |
|
"kl": 0.2080078125, |
|
"learning_rate": 4.86375398943021e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.4285888671875, |
|
"epoch": 0.044472681067344345, |
|
"grad_norm": 2.3753271102905273, |
|
"kl": 0.177734375, |
|
"learning_rate": 4.858715086264274e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 406.8571472167969, |
|
"epoch": 0.04510800508259212, |
|
"grad_norm": 3.5183515548706055, |
|
"kl": 0.2138671875, |
|
"learning_rate": 4.853587705636646e-06, |
|
"loss": 0.0021, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.5714721679688, |
|
"epoch": 0.045743329097839895, |
|
"grad_norm": 2.6844465732574463, |
|
"kl": 0.103515625, |
|
"learning_rate": 4.84837206268195e-06, |
|
"loss": 0.001, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 464.14288330078125, |
|
"epoch": 0.04637865311308768, |
|
"grad_norm": 3.63313627243042, |
|
"kl": 0.1591796875, |
|
"learning_rate": 4.8430683762381195e-06, |
|
"loss": 0.0016, |
|
"reward": 0.2571428716182709, |
|
"reward_std": 0.19023796916007996, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.2857360839844, |
|
"epoch": 0.04701397712833545, |
|
"grad_norm": 2.707540988922119, |
|
"kl": 0.111328125, |
|
"learning_rate": 4.837676868837213e-06, |
|
"loss": 0.0011, |
|
"reward": 0.4023166298866272, |
|
"reward_std": 0.004252949263900518, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0023166025057435036, |
|
"rewards/format_reward": 1.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 558.857177734375, |
|
"epoch": 0.04764930114358323, |
|
"grad_norm": 3.0911412239074707, |
|
"kl": 0.1474609375, |
|
"learning_rate": 4.832197766696085e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.4285888671875, |
|
"epoch": 0.048284625158831, |
|
"grad_norm": 3.5635647773742676, |
|
"kl": 0.1357421875, |
|
"learning_rate": 4.826631299706887e-06, |
|
"loss": 0.0014, |
|
"reward": 0.5032258033752441, |
|
"reward_std": 0.2984292209148407, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.18894009292125702, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 382.2857360839844, |
|
"epoch": 0.04891994917407878, |
|
"grad_norm": 0.05187558755278587, |
|
"kl": 0.1611328125, |
|
"learning_rate": 4.820977701427424e-06, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 466.14288330078125, |
|
"epoch": 0.04955527318932656, |
|
"grad_norm": 0.04543861746788025, |
|
"kl": 0.130859375, |
|
"learning_rate": 4.81523720907136e-06, |
|
"loss": 0.0016, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 551.1428833007812, |
|
"epoch": 0.05019059720457433, |
|
"grad_norm": 0.45131492614746094, |
|
"kl": 0.28515625, |
|
"learning_rate": 4.809410063498254e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 379.5714416503906, |
|
"epoch": 0.05082592121982211, |
|
"grad_norm": 3.854992151260376, |
|
"kl": 0.1591796875, |
|
"learning_rate": 4.8034965092034656e-06, |
|
"loss": 0.0016, |
|
"reward": 1.2000001668930054, |
|
"reward_std": 0.3829708993434906, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.8571429252624512, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.857177734375, |
|
"epoch": 0.05146124523506988, |
|
"grad_norm": 41.30952453613281, |
|
"kl": 1.9921875, |
|
"learning_rate": 4.797496794307889e-06, |
|
"loss": 0.0199, |
|
"reward": 0.2489795982837677, |
|
"reward_std": 0.23831285536289215, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.020408164709806442, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.71429443359375, |
|
"epoch": 0.052096569250317665, |
|
"grad_norm": 4.601639270782471, |
|
"kl": 0.2412109375, |
|
"learning_rate": 4.791411170547545e-06, |
|
"loss": 0.0024, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.5714721679688, |
|
"epoch": 0.05273189326556544, |
|
"grad_norm": 4.209871768951416, |
|
"kl": 0.396484375, |
|
"learning_rate": 4.785239893263017e-06, |
|
"loss": 0.004, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 410.4285888671875, |
|
"epoch": 0.053367217280813214, |
|
"grad_norm": 4.049778938293457, |
|
"kl": 0.171875, |
|
"learning_rate": 4.778983221388742e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 421.71429443359375, |
|
"epoch": 0.05400254129606099, |
|
"grad_norm": 5.517159461975098, |
|
"kl": 0.318359375, |
|
"learning_rate": 4.77264141744214e-06, |
|
"loss": 0.0032, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 405.8571472167969, |
|
"epoch": 0.054637865311308764, |
|
"grad_norm": 1.0127395391464233, |
|
"kl": 0.37109375, |
|
"learning_rate": 4.766214747512603e-06, |
|
"loss": 0.004, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.2857360839844, |
|
"epoch": 0.055273189326556546, |
|
"grad_norm": 5.1016316413879395, |
|
"kl": 0.625, |
|
"learning_rate": 4.759703481250331e-06, |
|
"loss": 0.0062, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 415.5714416503906, |
|
"epoch": 0.05590851334180432, |
|
"grad_norm": 3205.96337890625, |
|
"kl": 94.0, |
|
"learning_rate": 4.753107891855015e-06, |
|
"loss": 0.9386, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 162.0, |
|
"epoch": 0.056543837357052096, |
|
"grad_norm": 5.175131797790527, |
|
"kl": 0.349609375, |
|
"learning_rate": 4.746428256064375e-06, |
|
"loss": 0.0035, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.2857208251953, |
|
"epoch": 0.05717916137229987, |
|
"grad_norm": 4.350471496582031, |
|
"kl": 0.251953125, |
|
"learning_rate": 4.7396648541425534e-06, |
|
"loss": 0.0025, |
|
"reward": 0.9142858386039734, |
|
"reward_std": 0.6202918887138367, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 515.7142944335938, |
|
"epoch": 0.05781448538754765, |
|
"grad_norm": 3.159823417663574, |
|
"kl": 0.1904296875, |
|
"learning_rate": 4.732817969868348e-06, |
|
"loss": 0.0019, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 504.5714416503906, |
|
"epoch": 0.05844980940279543, |
|
"grad_norm": 3.7896342277526855, |
|
"kl": 0.2353515625, |
|
"learning_rate": 4.7258878905233095e-06, |
|
"loss": 0.0023, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 355.5714416503906, |
|
"epoch": 0.0590851334180432, |
|
"grad_norm": 2.87099289894104, |
|
"kl": 0.1962890625, |
|
"learning_rate": 4.718874906879688e-06, |
|
"loss": 0.002, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.0, |
|
"epoch": 0.05972045743329098, |
|
"grad_norm": 3.082097053527832, |
|
"kl": 0.220703125, |
|
"learning_rate": 4.711779313188231e-06, |
|
"loss": 0.0022, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 430.0000305175781, |
|
"epoch": 0.06035578144853875, |
|
"grad_norm": 0.06256424635648727, |
|
"kl": 0.2138671875, |
|
"learning_rate": 4.70460140716584e-06, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 695.1428833007812, |
|
"epoch": 0.060991105463786534, |
|
"grad_norm": 3.0291473865509033, |
|
"kl": 0.412109375, |
|
"learning_rate": 4.697341489983076e-06, |
|
"loss": 0.0041, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.4285888671875, |
|
"epoch": 0.06162642947903431, |
|
"grad_norm": 3.461120843887329, |
|
"kl": 0.291015625, |
|
"learning_rate": 4.6899998662515215e-06, |
|
"loss": 0.0029, |
|
"reward": 0.4714285731315613, |
|
"reward_std": 0.06681530177593231, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0714285746216774, |
|
"rewards/format_reward": 1.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 626.4285888671875, |
|
"epoch": 0.062261753494282084, |
|
"grad_norm": 0.07474807649850845, |
|
"kl": 0.3203125, |
|
"learning_rate": 4.682576844011007e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.2857666015625, |
|
"epoch": 0.06289707750952986, |
|
"grad_norm": 0.3679976463317871, |
|
"kl": 0.28515625, |
|
"learning_rate": 4.675072734716678e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 627.1428833007812, |
|
"epoch": 0.06353240152477764, |
|
"grad_norm": 39.379425048828125, |
|
"kl": 1.5390625, |
|
"learning_rate": 4.667487853225931e-06, |
|
"loss": 0.0153, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983949184417725, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 636.0, |
|
"epoch": 0.06416772554002541, |
|
"grad_norm": 0.5123635530471802, |
|
"kl": 0.1962890625, |
|
"learning_rate": 4.659822517785203e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.7142944335938, |
|
"epoch": 0.06480304955527319, |
|
"grad_norm": 2.7022430896759033, |
|
"kl": 0.2314453125, |
|
"learning_rate": 4.6520770500166165e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 338.4285888671875, |
|
"epoch": 0.06543837357052097, |
|
"grad_norm": 3.6665234565734863, |
|
"kl": 0.294921875, |
|
"learning_rate": 4.644251774904487e-06, |
|
"loss": 0.0029, |
|
"reward": 1.057142972946167, |
|
"reward_std": 0.5968170166015625, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 563.5714721679688, |
|
"epoch": 0.06607369758576874, |
|
"grad_norm": 0.09322147816419601, |
|
"kl": 0.23828125, |
|
"learning_rate": 4.636347020781684e-06, |
|
"loss": 0.0027, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 734.1428833007812, |
|
"epoch": 0.06670902160101652, |
|
"grad_norm": 2.2183244228363037, |
|
"kl": 0.177734375, |
|
"learning_rate": 4.6283631193158605e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.4285888671875, |
|
"epoch": 0.06734434561626429, |
|
"grad_norm": 3.992103338241577, |
|
"kl": 0.345703125, |
|
"learning_rate": 4.620300405495532e-06, |
|
"loss": 0.0035, |
|
"reward": 0.2571428716182709, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.4285714626312256, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.4285888671875, |
|
"epoch": 0.06797966963151207, |
|
"grad_norm": 4.281430244445801, |
|
"kl": 0.26953125, |
|
"learning_rate": 4.612159217616022e-06, |
|
"loss": 0.0027, |
|
"reward": 0.2571428716182709, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.4285714626312256, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 463.2857360839844, |
|
"epoch": 0.06861499364675985, |
|
"grad_norm": 3.0798134803771973, |
|
"kl": 0.2158203125, |
|
"learning_rate": 4.603939897265268e-06, |
|
"loss": 0.0022, |
|
"reward": 0.5067504644393921, |
|
"reward_std": 0.28243502974510193, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.10675039887428284, |
|
"rewards/format_reward": 1.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.7142944335938, |
|
"epoch": 0.06925031766200762, |
|
"grad_norm": 3.179516077041626, |
|
"kl": 0.236328125, |
|
"learning_rate": 4.595642789309492e-06, |
|
"loss": 0.0024, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 673.1428833007812, |
|
"epoch": 0.0698856416772554, |
|
"grad_norm": 2.751520872116089, |
|
"kl": 0.228515625, |
|
"learning_rate": 4.587268241878724e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 506.71429443359375, |
|
"epoch": 0.07052096569250317, |
|
"grad_norm": 0.06256023049354553, |
|
"kl": 0.2490234375, |
|
"learning_rate": 4.578816606352205e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 705.5714721679688, |
|
"epoch": 0.07115628970775095, |
|
"grad_norm": 0.08651240170001984, |
|
"kl": 0.1826171875, |
|
"learning_rate": 4.570288237343632e-06, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 667.2857666015625, |
|
"epoch": 0.07179161372299873, |
|
"grad_norm": 1.87228262424469, |
|
"kl": 0.1552734375, |
|
"learning_rate": 4.561683492686289e-06, |
|
"loss": 0.0016, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.335232675075531, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 511.857177734375, |
|
"epoch": 0.0724269377382465, |
|
"grad_norm": 0.06886231899261475, |
|
"kl": 0.21875, |
|
"learning_rate": 4.5530027334180285e-06, |
|
"loss": 0.0025, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 746.857177734375, |
|
"epoch": 0.07306226175349428, |
|
"grad_norm": 2.801109552383423, |
|
"kl": 0.2060546875, |
|
"learning_rate": 4.544246323766122e-06, |
|
"loss": 0.0021, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 639.5714721679688, |
|
"epoch": 0.07369758576874205, |
|
"grad_norm": 2.9071028232574463, |
|
"kl": 0.2373046875, |
|
"learning_rate": 4.535414631131983e-06, |
|
"loss": 0.0024, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 760.2857666015625, |
|
"epoch": 0.07433290978398983, |
|
"grad_norm": 2.4393320083618164, |
|
"kl": 0.201171875, |
|
"learning_rate": 4.526508026075746e-06, |
|
"loss": 0.002, |
|
"reward": 0.2571428716182709, |
|
"reward_std": 0.19023796916007996, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 729.7142944335938, |
|
"epoch": 0.07496823379923762, |
|
"grad_norm": 7.918629169464111, |
|
"kl": 0.671875, |
|
"learning_rate": 4.517526882300721e-06, |
|
"loss": 0.0067, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 745.1428833007812, |
|
"epoch": 0.07560355781448538, |
|
"grad_norm": 2.229508876800537, |
|
"kl": 0.185546875, |
|
"learning_rate": 4.508471576637713e-06, |
|
"loss": 0.0019, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.4285888671875, |
|
"epoch": 0.07623888182973317, |
|
"grad_norm": 2.4816155433654785, |
|
"kl": 0.1845703125, |
|
"learning_rate": 4.499342489029211e-06, |
|
"loss": 0.0018, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.857177734375, |
|
"epoch": 0.07687420584498093, |
|
"grad_norm": 3.3153207302093506, |
|
"kl": 0.2490234375, |
|
"learning_rate": 4.490140002513449e-06, |
|
"loss": 0.0025, |
|
"reward": 0.43708792328834534, |
|
"reward_std": 0.0981253907084465, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.03708791360259056, |
|
"rewards/format_reward": 1.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 776.1428833007812, |
|
"epoch": 0.07750952986022872, |
|
"grad_norm": 2.2401111125946045, |
|
"kl": 0.1484375, |
|
"learning_rate": 4.48086450320833e-06, |
|
"loss": 0.0015, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 764.7142944335938, |
|
"epoch": 0.0781448538754765, |
|
"grad_norm": 2.0424013137817383, |
|
"kl": 0.1806640625, |
|
"learning_rate": 4.4715163802952266e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 543.7142944335938, |
|
"epoch": 0.07878017789072427, |
|
"grad_norm": 0.1234215497970581, |
|
"kl": 0.25390625, |
|
"learning_rate": 4.462096026002655e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 741.5714721679688, |
|
"epoch": 0.07941550190597205, |
|
"grad_norm": 2.020581007003784, |
|
"kl": 0.1884765625, |
|
"learning_rate": 4.4526038355898144e-06, |
|
"loss": 0.0019, |
|
"reward": 0.36326533555984497, |
|
"reward_std": 0.16554531455039978, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.020408164709806442, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.857177734375, |
|
"epoch": 0.08005082592121983, |
|
"grad_norm": 2.3378539085388184, |
|
"kl": 0.2373046875, |
|
"learning_rate": 4.4430402073300035e-06, |
|
"loss": 0.0024, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.5714721679688, |
|
"epoch": 0.0806861499364676, |
|
"grad_norm": 0.06773251295089722, |
|
"kl": 0.234375, |
|
"learning_rate": 4.433405542493909e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 703.1428833007812, |
|
"epoch": 0.08132147395171538, |
|
"grad_norm": 3.0085108280181885, |
|
"kl": 0.1630859375, |
|
"learning_rate": 4.4237002453327734e-06, |
|
"loss": 0.0016, |
|
"reward": 0.41020408272743225, |
|
"reward_std": 0.026997461915016174, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.010204082354903221, |
|
"rewards/format_reward": 1.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 363.2857360839844, |
|
"epoch": 0.08195679796696315, |
|
"grad_norm": 3.266486644744873, |
|
"kl": 0.171875, |
|
"learning_rate": 4.4139247230614245e-06, |
|
"loss": 0.0017, |
|
"reward": 0.4520833492279053, |
|
"reward_std": 0.12870661914348602, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0520833320915699, |
|
"rewards/format_reward": 1.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 395.2857360839844, |
|
"epoch": 0.08259212198221093, |
|
"grad_norm": 3.562208414077759, |
|
"kl": 0.322265625, |
|
"learning_rate": 4.404079385841201e-06, |
|
"loss": 0.0032, |
|
"reward": 0.4571428894996643, |
|
"reward_std": 0.2507132887840271, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 367.0000305175781, |
|
"epoch": 0.08322744599745871, |
|
"grad_norm": 0.06771758943796158, |
|
"kl": 0.244140625, |
|
"learning_rate": 4.394164646762734e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.857177734375, |
|
"epoch": 0.08386277001270648, |
|
"grad_norm": 2.9015965461730957, |
|
"kl": 0.220703125, |
|
"learning_rate": 4.384180921828618e-06, |
|
"loss": 0.0022, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.2857666015625, |
|
"epoch": 0.08449809402795426, |
|
"grad_norm": 3.0210537910461426, |
|
"kl": 0.1689453125, |
|
"learning_rate": 4.374128629935955e-06, |
|
"loss": 0.0017, |
|
"reward": 0.9833123087882996, |
|
"reward_std": 0.4177902638912201, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5833122730255127, |
|
"rewards/format_reward": 1.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 358.5714416503906, |
|
"epoch": 0.08513341804320203, |
|
"grad_norm": 2.964980363845825, |
|
"kl": 0.1962890625, |
|
"learning_rate": 4.364008192858781e-06, |
|
"loss": 0.002, |
|
"reward": 0.4098522365093231, |
|
"reward_std": 0.016825860366225243, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.009852217510342598, |
|
"rewards/format_reward": 1.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 473.5714416503906, |
|
"epoch": 0.08576874205844981, |
|
"grad_norm": 2.856916666030884, |
|
"kl": 0.2119140625, |
|
"learning_rate": 4.353820035230366e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.2857360839844, |
|
"epoch": 0.08640406607369759, |
|
"grad_norm": 3.613862991333008, |
|
"kl": 0.169921875, |
|
"learning_rate": 4.3435645845254e-06, |
|
"loss": 0.0017, |
|
"reward": 0.6571429371833801, |
|
"reward_std": 0.5126960277557373, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 451.4285888671875, |
|
"epoch": 0.08703939008894536, |
|
"grad_norm": 0.05980609729886055, |
|
"kl": 0.177734375, |
|
"learning_rate": 4.333242271042054e-06, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 515.7142944335938, |
|
"epoch": 0.08767471410419314, |
|
"grad_norm": 3.2973406314849854, |
|
"kl": 0.248046875, |
|
"learning_rate": 4.32285352788393e-06, |
|
"loss": 0.0025, |
|
"reward": 0.5261905193328857, |
|
"reward_std": 0.28607451915740967, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1547619104385376, |
|
"rewards/format_reward": 1.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 384.14288330078125, |
|
"epoch": 0.08831003811944091, |
|
"grad_norm": 2.6424343585968018, |
|
"kl": 0.31640625, |
|
"learning_rate": 4.312398790941882e-06, |
|
"loss": 0.0032, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 326.8571472167969, |
|
"epoch": 0.08894536213468869, |
|
"grad_norm": 3.2031986713409424, |
|
"kl": 0.1884765625, |
|
"learning_rate": 4.301878498875735e-06, |
|
"loss": 0.0019, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 366.71429443359375, |
|
"epoch": 0.08958068614993647, |
|
"grad_norm": 0.29011043906211853, |
|
"kl": 0.232421875, |
|
"learning_rate": 4.291293093095873e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 353.5714416503906, |
|
"epoch": 0.09021601016518424, |
|
"grad_norm": 0.05228522792458534, |
|
"kl": 0.1728515625, |
|
"learning_rate": 4.280643017744723e-06, |
|
"loss": 0.002, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 571.857177734375, |
|
"epoch": 0.09085133418043202, |
|
"grad_norm": 4.668937683105469, |
|
"kl": 0.255859375, |
|
"learning_rate": 4.269928719678117e-06, |
|
"loss": 0.0026, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 441.0000305175781, |
|
"epoch": 0.09148665819567979, |
|
"grad_norm": 3.7214486598968506, |
|
"kl": 0.42578125, |
|
"learning_rate": 4.2591506484465426e-06, |
|
"loss": 0.0042, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.09759001433849335, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 493.14288330078125, |
|
"epoch": 0.09212198221092757, |
|
"grad_norm": 0.11755923926830292, |
|
"kl": 0.28515625, |
|
"learning_rate": 4.248309256276283e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 365.4285888671875, |
|
"epoch": 0.09275730622617535, |
|
"grad_norm": 2.491922378540039, |
|
"kl": 0.2099609375, |
|
"learning_rate": 4.23740499805044e-06, |
|
"loss": 0.0021, |
|
"reward": 0.38035714626312256, |
|
"reward_std": 0.0817723423242569, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.008928571827709675, |
|
"rewards/format_reward": 1.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 363.4285888671875, |
|
"epoch": 0.09339263024142312, |
|
"grad_norm": 3.340977191925049, |
|
"kl": 0.185546875, |
|
"learning_rate": 4.22643833128985e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 244.57144165039062, |
|
"epoch": 0.0940279542566709, |
|
"grad_norm": 0.05019477382302284, |
|
"kl": 0.2255859375, |
|
"learning_rate": 4.215409716133885e-06, |
|
"loss": 0.0035, |
|
"reward": 1.4000002145767212, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 1.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 372.2857360839844, |
|
"epoch": 0.09466327827191867, |
|
"grad_norm": 3.1343963146209717, |
|
"kl": 0.34375, |
|
"learning_rate": 4.204319615321151e-06, |
|
"loss": 0.0034, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 355.5714416503906, |
|
"epoch": 0.09529860228716645, |
|
"grad_norm": 3.144193172454834, |
|
"kl": 0.35546875, |
|
"learning_rate": 4.193168494170065e-06, |
|
"loss": 0.0035, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.857177734375, |
|
"epoch": 0.09593392630241424, |
|
"grad_norm": 3.4582438468933105, |
|
"kl": 0.27734375, |
|
"learning_rate": 4.181956820559339e-06, |
|
"loss": 0.0028, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 556.857177734375, |
|
"epoch": 0.096569250317662, |
|
"grad_norm": 2.9669277667999268, |
|
"kl": 0.259765625, |
|
"learning_rate": 4.170685064908342e-06, |
|
"loss": 0.0026, |
|
"reward": 0.4870130121707916, |
|
"reward_std": 0.23021474480628967, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.08701299130916595, |
|
"rewards/format_reward": 1.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.1428833007812, |
|
"epoch": 0.09720457433290978, |
|
"grad_norm": 0.06397932022809982, |
|
"kl": 0.265625, |
|
"learning_rate": 4.159353700157365e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 382.8571472167969, |
|
"epoch": 0.09783989834815757, |
|
"grad_norm": 0.12080562859773636, |
|
"kl": 0.369140625, |
|
"learning_rate": 4.14796320174778e-06, |
|
"loss": 0.004, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.7142944335938, |
|
"epoch": 0.09847522236340533, |
|
"grad_norm": 0.06819948554039001, |
|
"kl": 0.298828125, |
|
"learning_rate": 4.136514047602087e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 347.14288330078125, |
|
"epoch": 0.09911054637865312, |
|
"grad_norm": 0.07698381692171097, |
|
"kl": 0.3984375, |
|
"learning_rate": 4.1250067181038635e-06, |
|
"loss": 0.0043, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 692.1428833007812, |
|
"epoch": 0.09974587039390088, |
|
"grad_norm": 0.17737269401550293, |
|
"kl": 0.28125, |
|
"learning_rate": 4.113441696077608e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 341.5714416503906, |
|
"epoch": 0.10038119440914867, |
|
"grad_norm": 4.598702907562256, |
|
"kl": 0.4296875, |
|
"learning_rate": 4.101819466768484e-06, |
|
"loss": 0.0043, |
|
"reward": 0.8134453892707825, |
|
"reward_std": 0.18902148306369781, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4134454131126404, |
|
"rewards/format_reward": 1.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 455.2857360839844, |
|
"epoch": 0.10101651842439645, |
|
"grad_norm": 0.07675225287675858, |
|
"kl": 0.302734375, |
|
"learning_rate": 4.0901405178219535e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 752.2857666015625, |
|
"epoch": 0.10165184243964422, |
|
"grad_norm": 1.9616703987121582, |
|
"kl": 0.2080078125, |
|
"learning_rate": 4.078405339263326e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 349.8571472167969, |
|
"epoch": 0.102287166454892, |
|
"grad_norm": 3.1100926399230957, |
|
"kl": 0.2451171875, |
|
"learning_rate": 4.06661442347719e-06, |
|
"loss": 0.0025, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.1428833007812, |
|
"epoch": 0.10292249047013977, |
|
"grad_norm": 2.4743812084198, |
|
"kl": 0.296875, |
|
"learning_rate": 4.054768265186758e-06, |
|
"loss": 0.003, |
|
"reward": 0.4129870533943176, |
|
"reward_std": 0.034360405057668686, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.012987013906240463, |
|
"rewards/format_reward": 1.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 359.4285888671875, |
|
"epoch": 0.10355781448538755, |
|
"grad_norm": 3.092137336730957, |
|
"kl": 0.3046875, |
|
"learning_rate": 4.0428673614331036e-06, |
|
"loss": 0.0031, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.5714416503906, |
|
"epoch": 0.10419313850063533, |
|
"grad_norm": 1.3710603713989258, |
|
"kl": 0.302734375, |
|
"learning_rate": 4.030912211554316e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 477.5714416503906, |
|
"epoch": 0.1048284625158831, |
|
"grad_norm": 2.647684097290039, |
|
"kl": 0.1826171875, |
|
"learning_rate": 4.018903317164539e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 545.857177734375, |
|
"epoch": 0.10546378653113088, |
|
"grad_norm": 0.07831098884344101, |
|
"kl": 0.1650390625, |
|
"learning_rate": 4.006841182132932e-06, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.857177734375, |
|
"epoch": 0.10609911054637865, |
|
"grad_norm": 0.6324580311775208, |
|
"kl": 0.296875, |
|
"learning_rate": 3.9947263125625195e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 451.5714416503906, |
|
"epoch": 0.10673443456162643, |
|
"grad_norm": 2.877990961074829, |
|
"kl": 0.166015625, |
|
"learning_rate": 3.982559216768967e-06, |
|
"loss": 0.0017, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 527.2857666015625, |
|
"epoch": 0.10736975857687421, |
|
"grad_norm": 0.31927597522735596, |
|
"kl": 0.310546875, |
|
"learning_rate": 3.970340405259245e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 545.7142944335938, |
|
"epoch": 0.10800508259212198, |
|
"grad_norm": 2.99507474899292, |
|
"kl": 0.28125, |
|
"learning_rate": 3.958070390710214e-06, |
|
"loss": 0.0028, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 430.71429443359375, |
|
"epoch": 0.10864040660736976, |
|
"grad_norm": 3.1845176219940186, |
|
"kl": 0.302734375, |
|
"learning_rate": 3.945749687947109e-06, |
|
"loss": 0.003, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.4285888671875, |
|
"epoch": 0.10927573062261753, |
|
"grad_norm": 2.674358606338501, |
|
"kl": 0.375, |
|
"learning_rate": 3.933378813921942e-06, |
|
"loss": 0.0037, |
|
"reward": 0.46666669845581055, |
|
"reward_std": 0.11547007411718369, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.06666667759418488, |
|
"rewards/format_reward": 1.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 722.1428833007812, |
|
"epoch": 0.10991105463786531, |
|
"grad_norm": 1.7992055416107178, |
|
"kl": 0.205078125, |
|
"learning_rate": 3.920958287691811e-06, |
|
"loss": 0.002, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.71429443359375, |
|
"epoch": 0.11054637865311309, |
|
"grad_norm": 0.03782209753990173, |
|
"kl": 0.1591796875, |
|
"learning_rate": 3.908488630397121e-06, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.14288330078125, |
|
"epoch": 0.11118170266836086, |
|
"grad_norm": 2.619063138961792, |
|
"kl": 0.1796875, |
|
"learning_rate": 3.8959703652397175e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 705.5714721679688, |
|
"epoch": 0.11181702668360864, |
|
"grad_norm": 2.71985125541687, |
|
"kl": 0.267578125, |
|
"learning_rate": 3.883404017460935e-06, |
|
"loss": 0.0027, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.71429443359375, |
|
"epoch": 0.11245235069885642, |
|
"grad_norm": 3.090186595916748, |
|
"kl": 0.2109375, |
|
"learning_rate": 3.870790114319559e-06, |
|
"loss": 0.0021, |
|
"reward": 0.6571429371833801, |
|
"reward_std": 0.5126960277557373, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.71429443359375, |
|
"epoch": 0.11308767471410419, |
|
"grad_norm": 0.13239029049873352, |
|
"kl": 0.287109375, |
|
"learning_rate": 3.858129185069701e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 671.5714721679688, |
|
"epoch": 0.11372299872935197, |
|
"grad_norm": 2.1125059127807617, |
|
"kl": 0.27734375, |
|
"learning_rate": 3.845421760938597e-06, |
|
"loss": 0.0028, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.4285888671875, |
|
"epoch": 0.11435832274459974, |
|
"grad_norm": 8.883991241455078, |
|
"kl": 0.9453125, |
|
"learning_rate": 3.832668375104312e-06, |
|
"loss": 0.0095, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.0, |
|
"epoch": 0.11499364675984752, |
|
"grad_norm": 12.942805290222168, |
|
"kl": 1.375, |
|
"learning_rate": 3.8198695626733725e-06, |
|
"loss": 0.0141, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 667.2857666015625, |
|
"epoch": 0.1156289707750953, |
|
"grad_norm": 2.5961642265319824, |
|
"kl": 0.23828125, |
|
"learning_rate": 3.8070258606583156e-06, |
|
"loss": 0.0024, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.10690449923276901, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 587.2857666015625, |
|
"epoch": 0.11626429479034307, |
|
"grad_norm": 3.12656831741333, |
|
"kl": 0.251953125, |
|
"learning_rate": 3.7941378079551544e-06, |
|
"loss": 0.0025, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 328.71429443359375, |
|
"epoch": 0.11689961880559085, |
|
"grad_norm": 2.924964427947998, |
|
"kl": 0.25, |
|
"learning_rate": 3.7812059453207677e-06, |
|
"loss": 0.0025, |
|
"reward": 0.6285714507102966, |
|
"reward_std": 0.5468525290489197, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 486.2857360839844, |
|
"epoch": 0.11753494282083862, |
|
"grad_norm": 3.5567517280578613, |
|
"kl": 0.296875, |
|
"learning_rate": 3.768230815350213e-06, |
|
"loss": 0.003, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.1428833007812, |
|
"epoch": 0.1181702668360864, |
|
"grad_norm": 2.51031494140625, |
|
"kl": 0.30859375, |
|
"learning_rate": 3.7552129624539557e-06, |
|
"loss": 0.0031, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.5714721679688, |
|
"epoch": 0.11880559085133419, |
|
"grad_norm": 2.589846611022949, |
|
"kl": 0.26171875, |
|
"learning_rate": 3.7421529328350316e-06, |
|
"loss": 0.0026, |
|
"reward": 0.5284404158592224, |
|
"reward_std": 0.19612440466880798, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.12844036519527435, |
|
"rewards/format_reward": 1.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 595.857177734375, |
|
"epoch": 0.11944091486658195, |
|
"grad_norm": 2.4991862773895264, |
|
"kl": 0.27734375, |
|
"learning_rate": 3.7290512744661274e-06, |
|
"loss": 0.0028, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.0, |
|
"epoch": 0.12007623888182974, |
|
"grad_norm": 2.0362820625305176, |
|
"kl": 0.1845703125, |
|
"learning_rate": 3.715908537066589e-06, |
|
"loss": 0.0019, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983949184417725, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.5714721679688, |
|
"epoch": 0.1207115628970775, |
|
"grad_norm": 2.4349231719970703, |
|
"kl": 0.25, |
|
"learning_rate": 3.7027252720793538e-06, |
|
"loss": 0.0025, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 453.2857360839844, |
|
"epoch": 0.12134688691232529, |
|
"grad_norm": 3.212773323059082, |
|
"kl": 0.1796875, |
|
"learning_rate": 3.689502032647817e-06, |
|
"loss": 0.0018, |
|
"reward": 0.36014559864997864, |
|
"reward_std": 0.1179879903793335, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.017288444563746452, |
|
"rewards/format_reward": 1.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 671.1428833007812, |
|
"epoch": 0.12198221092757307, |
|
"grad_norm": 0.08436475694179535, |
|
"kl": 0.1767578125, |
|
"learning_rate": 3.6762393735926245e-06, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.2857666015625, |
|
"epoch": 0.12261753494282084, |
|
"grad_norm": 2.7841696739196777, |
|
"kl": 0.146484375, |
|
"learning_rate": 3.6629378513883852e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 545.2857666015625, |
|
"epoch": 0.12325285895806862, |
|
"grad_norm": 3.3902151584625244, |
|
"kl": 0.205078125, |
|
"learning_rate": 3.6495980241403307e-06, |
|
"loss": 0.002, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.1428833007812, |
|
"epoch": 0.12388818297331639, |
|
"grad_norm": 2.5586631298065186, |
|
"kl": 0.197265625, |
|
"learning_rate": 3.636220451560896e-06, |
|
"loss": 0.002, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.7142944335938, |
|
"epoch": 0.12452350698856417, |
|
"grad_norm": 0.08236116170883179, |
|
"kl": 0.3203125, |
|
"learning_rate": 3.622805694946235e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 780.7142944335938, |
|
"epoch": 0.12515883100381195, |
|
"grad_norm": 0.9724776148796082, |
|
"kl": 0.390625, |
|
"learning_rate": 3.609354317152667e-06, |
|
"loss": 0.0042, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 921.5714721679688, |
|
"epoch": 0.12579415501905972, |
|
"grad_norm": 2.485982894897461, |
|
"kl": 0.263671875, |
|
"learning_rate": 3.595866882573063e-06, |
|
"loss": 0.0026, |
|
"reward": 0.20000001788139343, |
|
"reward_std": 0.20000001788139343, |
|
"rewards/code_format_reward": 0.4285714626312256, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 850.2857666015625, |
|
"epoch": 0.12642947903430748, |
|
"grad_norm": 2.52078914642334, |
|
"kl": 0.451171875, |
|
"learning_rate": 3.5823439571131675e-06, |
|
"loss": 0.0045, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 924.2857666015625, |
|
"epoch": 0.12706480304955528, |
|
"grad_norm": 2.445126533508301, |
|
"kl": 0.20703125, |
|
"learning_rate": 3.5687861081678477e-06, |
|
"loss": 0.0021, |
|
"reward": 0.22857144474983215, |
|
"reward_std": 0.21380899846553802, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.5714285969734192, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 764.1428833007812, |
|
"epoch": 0.12770012706480305, |
|
"grad_norm": 2.1626222133636475, |
|
"kl": 0.3359375, |
|
"learning_rate": 3.555193904597291e-06, |
|
"loss": 0.0034, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 798.2857666015625, |
|
"epoch": 0.12833545108005082, |
|
"grad_norm": 2.010795831680298, |
|
"kl": 0.275390625, |
|
"learning_rate": 3.541567916703138e-06, |
|
"loss": 0.0027, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983949184417725, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 675.5714721679688, |
|
"epoch": 0.1289707750952986, |
|
"grad_norm": 2.5785133838653564, |
|
"kl": 0.2431640625, |
|
"learning_rate": 3.5279087162045517e-06, |
|
"loss": 0.0024, |
|
"reward": 0.6285714507102966, |
|
"reward_std": 0.5468525290489197, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.4285888671875, |
|
"epoch": 0.12960609911054638, |
|
"grad_norm": 0.14385801553726196, |
|
"kl": 0.166015625, |
|
"learning_rate": 3.5142168762142265e-06, |
|
"loss": 0.002, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 673.5714721679688, |
|
"epoch": 0.13024142312579415, |
|
"grad_norm": 29.659584045410156, |
|
"kl": 2.109375, |
|
"learning_rate": 3.500492971214347e-06, |
|
"loss": 0.0212, |
|
"reward": 0.40446433424949646, |
|
"reward_std": 0.00819578766822815, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.004464285913854837, |
|
"rewards/format_reward": 1.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 724.857177734375, |
|
"epoch": 0.13087674714104194, |
|
"grad_norm": 3.008819580078125, |
|
"kl": 0.29296875, |
|
"learning_rate": 3.48673757703248e-06, |
|
"loss": 0.0029, |
|
"reward": 0.4012531638145447, |
|
"reward_std": 0.003315483685582876, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0012531329412013292, |
|
"rewards/format_reward": 1.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 670.0, |
|
"epoch": 0.1315120711562897, |
|
"grad_norm": 2.4119279384613037, |
|
"kl": 0.1904296875, |
|
"learning_rate": 3.472951270817418e-06, |
|
"loss": 0.0019, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 723.5714721679688, |
|
"epoch": 0.13214739517153748, |
|
"grad_norm": 0.09062952548265457, |
|
"kl": 0.26171875, |
|
"learning_rate": 3.4591346310149578e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 885.5714721679688, |
|
"epoch": 0.13278271918678525, |
|
"grad_norm": 2.048800468444824, |
|
"kl": 0.2109375, |
|
"learning_rate": 3.445288237343632e-06, |
|
"loss": 0.0021, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1864454597234726, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.02857143059372902, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 768.0000610351562, |
|
"epoch": 0.13341804320203304, |
|
"grad_norm": 0.07965610176324844, |
|
"kl": 0.279296875, |
|
"learning_rate": 3.4314126707703895e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 668.7142944335938, |
|
"epoch": 0.1340533672172808, |
|
"grad_norm": 2.6163687705993652, |
|
"kl": 0.259765625, |
|
"learning_rate": 3.4175085134862128e-06, |
|
"loss": 0.0026, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 846.5714721679688, |
|
"epoch": 0.13468869123252858, |
|
"grad_norm": 2.003486394882202, |
|
"kl": 0.1591796875, |
|
"learning_rate": 3.4035763488816953e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 518.5714721679688, |
|
"epoch": 0.13532401524777637, |
|
"grad_norm": 2.321255683898926, |
|
"kl": 0.1865234375, |
|
"learning_rate": 3.3896167615225594e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 736.4285888671875, |
|
"epoch": 0.13595933926302414, |
|
"grad_norm": 0.06602618098258972, |
|
"kl": 0.25, |
|
"learning_rate": 3.375630337125133e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 668.2857666015625, |
|
"epoch": 0.1365946632782719, |
|
"grad_norm": 2.6393556594848633, |
|
"kl": 0.18359375, |
|
"learning_rate": 3.361617662531772e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5445378422737122, |
|
"reward_std": 0.2468453347682953, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.14453783631324768, |
|
"rewards/format_reward": 1.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.5714721679688, |
|
"epoch": 0.1372299872935197, |
|
"grad_norm": 0.06534316390752792, |
|
"kl": 0.1962890625, |
|
"learning_rate": 3.347579325686237e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 728.4285888671875, |
|
"epoch": 0.13786531130876747, |
|
"grad_norm": 2.1525208950042725, |
|
"kl": 0.162109375, |
|
"learning_rate": 3.333515915609027e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 695.5714721679688, |
|
"epoch": 0.13850063532401524, |
|
"grad_norm": 2.821530342102051, |
|
"kl": 0.271484375, |
|
"learning_rate": 3.3194280223726616e-06, |
|
"loss": 0.0027, |
|
"reward": 0.41904765367507935, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0476190522313118, |
|
"rewards/format_reward": 1.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 558.4285888671875, |
|
"epoch": 0.13913595933926304, |
|
"grad_norm": 0.07845824211835861, |
|
"kl": 0.322265625, |
|
"learning_rate": 3.305316237076927e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 762.2857666015625, |
|
"epoch": 0.1397712833545108, |
|
"grad_norm": 2.1945505142211914, |
|
"kl": 0.15625, |
|
"learning_rate": 3.291181151824071e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3142857253551483, |
|
"reward_std": 0.15735916793346405, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.0, |
|
"epoch": 0.14040660736975857, |
|
"grad_norm": 2.6735599040985107, |
|
"kl": 0.19921875, |
|
"learning_rate": 3.27702335969396e-06, |
|
"loss": 0.002, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 673.2857666015625, |
|
"epoch": 0.14104193138500634, |
|
"grad_norm": 0.16328755021095276, |
|
"kl": 0.3203125, |
|
"learning_rate": 3.2628434547191985e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 690.7142944335938, |
|
"epoch": 0.14167725540025414, |
|
"grad_norm": 0.05350850895047188, |
|
"kl": 0.28515625, |
|
"learning_rate": 3.2486420318601973e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 705.7142944335938, |
|
"epoch": 0.1423125794155019, |
|
"grad_norm": 0.05627477914094925, |
|
"kl": 0.255859375, |
|
"learning_rate": 3.2344196869802187e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 522.857177734375, |
|
"epoch": 0.14294790343074967, |
|
"grad_norm": 3.546515464782715, |
|
"kl": 0.1943359375, |
|
"learning_rate": 3.2201770168203694e-06, |
|
"loss": 0.0019, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 656.857177734375, |
|
"epoch": 0.14358322744599747, |
|
"grad_norm": 0.30215829610824585, |
|
"kl": 0.283203125, |
|
"learning_rate": 3.205914618974563e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 680.1428833007812, |
|
"epoch": 0.14421855146124524, |
|
"grad_norm": 2.333859443664551, |
|
"kl": 0.1982421875, |
|
"learning_rate": 3.1916330918644496e-06, |
|
"loss": 0.002, |
|
"reward": 0.290529727935791, |
|
"reward_std": 0.16188988089561462, |
|
"rewards/code_format_reward": 0.5714285969734192, |
|
"rewards/code_reward": 0.004815409425646067, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 702.7142944335938, |
|
"epoch": 0.144853875476493, |
|
"grad_norm": 0.06898491084575653, |
|
"kl": 0.26171875, |
|
"learning_rate": 3.177333034714303e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 699.7142944335938, |
|
"epoch": 0.1454891994917408, |
|
"grad_norm": 3.096179246902466, |
|
"kl": 0.26171875, |
|
"learning_rate": 3.1630150475258813e-06, |
|
"loss": 0.0026, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.09759000688791275, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 704.4285888671875, |
|
"epoch": 0.14612452350698857, |
|
"grad_norm": 0.06554409861564636, |
|
"kl": 0.302734375, |
|
"learning_rate": 3.148679731053252e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.5714721679688, |
|
"epoch": 0.14675984752223634, |
|
"grad_norm": 2.8049392700195312, |
|
"kl": 0.2158203125, |
|
"learning_rate": 3.1343276867775805e-06, |
|
"loss": 0.0022, |
|
"reward": 0.4242587983608246, |
|
"reward_std": 0.04178621619939804, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.02425876073539257, |
|
"rewards/format_reward": 1.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 611.857177734375, |
|
"epoch": 0.1473951715374841, |
|
"grad_norm": 3.6352577209472656, |
|
"kl": 0.296875, |
|
"learning_rate": 3.1199595168819043e-06, |
|
"loss": 0.003, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.0, |
|
"epoch": 0.1480304955527319, |
|
"grad_norm": 2.4661004543304443, |
|
"kl": 0.298828125, |
|
"learning_rate": 3.105575824225852e-06, |
|
"loss": 0.003, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.857177734375, |
|
"epoch": 0.14866581956797967, |
|
"grad_norm": 2.9908604621887207, |
|
"kl": 0.20703125, |
|
"learning_rate": 3.091177212320363e-06, |
|
"loss": 0.0021, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.1428833007812, |
|
"epoch": 0.14930114358322744, |
|
"grad_norm": 0.04900944232940674, |
|
"kl": 0.1875, |
|
"learning_rate": 3.0767642853023538e-06, |
|
"loss": 0.0022, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.5714416503906, |
|
"epoch": 0.14993646759847523, |
|
"grad_norm": 2.6988017559051514, |
|
"kl": 0.216796875, |
|
"learning_rate": 3.062337647909376e-06, |
|
"loss": 0.0022, |
|
"reward": 0.37142857909202576, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 672.857177734375, |
|
"epoch": 0.150571791613723, |
|
"grad_norm": 2.6142776012420654, |
|
"kl": 0.22265625, |
|
"learning_rate": 3.04789790545424e-06, |
|
"loss": 0.0022, |
|
"reward": 0.4169172942638397, |
|
"reward_std": 0.04475894197821617, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.016917293891310692, |
|
"rewards/format_reward": 1.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.7142944335938, |
|
"epoch": 0.15120711562897077, |
|
"grad_norm": 0.06715590506792068, |
|
"kl": 0.1865234375, |
|
"learning_rate": 3.033445663799621e-06, |
|
"loss": 0.0022, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.2857666015625, |
|
"epoch": 0.15184243964421856, |
|
"grad_norm": 2.8286352157592773, |
|
"kl": 0.2197265625, |
|
"learning_rate": 3.018981529332633e-06, |
|
"loss": 0.0022, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.2857666015625, |
|
"epoch": 0.15247776365946633, |
|
"grad_norm": 0.07005994766950607, |
|
"kl": 0.21875, |
|
"learning_rate": 3.00450610893939e-06, |
|
"loss": 0.0025, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 678.5714721679688, |
|
"epoch": 0.1531130876747141, |
|
"grad_norm": 2.2401628494262695, |
|
"kl": 0.162109375, |
|
"learning_rate": 2.9900200099795396e-06, |
|
"loss": 0.0016, |
|
"reward": 0.40160515904426575, |
|
"reward_std": 0.004246791359037161, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.00160513655282557, |
|
"rewards/format_reward": 1.0, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 640.4285888671875, |
|
"epoch": 0.15374841168996187, |
|
"grad_norm": 2.341778039932251, |
|
"kl": 0.30078125, |
|
"learning_rate": 2.9755238402607826e-06, |
|
"loss": 0.003, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.22677868604660034, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 929.4285888671875, |
|
"epoch": 0.15438373570520966, |
|
"grad_norm": 2.2782864570617676, |
|
"kl": 0.193359375, |
|
"learning_rate": 2.961018208013367e-06, |
|
"loss": 0.0019, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 655.5714721679688, |
|
"epoch": 0.15501905972045743, |
|
"grad_norm": 0.0515868104994297, |
|
"kl": 0.265625, |
|
"learning_rate": 2.9465037218645694e-06, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 596.7142944335938, |
|
"epoch": 0.1556543837357052, |
|
"grad_norm": 0.06041451543569565, |
|
"kl": 0.271484375, |
|
"learning_rate": 2.9319809908131604e-06, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.5714721679688, |
|
"epoch": 0.156289707750953, |
|
"grad_norm": 0.11632593721151352, |
|
"kl": 0.33203125, |
|
"learning_rate": 2.917450624203847e-06, |
|
"loss": 0.0036, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 708.7142944335938, |
|
"epoch": 0.15692503176620076, |
|
"grad_norm": 2.282824993133545, |
|
"kl": 0.17578125, |
|
"learning_rate": 2.9029132317017118e-06, |
|
"loss": 0.0018, |
|
"reward": 0.6714285612106323, |
|
"reward_std": 0.27516230940818787, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.30000001192092896, |
|
"rewards/format_reward": 1.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 402.2857360839844, |
|
"epoch": 0.15756035578144853, |
|
"grad_norm": 0.05820649862289429, |
|
"kl": 0.263671875, |
|
"learning_rate": 2.888369423266629e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 559.1428833007812, |
|
"epoch": 0.15819567979669633, |
|
"grad_norm": 2.360961437225342, |
|
"kl": 0.2431640625, |
|
"learning_rate": 2.8738198091276712e-06, |
|
"loss": 0.0024, |
|
"reward": 0.771428644657135, |
|
"reward_std": 0.48205915093421936, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 673.857177734375, |
|
"epoch": 0.1588310038119441, |
|
"grad_norm": 2.426175832748413, |
|
"kl": 0.2431640625, |
|
"learning_rate": 2.859264999757509e-06, |
|
"loss": 0.0024, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983946204185486, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 555.7142944335938, |
|
"epoch": 0.15946632782719186, |
|
"grad_norm": 1.7476675510406494, |
|
"kl": 0.2060546875, |
|
"learning_rate": 2.8447056058467928e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 635.1428833007812, |
|
"epoch": 0.16010165184243966, |
|
"grad_norm": 0.10966142266988754, |
|
"kl": 0.296875, |
|
"learning_rate": 2.830142238278531e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 569.4285888671875, |
|
"epoch": 0.16073697585768743, |
|
"grad_norm": 0.07483859360218048, |
|
"kl": 0.306640625, |
|
"learning_rate": 2.81557550810246e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.1428833007812, |
|
"epoch": 0.1613722998729352, |
|
"grad_norm": 557283.4375, |
|
"kl": 19456.0, |
|
"learning_rate": 2.8010060265094026e-06, |
|
"loss": 194.2972, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983946204185486, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 572.1428833007812, |
|
"epoch": 0.16200762388818296, |
|
"grad_norm": 2.958310842514038, |
|
"kl": 0.203125, |
|
"learning_rate": 2.786434404805629e-06, |
|
"loss": 0.002, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.5714721679688, |
|
"epoch": 0.16264294790343076, |
|
"grad_norm": 2.9970836639404297, |
|
"kl": 0.359375, |
|
"learning_rate": 2.771861254387199e-06, |
|
"loss": 0.0036, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.42983949184417725, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 626.5714721679688, |
|
"epoch": 0.16327827191867852, |
|
"grad_norm": 0.052250247448682785, |
|
"kl": 0.203125, |
|
"learning_rate": 2.7572871867143204e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 587.0, |
|
"epoch": 0.1639135959339263, |
|
"grad_norm": 0.049148622900247574, |
|
"kl": 0.1845703125, |
|
"learning_rate": 2.742712813285681e-06, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 413.71429443359375, |
|
"epoch": 0.1645489199491741, |
|
"grad_norm": 3.1558637619018555, |
|
"kl": 0.296875, |
|
"learning_rate": 2.7281387456128017e-06, |
|
"loss": 0.003, |
|
"reward": 0.6661654710769653, |
|
"reward_std": 0.2892994284629822, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.3233082890510559, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.1428833007812, |
|
"epoch": 0.16518424396442186, |
|
"grad_norm": 2.672922372817993, |
|
"kl": 0.333984375, |
|
"learning_rate": 2.7135655951943716e-06, |
|
"loss": 0.0033, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.2857666015625, |
|
"epoch": 0.16581956797966962, |
|
"grad_norm": 0.062102027237415314, |
|
"kl": 0.2890625, |
|
"learning_rate": 2.698993973490598e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.4285888671875, |
|
"epoch": 0.16645489199491742, |
|
"grad_norm": 2.032621145248413, |
|
"kl": 0.349609375, |
|
"learning_rate": 2.6844244918975416e-06, |
|
"loss": 0.0035, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 680.4285888671875, |
|
"epoch": 0.1670902160101652, |
|
"grad_norm": 0.07202436029911041, |
|
"kl": 0.25390625, |
|
"learning_rate": 2.66985776172147e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 607.0, |
|
"epoch": 0.16772554002541296, |
|
"grad_norm": 3.1766250133514404, |
|
"kl": 0.3046875, |
|
"learning_rate": 2.6552943941532088e-06, |
|
"loss": 0.003, |
|
"reward": 0.41587308049201965, |
|
"reward_std": 0.0419960655272007, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.01587301678955555, |
|
"rewards/format_reward": 1.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 661.5714721679688, |
|
"epoch": 0.16836086404066072, |
|
"grad_norm": 3.2823150157928467, |
|
"kl": 0.2392578125, |
|
"learning_rate": 2.6407350002424927e-06, |
|
"loss": 0.0024, |
|
"reward": 0.5868132710456848, |
|
"reward_std": 0.23857378959655762, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.18681320548057556, |
|
"rewards/format_reward": 1.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.857177734375, |
|
"epoch": 0.16899618805590852, |
|
"grad_norm": 0.08283974230289459, |
|
"kl": 0.2021484375, |
|
"learning_rate": 2.626180190872329e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.2857360839844, |
|
"epoch": 0.1696315120711563, |
|
"grad_norm": 3.3979873657226562, |
|
"kl": 0.234375, |
|
"learning_rate": 2.611630576733372e-06, |
|
"loss": 0.0023, |
|
"reward": 0.41260507702827454, |
|
"reward_std": 0.01228986494243145, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.01260504312813282, |
|
"rewards/format_reward": 1.0, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.1428833007812, |
|
"epoch": 0.17026683608640406, |
|
"grad_norm": 2.4819862842559814, |
|
"kl": 0.28125, |
|
"learning_rate": 2.5970867682982885e-06, |
|
"loss": 0.0028, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 458.14288330078125, |
|
"epoch": 0.17090216010165185, |
|
"grad_norm": 0.08856673538684845, |
|
"kl": 0.349609375, |
|
"learning_rate": 2.582549375796154e-06, |
|
"loss": 0.0038, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 468.71429443359375, |
|
"epoch": 0.17153748411689962, |
|
"grad_norm": 2.835487127304077, |
|
"kl": 0.32421875, |
|
"learning_rate": 2.568019009186841e-06, |
|
"loss": 0.0032, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 637.4285888671875, |
|
"epoch": 0.1721728081321474, |
|
"grad_norm": 0.054750654846429825, |
|
"kl": 0.162109375, |
|
"learning_rate": 2.5534962781354317e-06, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 757.2857666015625, |
|
"epoch": 0.17280813214739518, |
|
"grad_norm": 2.1099636554718018, |
|
"kl": 0.21484375, |
|
"learning_rate": 2.538981791986634e-06, |
|
"loss": 0.0021, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 425.2857360839844, |
|
"epoch": 0.17344345616264295, |
|
"grad_norm": 2.960906982421875, |
|
"kl": 0.251953125, |
|
"learning_rate": 2.524476159739218e-06, |
|
"loss": 0.0025, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.5714721679688, |
|
"epoch": 0.17407878017789072, |
|
"grad_norm": 2.5509631633758545, |
|
"kl": 0.26953125, |
|
"learning_rate": 2.5099799900204607e-06, |
|
"loss": 0.0027, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.8571472167969, |
|
"epoch": 0.17471410419313851, |
|
"grad_norm": 2.9435410499572754, |
|
"kl": 0.400390625, |
|
"learning_rate": 2.4954938910606108e-06, |
|
"loss": 0.004, |
|
"reward": 0.563265323638916, |
|
"reward_std": 0.37278667092323303, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.16326531767845154, |
|
"rewards/format_reward": 1.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 409.5714416503906, |
|
"epoch": 0.17534942820838628, |
|
"grad_norm": 4.025363922119141, |
|
"kl": 0.275390625, |
|
"learning_rate": 2.481018470667368e-06, |
|
"loss": 0.0028, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 467.14288330078125, |
|
"epoch": 0.17598475222363405, |
|
"grad_norm": 0.09415756165981293, |
|
"kl": 0.30078125, |
|
"learning_rate": 2.4665543362003802e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.0000305175781, |
|
"epoch": 0.17662007623888182, |
|
"grad_norm": 3.590571165084839, |
|
"kl": 0.2216796875, |
|
"learning_rate": 2.4521020945457615e-06, |
|
"loss": 0.0022, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.2857360839844, |
|
"epoch": 0.1772554002541296, |
|
"grad_norm": 2.6514501571655273, |
|
"kl": 0.1884765625, |
|
"learning_rate": 2.4376623520906255e-06, |
|
"loss": 0.0019, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 395.4285888671875, |
|
"epoch": 0.17789072426937738, |
|
"grad_norm": 3.0715246200561523, |
|
"kl": 0.396484375, |
|
"learning_rate": 2.4232357146976478e-06, |
|
"loss": 0.004, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.1428833007812, |
|
"epoch": 0.17852604828462515, |
|
"grad_norm": 0.04878819361329079, |
|
"kl": 0.25390625, |
|
"learning_rate": 2.408822787679637e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.0000305175781, |
|
"epoch": 0.17916137229987295, |
|
"grad_norm": 2.8764052391052246, |
|
"kl": 0.337890625, |
|
"learning_rate": 2.3944241757741475e-06, |
|
"loss": 0.0034, |
|
"reward": 0.7617021799087524, |
|
"reward_std": 0.39519527554512024, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.3617021143436432, |
|
"rewards/format_reward": 1.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.0, |
|
"epoch": 0.1797966963151207, |
|
"grad_norm": 3.4082202911376953, |
|
"kl": 0.279296875, |
|
"learning_rate": 2.380040483118097e-06, |
|
"loss": 0.0028, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.0000305175781, |
|
"epoch": 0.18043202033036848, |
|
"grad_norm": 3.0144095420837402, |
|
"kl": 0.236328125, |
|
"learning_rate": 2.365672313222419e-06, |
|
"loss": 0.0024, |
|
"reward": 0.800332248210907, |
|
"reward_std": 0.2776820659637451, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4003322720527649, |
|
"rewards/format_reward": 1.0, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 514.5714721679688, |
|
"epoch": 0.18106734434561628, |
|
"grad_norm": 0.1193128377199173, |
|
"kl": 0.35546875, |
|
"learning_rate": 2.351320268946749e-06, |
|
"loss": 0.0038, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.7142944335938, |
|
"epoch": 0.18170266836086404, |
|
"grad_norm": 0.050928860902786255, |
|
"kl": 0.25390625, |
|
"learning_rate": 2.336984952474119e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.857177734375, |
|
"epoch": 0.1823379923761118, |
|
"grad_norm": 0.12127784639596939, |
|
"kl": 0.310546875, |
|
"learning_rate": 2.322666965285697e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 515.857177734375, |
|
"epoch": 0.18297331639135958, |
|
"grad_norm": 0.04745308309793472, |
|
"kl": 0.302734375, |
|
"learning_rate": 2.3083669081355507e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 619.0, |
|
"epoch": 0.18360864040660738, |
|
"grad_norm": 0.04747169092297554, |
|
"kl": 0.2392578125, |
|
"learning_rate": 2.2940853810254377e-06, |
|
"loss": 0.0027, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 427.2857360839844, |
|
"epoch": 0.18424396442185514, |
|
"grad_norm": 0.04503343254327774, |
|
"kl": 0.2109375, |
|
"learning_rate": 2.2798229831796313e-06, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 350.8571472167969, |
|
"epoch": 0.1848792884371029, |
|
"grad_norm": 0.15933562815189362, |
|
"kl": 0.29296875, |
|
"learning_rate": 2.2655803130197816e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.4285888671875, |
|
"epoch": 0.1855146124523507, |
|
"grad_norm": 0.061849016696214676, |
|
"kl": 0.34375, |
|
"learning_rate": 2.2513579681398034e-06, |
|
"loss": 0.0037, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 447.5714416503906, |
|
"epoch": 0.18614993646759848, |
|
"grad_norm": 3.1712300777435303, |
|
"kl": 0.1982421875, |
|
"learning_rate": 2.237156545280803e-06, |
|
"loss": 0.002, |
|
"reward": 1.1142858266830444, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.7142857313156128, |
|
"rewards/format_reward": 1.0, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 530.7142944335938, |
|
"epoch": 0.18678526048284624, |
|
"grad_norm": 2.926269769668579, |
|
"kl": 0.27734375, |
|
"learning_rate": 2.2229766403060403e-06, |
|
"loss": 0.0028, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 370.14288330078125, |
|
"epoch": 0.18742058449809404, |
|
"grad_norm": 3.639002799987793, |
|
"kl": 0.412109375, |
|
"learning_rate": 2.2088188481759305e-06, |
|
"loss": 0.0041, |
|
"reward": 0.44044750928878784, |
|
"reward_std": 0.06550441682338715, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.040447503328323364, |
|
"rewards/format_reward": 1.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 411.8571472167969, |
|
"epoch": 0.1880559085133418, |
|
"grad_norm": 3.473376512527466, |
|
"kl": 0.337890625, |
|
"learning_rate": 2.194683762923073e-06, |
|
"loss": 0.0034, |
|
"reward": 0.8047619462013245, |
|
"reward_std": 0.28637492656707764, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4047619104385376, |
|
"rewards/format_reward": 1.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 537.1428833007812, |
|
"epoch": 0.18869123252858958, |
|
"grad_norm": 0.05299937725067139, |
|
"kl": 0.2373046875, |
|
"learning_rate": 2.1805719776273387e-06, |
|
"loss": 0.0027, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 499.0000305175781, |
|
"epoch": 0.18932655654383734, |
|
"grad_norm": 0.04631977900862694, |
|
"kl": 0.2265625, |
|
"learning_rate": 2.166484084390974e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 490.14288330078125, |
|
"epoch": 0.18996188055908514, |
|
"grad_norm": 0.08931510150432587, |
|
"kl": 0.244140625, |
|
"learning_rate": 2.1524206743137636e-06, |
|
"loss": 0.0027, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 461.2857360839844, |
|
"epoch": 0.1905972045743329, |
|
"grad_norm": 3.1879539489746094, |
|
"kl": 0.236328125, |
|
"learning_rate": 2.1383823374682287e-06, |
|
"loss": 0.0024, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 493.14288330078125, |
|
"epoch": 0.19123252858958067, |
|
"grad_norm": 2.5813148021698, |
|
"kl": 0.2294921875, |
|
"learning_rate": 2.124369662874868e-06, |
|
"loss": 0.0023, |
|
"reward": 0.44916945695877075, |
|
"reward_std": 0.2263808697462082, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.10631229728460312, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 467.857177734375, |
|
"epoch": 0.19186785260482847, |
|
"grad_norm": 2.3633549213409424, |
|
"kl": 0.251953125, |
|
"learning_rate": 2.110383238477441e-06, |
|
"loss": 0.0025, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 467.5714416503906, |
|
"epoch": 0.19250317662007624, |
|
"grad_norm": 3.2412614822387695, |
|
"kl": 0.2021484375, |
|
"learning_rate": 2.096423651118305e-06, |
|
"loss": 0.002, |
|
"reward": 0.41476020216941833, |
|
"reward_std": 0.03590288758277893, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.014760148711502552, |
|
"rewards/format_reward": 1.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 629.1428833007812, |
|
"epoch": 0.193138500635324, |
|
"grad_norm": 1.8208192586898804, |
|
"kl": 0.1845703125, |
|
"learning_rate": 2.082491486513788e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.1428833007812, |
|
"epoch": 0.1937738246505718, |
|
"grad_norm": 0.06740770488977432, |
|
"kl": 0.326171875, |
|
"learning_rate": 2.0685873292296116e-06, |
|
"loss": 0.0036, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 469.0000305175781, |
|
"epoch": 0.19440914866581957, |
|
"grad_norm": 0.050011664628982544, |
|
"kl": 0.322265625, |
|
"learning_rate": 2.054711762656369e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.2857360839844, |
|
"epoch": 0.19504447268106734, |
|
"grad_norm": 2.7797482013702393, |
|
"kl": 0.1865234375, |
|
"learning_rate": 2.040865368985044e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 483.4285888671875, |
|
"epoch": 0.19567979669631513, |
|
"grad_norm": 0.056970253586769104, |
|
"kl": 0.205078125, |
|
"learning_rate": 2.027048729182583e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 708.4285888671875, |
|
"epoch": 0.1963151207115629, |
|
"grad_norm": 1.9797154664993286, |
|
"kl": 0.2421875, |
|
"learning_rate": 2.0132624229675205e-06, |
|
"loss": 0.0024, |
|
"reward": 0.45210087299346924, |
|
"reward_std": 0.11403417587280273, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.10924370586872101, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 432.14288330078125, |
|
"epoch": 0.19695044472681067, |
|
"grad_norm": 0.0952640175819397, |
|
"kl": 0.40625, |
|
"learning_rate": 1.9995070287856546e-06, |
|
"loss": 0.0044, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 533.7142944335938, |
|
"epoch": 0.19758576874205844, |
|
"grad_norm": 3.1438472270965576, |
|
"kl": 0.30859375, |
|
"learning_rate": 1.985783123785774e-06, |
|
"loss": 0.0031, |
|
"reward": 0.49523812532424927, |
|
"reward_std": 0.11878278106451035, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0952381044626236, |
|
"rewards/format_reward": 1.0, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 536.5714721679688, |
|
"epoch": 0.19822109275730623, |
|
"grad_norm": 2.1929433345794678, |
|
"kl": 0.314453125, |
|
"learning_rate": 1.9720912837954486e-06, |
|
"loss": 0.0031, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.71429443359375, |
|
"epoch": 0.198856416772554, |
|
"grad_norm": 0.05079368129372597, |
|
"kl": 0.306640625, |
|
"learning_rate": 1.958432083296862e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.14288330078125, |
|
"epoch": 0.19949174078780177, |
|
"grad_norm": 0.06545651704072952, |
|
"kl": 0.2099609375, |
|
"learning_rate": 1.9448060954027093e-06, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.1428833007812, |
|
"epoch": 0.20012706480304956, |
|
"grad_norm": 2.852233409881592, |
|
"kl": 0.27734375, |
|
"learning_rate": 1.931213891832153e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40317460894584656, |
|
"reward_std": 0.005421662237495184, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0031746034510433674, |
|
"rewards/format_reward": 1.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 518.7142944335938, |
|
"epoch": 0.20076238881829733, |
|
"grad_norm": 0.056687433272600174, |
|
"kl": 0.2158203125, |
|
"learning_rate": 1.9176560428868336e-06, |
|
"loss": 0.0025, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.857177734375, |
|
"epoch": 0.2013977128335451, |
|
"grad_norm": 2.1868667602539062, |
|
"kl": 0.1923828125, |
|
"learning_rate": 1.9041331174269373e-06, |
|
"loss": 0.0019, |
|
"reward": 0.3556329905986786, |
|
"reward_std": 0.16032202541828156, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.012775842100381851, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 627.7142944335938, |
|
"epoch": 0.2020330368487929, |
|
"grad_norm": 0.055125512182712555, |
|
"kl": 0.2578125, |
|
"learning_rate": 1.8906456828473341e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.4285888671875, |
|
"epoch": 0.20266836086404066, |
|
"grad_norm": 0.0708109587430954, |
|
"kl": 0.265625, |
|
"learning_rate": 1.8771943050537656e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 568.857177734375, |
|
"epoch": 0.20330368487928843, |
|
"grad_norm": 3.287763833999634, |
|
"kl": 0.1796875, |
|
"learning_rate": 1.8637795484391046e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4266955256462097, |
|
"reward_std": 0.05106709897518158, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.026695528998970985, |
|
"rewards/format_reward": 1.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 356.71429443359375, |
|
"epoch": 0.2039390088945362, |
|
"grad_norm": 2.707819700241089, |
|
"kl": 0.271484375, |
|
"learning_rate": 1.8504019758596698e-06, |
|
"loss": 0.0027, |
|
"reward": 1.2571430206298828, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.8571429252624512, |
|
"rewards/format_reward": 1.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.4285888671875, |
|
"epoch": 0.204574332909784, |
|
"grad_norm": 2.942082405090332, |
|
"kl": 0.353515625, |
|
"learning_rate": 1.8370621486116163e-06, |
|
"loss": 0.0035, |
|
"reward": 0.41904765367507935, |
|
"reward_std": 0.05039527267217636, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.01904762163758278, |
|
"rewards/format_reward": 1.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.14288330078125, |
|
"epoch": 0.20520965692503176, |
|
"grad_norm": 0.06295392662286758, |
|
"kl": 0.330078125, |
|
"learning_rate": 1.823760626407377e-06, |
|
"loss": 0.0036, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 499.4285888671875, |
|
"epoch": 0.20584498094027953, |
|
"grad_norm": 2.458465099334717, |
|
"kl": 0.34375, |
|
"learning_rate": 1.8104979673521838e-06, |
|
"loss": 0.0034, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 526.2857666015625, |
|
"epoch": 0.20648030495552733, |
|
"grad_norm": 3.1990699768066406, |
|
"kl": 0.33203125, |
|
"learning_rate": 1.7972747279206482e-06, |
|
"loss": 0.0033, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 601.0, |
|
"epoch": 0.2071156289707751, |
|
"grad_norm": 0.05853183940052986, |
|
"kl": 0.2890625, |
|
"learning_rate": 1.7840914629334122e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.71429443359375, |
|
"epoch": 0.20775095298602286, |
|
"grad_norm": 0.07322244346141815, |
|
"kl": 0.3125, |
|
"learning_rate": 1.7709487255338731e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.857177734375, |
|
"epoch": 0.20838627700127066, |
|
"grad_norm": 0.17839957773685455, |
|
"kl": 0.228515625, |
|
"learning_rate": 1.7578470671649684e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 530.857177734375, |
|
"epoch": 0.20902160101651843, |
|
"grad_norm": 0.1131465956568718, |
|
"kl": 0.3359375, |
|
"learning_rate": 1.744787037546045e-06, |
|
"loss": 0.0037, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 564.1428833007812, |
|
"epoch": 0.2096569250317662, |
|
"grad_norm": 0.08917635679244995, |
|
"kl": 0.30078125, |
|
"learning_rate": 1.731769184649788e-06, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 401.4285888671875, |
|
"epoch": 0.210292249047014, |
|
"grad_norm": 0.15413963794708252, |
|
"kl": 0.36328125, |
|
"learning_rate": 1.7187940546792325e-06, |
|
"loss": 0.0039, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.0, |
|
"epoch": 0.21092757306226176, |
|
"grad_norm": 3.1670172214508057, |
|
"kl": 0.306640625, |
|
"learning_rate": 1.7058621920448465e-06, |
|
"loss": 0.0031, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 477.14288330078125, |
|
"epoch": 0.21156289707750953, |
|
"grad_norm": 0.07635504752397537, |
|
"kl": 0.333984375, |
|
"learning_rate": 1.6929741393416855e-06, |
|
"loss": 0.0036, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 405.71429443359375, |
|
"epoch": 0.2121982210927573, |
|
"grad_norm": 0.0835573673248291, |
|
"kl": 0.361328125, |
|
"learning_rate": 1.6801304373266286e-06, |
|
"loss": 0.0039, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 556.0, |
|
"epoch": 0.2128335451080051, |
|
"grad_norm": 0.17580975592136383, |
|
"kl": 0.310546875, |
|
"learning_rate": 1.667331624895689e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 382.14288330078125, |
|
"epoch": 0.21346886912325286, |
|
"grad_norm": 2.9555137157440186, |
|
"kl": 0.330078125, |
|
"learning_rate": 1.6545782390614037e-06, |
|
"loss": 0.0033, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 400.14288330078125, |
|
"epoch": 0.21410419313850063, |
|
"grad_norm": 0.08389069885015488, |
|
"kl": 0.2275390625, |
|
"learning_rate": 1.6418708149302992e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 425.2857360839844, |
|
"epoch": 0.21473951715374842, |
|
"grad_norm": 2.6030876636505127, |
|
"kl": 0.23046875, |
|
"learning_rate": 1.6292098856804423e-06, |
|
"loss": 0.0023, |
|
"reward": 1.2571430206298828, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.8571429252624512, |
|
"rewards/format_reward": 1.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 369.71429443359375, |
|
"epoch": 0.2153748411689962, |
|
"grad_norm": 0.09957047551870346, |
|
"kl": 0.255859375, |
|
"learning_rate": 1.6165959825390661e-06, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 374.5714416503906, |
|
"epoch": 0.21601016518424396, |
|
"grad_norm": 3.5869476795196533, |
|
"kl": 0.2216796875, |
|
"learning_rate": 1.604029634760284e-06, |
|
"loss": 0.0022, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.14288330078125, |
|
"epoch": 0.21664548919949175, |
|
"grad_norm": 0.04904744401574135, |
|
"kl": 0.208984375, |
|
"learning_rate": 1.59151136960288e-06, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.0, |
|
"epoch": 0.21728081321473952, |
|
"grad_norm": 0.07155793160200119, |
|
"kl": 0.29296875, |
|
"learning_rate": 1.5790417123081903e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.14288330078125, |
|
"epoch": 0.2179161372299873, |
|
"grad_norm": 0.07623915374279022, |
|
"kl": 0.34765625, |
|
"learning_rate": 1.5666211860780583e-06, |
|
"loss": 0.0038, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.2857360839844, |
|
"epoch": 0.21855146124523506, |
|
"grad_norm": 3.3097286224365234, |
|
"kl": 0.330078125, |
|
"learning_rate": 1.5542503120528918e-06, |
|
"loss": 0.0033, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 504.4285888671875, |
|
"epoch": 0.21918678526048285, |
|
"grad_norm": 3.0239760875701904, |
|
"kl": 0.3125, |
|
"learning_rate": 1.5419296092897866e-06, |
|
"loss": 0.0031, |
|
"reward": 0.5071429014205933, |
|
"reward_std": 0.14202801883220673, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.107142873108387, |
|
"rewards/format_reward": 1.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 656.857177734375, |
|
"epoch": 0.21982210927573062, |
|
"grad_norm": 0.04605603963136673, |
|
"kl": 0.279296875, |
|
"learning_rate": 1.529659594740755e-06, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 411.2857360839844, |
|
"epoch": 0.2204574332909784, |
|
"grad_norm": 0.05368569865822792, |
|
"kl": 0.373046875, |
|
"learning_rate": 1.5174407832310338e-06, |
|
"loss": 0.004, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 626.0, |
|
"epoch": 0.22109275730622618, |
|
"grad_norm": 2.683948040008545, |
|
"kl": 0.26171875, |
|
"learning_rate": 1.5052736874374815e-06, |
|
"loss": 0.0026, |
|
"reward": 0.4285714626312256, |
|
"reward_std": 0.04879499599337578, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.02857143059372902, |
|
"rewards/format_reward": 1.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.857177734375, |
|
"epoch": 0.22172808132147395, |
|
"grad_norm": 2.875629186630249, |
|
"kl": 0.203125, |
|
"learning_rate": 1.4931588178670695e-06, |
|
"loss": 0.002, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 259.5714416503906, |
|
"epoch": 0.22236340533672172, |
|
"grad_norm": 0.2272838056087494, |
|
"kl": 0.353515625, |
|
"learning_rate": 1.4810966828354605e-06, |
|
"loss": 0.0047, |
|
"reward": 1.4000002145767212, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 1.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 479.71429443359375, |
|
"epoch": 0.22299872935196952, |
|
"grad_norm": 3.2993814945220947, |
|
"kl": 0.21484375, |
|
"learning_rate": 1.469087788445684e-06, |
|
"loss": 0.0021, |
|
"reward": 0.42500001192092896, |
|
"reward_std": 0.04564352706074715, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.02500000223517418, |
|
"rewards/format_reward": 1.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.5714416503906, |
|
"epoch": 0.22363405336721728, |
|
"grad_norm": 0.2275489717721939, |
|
"kl": 0.333984375, |
|
"learning_rate": 1.4571326385668965e-06, |
|
"loss": 0.0036, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.857177734375, |
|
"epoch": 0.22426937738246505, |
|
"grad_norm": 0.05804005637764931, |
|
"kl": 0.341796875, |
|
"learning_rate": 1.4452317348132434e-06, |
|
"loss": 0.0037, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 255.1428680419922, |
|
"epoch": 0.22490470139771285, |
|
"grad_norm": 0.30045151710510254, |
|
"kl": 0.3125, |
|
"learning_rate": 1.4333855765228104e-06, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 401.8571472167969, |
|
"epoch": 0.22554002541296062, |
|
"grad_norm": 0.1624024361371994, |
|
"kl": 0.36328125, |
|
"learning_rate": 1.421594660736675e-06, |
|
"loss": 0.0039, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 458.5714416503906, |
|
"epoch": 0.22617534942820838, |
|
"grad_norm": 0.06793898344039917, |
|
"kl": 0.2294921875, |
|
"learning_rate": 1.4098594821780476e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 354.2857360839844, |
|
"epoch": 0.22681067344345615, |
|
"grad_norm": 3.499160051345825, |
|
"kl": 0.2412109375, |
|
"learning_rate": 1.3981805332315174e-06, |
|
"loss": 0.0024, |
|
"reward": 1.057142972946167, |
|
"reward_std": 0.47207754850387573, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 432.71429443359375, |
|
"epoch": 0.22744599745870395, |
|
"grad_norm": 3.1533403396606445, |
|
"kl": 0.30859375, |
|
"learning_rate": 1.3865583039223929e-06, |
|
"loss": 0.0031, |
|
"reward": 0.7381389141082764, |
|
"reward_std": 0.2022811770439148, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.3381389379501343, |
|
"rewards/format_reward": 1.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 287.8571472167969, |
|
"epoch": 0.22808132147395171, |
|
"grad_norm": 0.08542405813932419, |
|
"kl": 0.3203125, |
|
"learning_rate": 1.374993281896137e-06, |
|
"loss": 0.0044, |
|
"reward": 1.4000002145767212, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 1.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.5714721679688, |
|
"epoch": 0.22871664548919948, |
|
"grad_norm": 1.56419038772583, |
|
"kl": 0.1650390625, |
|
"learning_rate": 1.3634859523979134e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 356.4285888671875, |
|
"epoch": 0.22935196950444728, |
|
"grad_norm": 3.0232937335968018, |
|
"kl": 0.224609375, |
|
"learning_rate": 1.3520367982522208e-06, |
|
"loss": 0.0022, |
|
"reward": 1.1142858266830444, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.7142857313156128, |
|
"rewards/format_reward": 1.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.1428833007812, |
|
"epoch": 0.22998729351969505, |
|
"grad_norm": 3.2974321842193604, |
|
"kl": 0.1875, |
|
"learning_rate": 1.3406462998426358e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 456.2857360839844, |
|
"epoch": 0.23062261753494281, |
|
"grad_norm": 3.04162859916687, |
|
"kl": 0.2119140625, |
|
"learning_rate": 1.3293149350916595e-06, |
|
"loss": 0.0021, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 532.4285888671875, |
|
"epoch": 0.2312579415501906, |
|
"grad_norm": 3.197749376296997, |
|
"kl": 0.1748046875, |
|
"learning_rate": 1.3180431794406623e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.71429443359375, |
|
"epoch": 0.23189326556543838, |
|
"grad_norm": 3.0233240127563477, |
|
"kl": 0.2294921875, |
|
"learning_rate": 1.3068315058299358e-06, |
|
"loss": 0.0023, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 362.5714416503906, |
|
"epoch": 0.23252858958068615, |
|
"grad_norm": 0.06233400106430054, |
|
"kl": 0.28515625, |
|
"learning_rate": 1.2956803846788503e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 475.2857360839844, |
|
"epoch": 0.2331639135959339, |
|
"grad_norm": 0.04722089692950249, |
|
"kl": 0.193359375, |
|
"learning_rate": 1.284590283866116e-06, |
|
"loss": 0.0022, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 644.5714721679688, |
|
"epoch": 0.2337992376111817, |
|
"grad_norm": 2.4597692489624023, |
|
"kl": 0.265625, |
|
"learning_rate": 1.2735616687101518e-06, |
|
"loss": 0.0027, |
|
"reward": 0.4714285731315613, |
|
"reward_std": 0.18898221850395203, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0714285746216774, |
|
"rewards/format_reward": 1.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 353.8571472167969, |
|
"epoch": 0.23443456162642948, |
|
"grad_norm": 3.595822811126709, |
|
"kl": 0.255859375, |
|
"learning_rate": 1.2625950019495614e-06, |
|
"loss": 0.0026, |
|
"reward": 0.6106783151626587, |
|
"reward_std": 0.34680601954460144, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.21067820489406586, |
|
"rewards/format_reward": 1.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.5714721679688, |
|
"epoch": 0.23506988564167725, |
|
"grad_norm": 0.05966843292117119, |
|
"kl": 0.287109375, |
|
"learning_rate": 1.251690743723718e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 315.4285888671875, |
|
"epoch": 0.23570520965692504, |
|
"grad_norm": 3.2647478580474854, |
|
"kl": 0.26171875, |
|
"learning_rate": 1.2408493515534581e-06, |
|
"loss": 0.0026, |
|
"reward": 1.1142858266830444, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.7142857313156128, |
|
"rewards/format_reward": 1.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.0000305175781, |
|
"epoch": 0.2363405336721728, |
|
"grad_norm": 3.232701063156128, |
|
"kl": 0.1650390625, |
|
"learning_rate": 1.2300712803218834e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 551.5714721679688, |
|
"epoch": 0.23697585768742058, |
|
"grad_norm": 0.05632919818162918, |
|
"kl": 0.2890625, |
|
"learning_rate": 1.2193569822552772e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 407.0000305175781, |
|
"epoch": 0.23761118170266837, |
|
"grad_norm": 0.06528866291046143, |
|
"kl": 0.228515625, |
|
"learning_rate": 1.2087069069041268e-06, |
|
"loss": 0.0026, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 378.8571472167969, |
|
"epoch": 0.23824650571791614, |
|
"grad_norm": 3.4633543491363525, |
|
"kl": 0.29296875, |
|
"learning_rate": 1.1981215011242654e-06, |
|
"loss": 0.0029, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 499.0000305175781, |
|
"epoch": 0.2388818297331639, |
|
"grad_norm": 0.06882923096418381, |
|
"kl": 0.2216796875, |
|
"learning_rate": 1.1876012090581184e-06, |
|
"loss": 0.0025, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 424.8571472167969, |
|
"epoch": 0.23951715374841168, |
|
"grad_norm": 0.048563580960035324, |
|
"kl": 0.1806640625, |
|
"learning_rate": 1.177146472116071e-06, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 556.1428833007812, |
|
"epoch": 0.24015247776365947, |
|
"grad_norm": 3.4067060947418213, |
|
"kl": 0.263671875, |
|
"learning_rate": 1.1667577289579462e-06, |
|
"loss": 0.0026, |
|
"reward": 0.43296706676483154, |
|
"reward_std": 0.06052277237176895, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.032967038452625275, |
|
"rewards/format_reward": 1.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 415.0000305175781, |
|
"epoch": 0.24078780177890724, |
|
"grad_norm": 2.6581125259399414, |
|
"kl": 0.3203125, |
|
"learning_rate": 1.1564354154746007e-06, |
|
"loss": 0.0032, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 473.0000305175781, |
|
"epoch": 0.241423125794155, |
|
"grad_norm": 0.12217168509960175, |
|
"kl": 0.2734375, |
|
"learning_rate": 1.146179964769635e-06, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 358.5714416503906, |
|
"epoch": 0.2420584498094028, |
|
"grad_norm": 0.07087297737598419, |
|
"kl": 0.337890625, |
|
"learning_rate": 1.1359918071412195e-06, |
|
"loss": 0.0037, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 666.7142944335938, |
|
"epoch": 0.24269377382465057, |
|
"grad_norm": 3.078494071960449, |
|
"kl": 0.1669921875, |
|
"learning_rate": 1.1258713700640456e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.5714721679688, |
|
"epoch": 0.24332909783989834, |
|
"grad_norm": 0.0404311902821064, |
|
"kl": 0.25, |
|
"learning_rate": 1.115819078171383e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.857177734375, |
|
"epoch": 0.24396442185514614, |
|
"grad_norm": 2.985482931137085, |
|
"kl": 0.1787109375, |
|
"learning_rate": 1.1058353532372667e-06, |
|
"loss": 0.0018, |
|
"reward": 0.37460315227508545, |
|
"reward_std": 0.07708179205656052, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0031746032182127237, |
|
"rewards/format_reward": 1.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 499.857177734375, |
|
"epoch": 0.2445997458703939, |
|
"grad_norm": 2.4948058128356934, |
|
"kl": 0.251953125, |
|
"learning_rate": 1.0959206141587998e-06, |
|
"loss": 0.0025, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 400.2857360839844, |
|
"epoch": 0.24523506988564167, |
|
"grad_norm": 3.1324355602264404, |
|
"kl": 0.2119140625, |
|
"learning_rate": 1.0860752769385766e-06, |
|
"loss": 0.0021, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 425.14288330078125, |
|
"epoch": 0.24587039390088947, |
|
"grad_norm": 0.061597954481840134, |
|
"kl": 0.28515625, |
|
"learning_rate": 1.0762997546672279e-06, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.4285888671875, |
|
"epoch": 0.24650571791613723, |
|
"grad_norm": 0.047814078629016876, |
|
"kl": 0.2021484375, |
|
"learning_rate": 1.0665944575060914e-06, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 375.71429443359375, |
|
"epoch": 0.247141041931385, |
|
"grad_norm": 3.590963363647461, |
|
"kl": 0.22265625, |
|
"learning_rate": 1.056959792669997e-06, |
|
"loss": 0.0022, |
|
"reward": 0.5285714268684387, |
|
"reward_std": 0.11126972734928131, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.12857143580913544, |
|
"rewards/format_reward": 1.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.0000305175781, |
|
"epoch": 0.24777636594663277, |
|
"grad_norm": 0.10802248120307922, |
|
"kl": 0.376953125, |
|
"learning_rate": 1.0473961644101856e-06, |
|
"loss": 0.0041, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 475.14288330078125, |
|
"epoch": 0.24841168996188057, |
|
"grad_norm": 0.18473738431930542, |
|
"kl": 0.318359375, |
|
"learning_rate": 1.037903973997345e-06, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 455.5714416503906, |
|
"epoch": 0.24904701397712833, |
|
"grad_norm": 4.052010536193848, |
|
"kl": 0.349609375, |
|
"learning_rate": 1.0284836197047737e-06, |
|
"loss": 0.0035, |
|
"reward": 0.49629050493240356, |
|
"reward_std": 0.17879442870616913, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0962904542684555, |
|
"rewards/format_reward": 1.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 437.0000305175781, |
|
"epoch": 0.2496823379923761, |
|
"grad_norm": 2.7656362056732178, |
|
"kl": 0.2158203125, |
|
"learning_rate": 1.0191354967916712e-06, |
|
"loss": 0.0022, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 513.2857666015625, |
|
"epoch": 0.2503176620076239, |
|
"grad_norm": 0.08351174741983414, |
|
"kl": 0.275390625, |
|
"learning_rate": 1.0098599974865515e-06, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 429.4285888671875, |
|
"epoch": 0.25095298602287164, |
|
"grad_norm": 0.061210744082927704, |
|
"kl": 0.248046875, |
|
"learning_rate": 1.0006575109707898e-06, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 506.71429443359375, |
|
"epoch": 0.25158831003811943, |
|
"grad_norm": 0.06361155211925507, |
|
"kl": 0.2734375, |
|
"learning_rate": 9.915284233622877e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.4285888671875, |
|
"epoch": 0.25222363405336723, |
|
"grad_norm": 0.06987325847148895, |
|
"kl": 0.203125, |
|
"learning_rate": 9.824731176992796e-07, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 355.14288330078125, |
|
"epoch": 0.25285895806861497, |
|
"grad_norm": 0.1327361762523651, |
|
"kl": 0.30859375, |
|
"learning_rate": 9.734919739242543e-07, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 417.5714416503906, |
|
"epoch": 0.25349428208386277, |
|
"grad_norm": 3.1446924209594727, |
|
"kl": 0.333984375, |
|
"learning_rate": 9.645853688680177e-07, |
|
"loss": 0.0033, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 389.5714416503906, |
|
"epoch": 0.25412960609911056, |
|
"grad_norm": 0.048515211790800095, |
|
"kl": 0.1787109375, |
|
"learning_rate": 9.557536762338786e-07, |
|
"loss": 0.0021, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 399.14288330078125, |
|
"epoch": 0.2547649301143583, |
|
"grad_norm": 0.053919967263936996, |
|
"kl": 0.2109375, |
|
"learning_rate": 9.46997266581973e-07, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 408.2857360839844, |
|
"epoch": 0.2554002541296061, |
|
"grad_norm": 3.3761556148529053, |
|
"kl": 0.310546875, |
|
"learning_rate": 9.383165073137115e-07, |
|
"loss": 0.0031, |
|
"reward": 0.44395607709884644, |
|
"reward_std": 0.04111712798476219, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.04395604878664017, |
|
"rewards/format_reward": 1.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 423.8571472167969, |
|
"epoch": 0.2560355781448539, |
|
"grad_norm": 0.17396095395088196, |
|
"kl": 0.3515625, |
|
"learning_rate": 9.297117626563687e-07, |
|
"loss": 0.0038, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.7142944335938, |
|
"epoch": 0.25667090216010163, |
|
"grad_norm": 0.44405487179756165, |
|
"kl": 0.318359375, |
|
"learning_rate": 9.211833936477957e-07, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 416.0000305175781, |
|
"epoch": 0.25730622617534943, |
|
"grad_norm": 0.06627284735441208, |
|
"kl": 0.29296875, |
|
"learning_rate": 9.127317581212753e-07, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 381.0000305175781, |
|
"epoch": 0.2579415501905972, |
|
"grad_norm": 0.10693392902612686, |
|
"kl": 0.30078125, |
|
"learning_rate": 9.043572106905084e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.0000305175781, |
|
"epoch": 0.25857687420584496, |
|
"grad_norm": 0.04814046248793602, |
|
"kl": 0.26953125, |
|
"learning_rate": 8.960601027347321e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 390.4285888671875, |
|
"epoch": 0.25921219822109276, |
|
"grad_norm": 0.06620350480079651, |
|
"kl": 0.21484375, |
|
"learning_rate": 8.878407823839788e-07, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 444.4285888671875, |
|
"epoch": 0.25984752223634056, |
|
"grad_norm": 3.463074207305908, |
|
"kl": 0.376953125, |
|
"learning_rate": 8.796995945044689e-07, |
|
"loss": 0.0038, |
|
"reward": 0.4129870533943176, |
|
"reward_std": 0.034360405057668686, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.012987013906240463, |
|
"rewards/format_reward": 1.0, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 442.5714416503906, |
|
"epoch": 0.2604828462515883, |
|
"grad_norm": 2.6474926471710205, |
|
"kl": 0.287109375, |
|
"learning_rate": 8.716368806841405e-07, |
|
"loss": 0.0029, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 393.4285888671875, |
|
"epoch": 0.2611181702668361, |
|
"grad_norm": 3.3637120723724365, |
|
"kl": 0.203125, |
|
"learning_rate": 8.636529792183171e-07, |
|
"loss": 0.002, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 441.857177734375, |
|
"epoch": 0.2617534942820839, |
|
"grad_norm": 2.904768466949463, |
|
"kl": 0.2060546875, |
|
"learning_rate": 8.557482250955144e-07, |
|
"loss": 0.0021, |
|
"reward": 0.6987013220787048, |
|
"reward_std": 0.21078045666217804, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.29870128631591797, |
|
"rewards/format_reward": 1.0, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 399.14288330078125, |
|
"epoch": 0.2623888182973316, |
|
"grad_norm": 0.10355106741189957, |
|
"kl": 0.3359375, |
|
"learning_rate": 8.479229499833844e-07, |
|
"loss": 0.0037, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 456.0000305175781, |
|
"epoch": 0.2630241423125794, |
|
"grad_norm": 0.13930782675743103, |
|
"kl": 0.294921875, |
|
"learning_rate": 8.401774822147976e-07, |
|
"loss": 0.0032, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 443.4285888671875, |
|
"epoch": 0.2636594663278272, |
|
"grad_norm": 3.3897175788879395, |
|
"kl": 0.306640625, |
|
"learning_rate": 8.325121467740695e-07, |
|
"loss": 0.0031, |
|
"reward": 0.5224490165710449, |
|
"reward_std": 0.12853363156318665, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.12244899570941925, |
|
"rewards/format_reward": 1.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.1428833007812, |
|
"epoch": 0.26429479034307496, |
|
"grad_norm": 0.05424835905432701, |
|
"kl": 0.25390625, |
|
"learning_rate": 8.249272652833226e-07, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.14288330078125, |
|
"epoch": 0.26493011435832275, |
|
"grad_norm": 0.2863621115684509, |
|
"kl": 0.3203125, |
|
"learning_rate": 8.174231559889931e-07, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 486.5714416503906, |
|
"epoch": 0.2655654383735705, |
|
"grad_norm": 0.05610418692231178, |
|
"kl": 0.28515625, |
|
"learning_rate": 8.100001337484787e-07, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 466.2857360839844, |
|
"epoch": 0.2662007623888183, |
|
"grad_norm": 0.07580099999904633, |
|
"kl": 0.322265625, |
|
"learning_rate": 8.026585100169251e-07, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 478.71429443359375, |
|
"epoch": 0.2668360864040661, |
|
"grad_norm": 3.3928205966949463, |
|
"kl": 0.23046875, |
|
"learning_rate": 7.953985928341601e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4655141234397888, |
|
"reward_std": 0.08343012630939484, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.06551411002874374, |
|
"rewards/format_reward": 1.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 494.0000305175781, |
|
"epoch": 0.2674714104193138, |
|
"grad_norm": 0.04618528112769127, |
|
"kl": 0.1669921875, |
|
"learning_rate": 7.882206868117693e-07, |
|
"loss": 0.002, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 441.5714416503906, |
|
"epoch": 0.2681067344345616, |
|
"grad_norm": 2.9898011684417725, |
|
"kl": 0.28515625, |
|
"learning_rate": 7.81125093120313e-07, |
|
"loss": 0.0028, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 462.4285888671875, |
|
"epoch": 0.2687420584498094, |
|
"grad_norm": 0.056119803339242935, |
|
"kl": 0.3046875, |
|
"learning_rate": 7.741121094766916e-07, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 312.2857360839844, |
|
"epoch": 0.26937738246505716, |
|
"grad_norm": 3.943939447402954, |
|
"kl": 0.314453125, |
|
"learning_rate": 7.671820301316532e-07, |
|
"loss": 0.0031, |
|
"reward": 0.48671239614486694, |
|
"reward_std": 0.044492121785879135, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.08671241253614426, |
|
"rewards/format_reward": 1.0, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.4285888671875, |
|
"epoch": 0.27001270648030495, |
|
"grad_norm": 0.04282496124505997, |
|
"kl": 0.16015625, |
|
"learning_rate": 7.603351458574474e-07, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.2857360839844, |
|
"epoch": 0.27064803049555275, |
|
"grad_norm": 0.059300344437360764, |
|
"kl": 0.26953125, |
|
"learning_rate": 7.535717439356255e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.14288330078125, |
|
"epoch": 0.2712833545108005, |
|
"grad_norm": 2.4999890327453613, |
|
"kl": 0.14453125, |
|
"learning_rate": 7.46892108144986e-07, |
|
"loss": 0.0014, |
|
"reward": 0.7171429395675659, |
|
"reward_std": 0.4510992467403412, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.31714287400245667, |
|
"rewards/format_reward": 1.0, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 488.71429443359375, |
|
"epoch": 0.2719186785260483, |
|
"grad_norm": 0.052872247993946075, |
|
"kl": 0.279296875, |
|
"learning_rate": 7.402965187496697e-07, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 475.857177734375, |
|
"epoch": 0.2725540025412961, |
|
"grad_norm": 2.7971818447113037, |
|
"kl": 0.1826171875, |
|
"learning_rate": 7.337852524873974e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 647.1428833007812, |
|
"epoch": 0.2731893265565438, |
|
"grad_norm": 2.0566720962524414, |
|
"kl": 0.2255859375, |
|
"learning_rate": 7.273585825578608e-07, |
|
"loss": 0.0023, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 432.8571472167969, |
|
"epoch": 0.2738246505717916, |
|
"grad_norm": 2.9699957370758057, |
|
"kl": 0.27734375, |
|
"learning_rate": 7.21016778611259e-07, |
|
"loss": 0.0028, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.14288330078125, |
|
"epoch": 0.2744599745870394, |
|
"grad_norm": 0.10036282241344452, |
|
"kl": 0.31640625, |
|
"learning_rate": 7.147601067369835e-07, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 323.71429443359375, |
|
"epoch": 0.27509529860228715, |
|
"grad_norm": 3.8502118587493896, |
|
"kl": 0.275390625, |
|
"learning_rate": 7.085888294524561e-07, |
|
"loss": 0.0028, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.2857666015625, |
|
"epoch": 0.27573062261753495, |
|
"grad_norm": 0.05429592728614807, |
|
"kl": 0.23828125, |
|
"learning_rate": 7.025032056921117e-07, |
|
"loss": 0.0027, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 455.71429443359375, |
|
"epoch": 0.27636594663278274, |
|
"grad_norm": 3.2364959716796875, |
|
"kl": 0.29296875, |
|
"learning_rate": 6.965034907965349e-07, |
|
"loss": 0.0029, |
|
"reward": 0.5190476179122925, |
|
"reward_std": 0.09449110925197601, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1190476268529892, |
|
"rewards/format_reward": 1.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.0000305175781, |
|
"epoch": 0.2770012706480305, |
|
"grad_norm": 3.7509405612945557, |
|
"kl": 0.1748046875, |
|
"learning_rate": 6.905899365017462e-07, |
|
"loss": 0.0017, |
|
"reward": 0.4330357313156128, |
|
"reward_std": 0.0516289584338665, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.03303571790456772, |
|
"rewards/format_reward": 1.0, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 386.5714416503906, |
|
"epoch": 0.2776365946632783, |
|
"grad_norm": 0.05398930609226227, |
|
"kl": 0.314453125, |
|
"learning_rate": 6.847627909286409e-07, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 474.4285888671875, |
|
"epoch": 0.2782719186785261, |
|
"grad_norm": 2.960287094116211, |
|
"kl": 0.181640625, |
|
"learning_rate": 6.790222985725761e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.4285888671875, |
|
"epoch": 0.2789072426937738, |
|
"grad_norm": 3.587923288345337, |
|
"kl": 0.2470703125, |
|
"learning_rate": 6.733687002931141e-07, |
|
"loss": 0.0025, |
|
"reward": 0.4714285731315613, |
|
"reward_std": 0.18898221850395203, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0714285746216774, |
|
"rewards/format_reward": 1.0, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 425.5714416503906, |
|
"epoch": 0.2795425667090216, |
|
"grad_norm": 2.850022077560425, |
|
"kl": 0.1650390625, |
|
"learning_rate": 6.678022333039158e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 397.4285888671875, |
|
"epoch": 0.28017789072426935, |
|
"grad_norm": 0.045945875346660614, |
|
"kl": 0.265625, |
|
"learning_rate": 6.623231311627876e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.5714721679688, |
|
"epoch": 0.28081321473951715, |
|
"grad_norm": 0.06309759616851807, |
|
"kl": 0.265625, |
|
"learning_rate": 6.569316237618811e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 318.0, |
|
"epoch": 0.28144853875476494, |
|
"grad_norm": 3.4273765087127686, |
|
"kl": 0.234375, |
|
"learning_rate": 6.516279373180499e-07, |
|
"loss": 0.0023, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 551.4285888671875, |
|
"epoch": 0.2820838627700127, |
|
"grad_norm": 0.059127844870090485, |
|
"kl": 0.2490234375, |
|
"learning_rate": 6.464122943633543e-07, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 388.71429443359375, |
|
"epoch": 0.2827191867852605, |
|
"grad_norm": 0.058766093105077744, |
|
"kl": 0.302734375, |
|
"learning_rate": 6.412849137357271e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.2857360839844, |
|
"epoch": 0.2833545108005083, |
|
"grad_norm": 0.05756891146302223, |
|
"kl": 0.275390625, |
|
"learning_rate": 6.3624601056979e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 397.4285888671875, |
|
"epoch": 0.283989834815756, |
|
"grad_norm": 3.6736083030700684, |
|
"kl": 0.30859375, |
|
"learning_rate": 6.312957962878278e-07, |
|
"loss": 0.0031, |
|
"reward": 0.4039139151573181, |
|
"reward_std": 0.01035518478602171, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.003913894295692444, |
|
"rewards/format_reward": 1.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 528.857177734375, |
|
"epoch": 0.2846251588310038, |
|
"grad_norm": 0.047244369983673096, |
|
"kl": 0.259765625, |
|
"learning_rate": 6.264344785909181e-07, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 432.5714416503906, |
|
"epoch": 0.2852604828462516, |
|
"grad_norm": 0.06841956079006195, |
|
"kl": 0.298828125, |
|
"learning_rate": 6.216622614502149e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 538.1428833007812, |
|
"epoch": 0.28589580686149935, |
|
"grad_norm": 0.052464455366134644, |
|
"kl": 0.25, |
|
"learning_rate": 6.169793450983916e-07, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 490.71429443359375, |
|
"epoch": 0.28653113087674714, |
|
"grad_norm": 0.04570423439145088, |
|
"kl": 0.2470703125, |
|
"learning_rate": 6.123859260212393e-07, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 400.14288330078125, |
|
"epoch": 0.28716645489199494, |
|
"grad_norm": 0.05332702025771141, |
|
"kl": 0.306640625, |
|
"learning_rate": 6.07882196949423e-07, |
|
"loss": 0.0034, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 338.8571472167969, |
|
"epoch": 0.2878017789072427, |
|
"grad_norm": 3.625446081161499, |
|
"kl": 0.1982421875, |
|
"learning_rate": 6.034683468503948e-07, |
|
"loss": 0.002, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 361.14288330078125, |
|
"epoch": 0.2884371029224905, |
|
"grad_norm": 3.3959381580352783, |
|
"kl": 0.375, |
|
"learning_rate": 5.991445609204641e-07, |
|
"loss": 0.0037, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 404.2857360839844, |
|
"epoch": 0.28907242693773827, |
|
"grad_norm": 0.06705067306756973, |
|
"kl": 0.27734375, |
|
"learning_rate": 5.949110205770292e-07, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 368.14288330078125, |
|
"epoch": 0.289707750952986, |
|
"grad_norm": 0.0792422741651535, |
|
"kl": 0.30078125, |
|
"learning_rate": 5.90767903450964e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 356.0000305175781, |
|
"epoch": 0.2903430749682338, |
|
"grad_norm": 3.4945454597473145, |
|
"kl": 0.1982421875, |
|
"learning_rate": 5.867153833791652e-07, |
|
"loss": 0.002, |
|
"reward": 0.4571428596973419, |
|
"reward_std": 0.053452249616384506, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.05714286118745804, |
|
"rewards/format_reward": 1.0, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 371.5714416503906, |
|
"epoch": 0.2909783989834816, |
|
"grad_norm": 3.341874837875366, |
|
"kl": 0.3203125, |
|
"learning_rate": 5.827536303972587e-07, |
|
"loss": 0.0032, |
|
"reward": 0.44761908054351807, |
|
"reward_std": 0.12598817050457, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0476190522313118, |
|
"rewards/format_reward": 1.0, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 395.8571472167969, |
|
"epoch": 0.29161372299872934, |
|
"grad_norm": 0.0777309313416481, |
|
"kl": 0.30078125, |
|
"learning_rate": 5.78882810732465e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 461.857177734375, |
|
"epoch": 0.29224904701397714, |
|
"grad_norm": 0.0932033360004425, |
|
"kl": 0.3203125, |
|
"learning_rate": 5.75103086796625e-07, |
|
"loss": 0.0035, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.4285888671875, |
|
"epoch": 0.2928843710292249, |
|
"grad_norm": 3.0678720474243164, |
|
"kl": 0.1650390625, |
|
"learning_rate": 5.714146171793846e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.71429443359375, |
|
"epoch": 0.2935196950444727, |
|
"grad_norm": 0.08228779584169388, |
|
"kl": 0.216796875, |
|
"learning_rate": 5.678175566415422e-07, |
|
"loss": 0.0025, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 443.71429443359375, |
|
"epoch": 0.29415501905972047, |
|
"grad_norm": 2.509739398956299, |
|
"kl": 0.349609375, |
|
"learning_rate": 5.643120561085528e-07, |
|
"loss": 0.0035, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.857177734375, |
|
"epoch": 0.2947903430749682, |
|
"grad_norm": 0.0598638616502285, |
|
"kl": 0.26171875, |
|
"learning_rate": 5.608982626641991e-07, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.0000305175781, |
|
"epoch": 0.295425667090216, |
|
"grad_norm": 0.04956334829330444, |
|
"kl": 0.2109375, |
|
"learning_rate": 5.575763195444166e-07, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.1428833007812, |
|
"epoch": 0.2960609911054638, |
|
"grad_norm": 3.209406614303589, |
|
"kl": 0.275390625, |
|
"learning_rate": 5.543463661312847e-07, |
|
"loss": 0.0027, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.857177734375, |
|
"epoch": 0.29669631512071154, |
|
"grad_norm": 0.05284934490919113, |
|
"kl": 0.26171875, |
|
"learning_rate": 5.512085379471808e-07, |
|
"loss": 0.0029, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 445.2857360839844, |
|
"epoch": 0.29733163913595934, |
|
"grad_norm": 3.3668439388275146, |
|
"kl": 0.29296875, |
|
"learning_rate": 5.481629666490903e-07, |
|
"loss": 0.0029, |
|
"reward": 0.485714316368103, |
|
"reward_std": 0.08997353911399841, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.08571429550647736, |
|
"rewards/format_reward": 1.0, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.0, |
|
"epoch": 0.29796696315120713, |
|
"grad_norm": 2.8946890830993652, |
|
"kl": 0.2216796875, |
|
"learning_rate": 5.452097800230853e-07, |
|
"loss": 0.0022, |
|
"reward": 0.3428571820259094, |
|
"reward_std": 0.1511857956647873, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 425.4285888671875, |
|
"epoch": 0.29860228716645487, |
|
"grad_norm": 0.04910692945122719, |
|
"kl": 0.1669921875, |
|
"learning_rate": 5.423491019789623e-07, |
|
"loss": 0.002, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 238.2857208251953, |
|
"epoch": 0.29923761118170267, |
|
"grad_norm": 0.06361490488052368, |
|
"kl": 0.208984375, |
|
"learning_rate": 5.395810525450425e-07, |
|
"loss": 0.0033, |
|
"reward": 1.4000002145767212, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 1.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 395.8571472167969, |
|
"epoch": 0.29987293519695046, |
|
"grad_norm": 0.04494641348719597, |
|
"kl": 0.2041015625, |
|
"learning_rate": 5.369057478631359e-07, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.0000305175781, |
|
"epoch": 0.3005082592121982, |
|
"grad_norm": 3.899231195449829, |
|
"kl": 0.2099609375, |
|
"learning_rate": 5.343233001836694e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.1428833007812, |
|
"epoch": 0.301143583227446, |
|
"grad_norm": 2.999528646469116, |
|
"kl": 0.1689453125, |
|
"learning_rate": 5.318338178609754e-07, |
|
"loss": 0.0017, |
|
"reward": 0.42418450117111206, |
|
"reward_std": 0.06398611515760422, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.02418447658419609, |
|
"rewards/format_reward": 1.0, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 505.2857360839844, |
|
"epoch": 0.3017789072426938, |
|
"grad_norm": 0.056534409523010254, |
|
"kl": 0.2138671875, |
|
"learning_rate": 5.294374053487459e-07, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 376.71429443359375, |
|
"epoch": 0.30241423125794153, |
|
"grad_norm": 3.721620798110962, |
|
"kl": 0.212890625, |
|
"learning_rate": 5.271341631956511e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5836734771728516, |
|
"reward_std": 0.3662114441394806, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.18367348611354828, |
|
"rewards/format_reward": 1.0, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.14288330078125, |
|
"epoch": 0.30304955527318933, |
|
"grad_norm": 0.05311813950538635, |
|
"kl": 0.251953125, |
|
"learning_rate": 5.249241880411181e-07, |
|
"loss": 0.0028, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 510.0000305175781, |
|
"epoch": 0.3036848792884371, |
|
"grad_norm": 3.715238332748413, |
|
"kl": 0.240234375, |
|
"learning_rate": 5.228075726112785e-07, |
|
"loss": 0.0024, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 455.0000305175781, |
|
"epoch": 0.30432020330368487, |
|
"grad_norm": 3.2647809982299805, |
|
"kl": 0.1640625, |
|
"learning_rate": 5.207844057150768e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.37796446681022644, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.71429443359375, |
|
"epoch": 0.30495552731893266, |
|
"grad_norm": 0.06345637142658234, |
|
"kl": 0.265625, |
|
"learning_rate": 5.188547722405437e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.14288330078125, |
|
"epoch": 0.30559085133418046, |
|
"grad_norm": 0.06628384441137314, |
|
"kl": 0.26953125, |
|
"learning_rate": 5.170187531512351e-07, |
|
"loss": 0.003, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.14288330078125, |
|
"epoch": 0.3062261753494282, |
|
"grad_norm": 2.7409112453460693, |
|
"kl": 0.1591796875, |
|
"learning_rate": 5.152764254828348e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645562171936, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.1428833007812, |
|
"epoch": 0.306861499364676, |
|
"grad_norm": 0.041587937623262405, |
|
"kl": 0.140625, |
|
"learning_rate": 5.136278623399225e-07, |
|
"loss": 0.0017, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 375.0000305175781, |
|
"epoch": 0.30749682337992373, |
|
"grad_norm": 2.9096410274505615, |
|
"kl": 0.2001953125, |
|
"learning_rate": 5.120731328929058e-07, |
|
"loss": 0.002, |
|
"reward": 0.971428632736206, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.5714285969734192, |
|
"rewards/format_reward": 1.0, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.0000305175781, |
|
"epoch": 0.30813214739517153, |
|
"grad_norm": 3.31744122505188, |
|
"kl": 0.21484375, |
|
"learning_rate": 5.106123023751187e-07, |
|
"loss": 0.0021, |
|
"reward": 0.8511278033256531, |
|
"reward_std": 0.23666103184223175, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.451127827167511, |
|
"rewards/format_reward": 1.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 416.4285888671875, |
|
"epoch": 0.3087674714104193, |
|
"grad_norm": 2.8466956615448, |
|
"kl": 0.19140625, |
|
"learning_rate": 5.092454320800833e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4338059425354004, |
|
"reward_std": 0.06710861623287201, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.03380589187145233, |
|
"rewards/format_reward": 1.0, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 466.4285888671875, |
|
"epoch": 0.30940279542566707, |
|
"grad_norm": 5.7825751304626465, |
|
"kl": 0.65234375, |
|
"learning_rate": 5.079725793589405e-07, |
|
"loss": 0.0068, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 358.4285888671875, |
|
"epoch": 0.31003811944091486, |
|
"grad_norm": 0.06072893738746643, |
|
"kl": 0.1953125, |
|
"learning_rate": 5.067937976180407e-07, |
|
"loss": 0.0022, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 504.2857360839844, |
|
"epoch": 0.31067344345616266, |
|
"grad_norm": 0.03951825201511383, |
|
"kl": 0.1630859375, |
|
"learning_rate": 5.057091363167046e-07, |
|
"loss": 0.0019, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 235.1428680419922, |
|
"epoch": 0.3113087674714104, |
|
"grad_norm": 3.270069122314453, |
|
"kl": 0.224609375, |
|
"learning_rate": 5.047186409651489e-07, |
|
"loss": 0.0022, |
|
"reward": 1.3428572416305542, |
|
"reward_std": 0.15118582546710968, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 1.0, |
|
"rewards/format_reward": 0.8571429252624512, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 509.0000305175781, |
|
"epoch": 0.3119440914866582, |
|
"grad_norm": 0.1327732652425766, |
|
"kl": 0.298828125, |
|
"learning_rate": 5.038223531225742e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.2857360839844, |
|
"epoch": 0.312579415501906, |
|
"grad_norm": 3.286302328109741, |
|
"kl": 0.2353515625, |
|
"learning_rate": 5.030203103954232e-07, |
|
"loss": 0.0024, |
|
"reward": 0.37142860889434814, |
|
"reward_std": 0.07559289783239365, |
|
"rewards/code_format_reward": 0.8571429252624512, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 433.71429443359375, |
|
"epoch": 0.31321473951715373, |
|
"grad_norm": 0.04749950394034386, |
|
"kl": 0.173828125, |
|
"learning_rate": 5.023125464358026e-07, |
|
"loss": 0.002, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 402.2857360839844, |
|
"epoch": 0.3138500635324015, |
|
"grad_norm": 3.375169515609741, |
|
"kl": 0.26171875, |
|
"learning_rate": 5.016990909400709e-07, |
|
"loss": 0.0026, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 491.857177734375, |
|
"epoch": 0.3144853875476493, |
|
"grad_norm": 0.05139714851975441, |
|
"kl": 0.296875, |
|
"learning_rate": 5.011799696475915e-07, |
|
"loss": 0.0033, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 323.5714416503906, |
|
"epoch": 0.31512071156289706, |
|
"grad_norm": 3.7574667930603027, |
|
"kl": 0.181640625, |
|
"learning_rate": 5.007552043396547e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5428571701049805, |
|
"reward_std": 0.3779645264148712, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.1428571492433548, |
|
"rewards/format_reward": 1.0, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.857177734375, |
|
"epoch": 0.31575603557814486, |
|
"grad_norm": 2.853102445602417, |
|
"kl": 0.1806640625, |
|
"learning_rate": 5.004248128385618e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6857143640518188, |
|
"reward_std": 0.4879501163959503, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.2857142984867096, |
|
"rewards/format_reward": 1.0, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.8571472167969, |
|
"epoch": 0.31639135959339265, |
|
"grad_norm": 3.089301824569702, |
|
"kl": 0.1923828125, |
|
"learning_rate": 5.001888090068784e-07, |
|
"loss": 0.0019, |
|
"reward": 0.8285714983940125, |
|
"reward_std": 0.534522533416748, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.4285714626312256, |
|
"rewards/format_reward": 1.0, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 451.2857360839844, |
|
"epoch": 0.3170266836086404, |
|
"grad_norm": 5.161578178405762, |
|
"kl": 0.57421875, |
|
"learning_rate": 5.000472027468528e-07, |
|
"loss": 0.0057, |
|
"reward": 0.2857142984867096, |
|
"reward_std": 0.1951800137758255, |
|
"rewards/code_format_reward": 0.7142857313156128, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 611.4285888671875, |
|
"epoch": 0.3176620076238882, |
|
"grad_norm": 0.048863135278224945, |
|
"kl": 0.2060546875, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.0024, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 444.857177734375, |
|
"epoch": 0.318297331639136, |
|
"grad_norm": 0.047799259424209595, |
|
"kl": 0.19921875, |
|
"learning_rate": 5.000472027468528e-07, |
|
"loss": 0.0023, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 501 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.5714416503906, |
|
"epoch": 0.3189326556543837, |
|
"grad_norm": 0.15391913056373596, |
|
"kl": 0.283203125, |
|
"learning_rate": 5.001888090068784e-07, |
|
"loss": 0.0031, |
|
"reward": 0.40000003576278687, |
|
"reward_std": 0.0, |
|
"rewards/code_format_reward": 1.0, |
|
"rewards/code_reward": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.3189326556543837, |
|
"step": 502, |
|
"total_flos": 0.0, |
|
"train_loss": 6.229176085843033e-06, |
|
"train_runtime": 154.2548, |
|
"train_samples_per_second": 22.69, |
|
"train_steps_per_second": 3.241 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|