{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3189326556543837, "eval_steps": 500, "global_step": 502, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 0.0006353240152477764, "grad_norm": 3.4243216514587402, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 337.14288330078125, "epoch": 0.0012706480304955528, "grad_norm": 5.606422424316406, "kl": 0.0, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": 0.3428571820259094, "reward_std": 0.09759000688791275, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 517.2857666015625, "epoch": 0.0019059720457433292, "grad_norm": 3.686295986175537, "kl": 0.0004425048828125, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.26031747460365295, "reward_std": 0.19519509375095367, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0317460335791111, "rewards/format_reward": 0.5714285969734192, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 638.4285888671875, "epoch": 0.0025412960609911056, "grad_norm": 3.40989089012146, "kl": 0.0004863739013671875, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.1714285910129547, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.4285714626312256, "rewards/code_reward": 0.0, "rewards/format_reward": 0.4285714626312256, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 376.4285888671875, "epoch": 0.0031766200762388818, "grad_norm": 4.474236011505127, "kl": 0.0008697509765625, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 0.40000003576278687, "reward_std": 0.40000003576278687, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.7142857313156128, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 484.71429443359375, "epoch": 0.0038119440914866584, "grad_norm": 4.529554843902588, "kl": 0.0179443359375, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 446.0000305175781, "epoch": 0.0044472681067344345, "grad_norm": 3.3631019592285156, "kl": 0.00128936767578125, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "reward": 0.8025974631309509, "reward_std": 0.4480050206184387, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.40259745717048645, "rewards/format_reward": 1.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 765.0000610351562, "epoch": 0.005082592121982211, "grad_norm": 3.205476999282837, "kl": 0.000675201416015625, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "reward": 0.1714285910129547, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.4285714626312256, "rewards/code_reward": 0.0, "rewards/format_reward": 0.4285714626312256, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 628.5714721679688, "epoch": 0.005717916137229987, "grad_norm": 0.008113108575344086, "kl": 0.001678466796875, "learning_rate": 3e-06, "loss": 0.0003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 934.1428833007812, "epoch": 0.0063532401524777635, "grad_norm": 2.805612087249756, "kl": 0.00390625, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 733.4285888671875, "epoch": 0.00698856416772554, "grad_norm": 0.014334071427583694, "kl": 0.0016326904296875, "learning_rate": 3.6666666666666666e-06, "loss": 0.0003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 545.5714721679688, "epoch": 0.007623888182973317, "grad_norm": 3.0349507331848145, "kl": 0.003936767578125, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 539.7142944335938, "epoch": 0.008259212198221092, "grad_norm": 2.7579803466796875, "kl": 0.00787353515625, "learning_rate": 4.333333333333334e-06, "loss": 0.0001, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 502.5714416503906, "epoch": 0.008894536213468869, "grad_norm": 0.06331096589565277, "kl": 0.0101318359375, "learning_rate": 4.666666666666667e-06, "loss": 0.0004, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 583.5714721679688, "epoch": 0.009529860228716646, "grad_norm": 1.8358005285263062, "kl": 0.0576171875, "learning_rate": 5e-06, "loss": 0.0009, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 528.4285888671875, "epoch": 0.010165184243964422, "grad_norm": 5.376899242401123, "kl": 0.173828125, "learning_rate": 4.999952797253148e-06, "loss": 0.0017, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 390.5714416503906, "epoch": 0.010800508259212199, "grad_norm": 5.1895365715026855, "kl": 0.1767578125, "learning_rate": 4.9998111909931225e-06, "loss": 0.0018, "reward": 0.5571428537368774, "reward_std": 0.12051477283239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.2142857313156128, "rewards/format_reward": 0.8571429252624512, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 449.5714416503906, "epoch": 0.011435832274459974, "grad_norm": 3.5275230407714844, "kl": 0.05126953125, "learning_rate": 4.999575187161439e-06, "loss": 0.0005, "reward": 0.6000000238418579, "reward_std": 0.432049423456192, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 0.8571429252624512, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 523.1428833007812, "epoch": 0.01207115628970775, "grad_norm": 3.147331953048706, "kl": 0.04541015625, "learning_rate": 4.9992447956603455e-06, "loss": 0.0005, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 330.14288330078125, "epoch": 0.012706480304955527, "grad_norm": 108.5281753540039, "kl": 3.046875, "learning_rate": 4.998820030352409e-06, "loss": 0.0305, "reward": 0.6884711980819702, "reward_std": 0.4107622504234314, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.31704264879226685, "rewards/format_reward": 1.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 359.0000305175781, "epoch": 0.013341804320203304, "grad_norm": 5.423805236816406, "kl": 0.1591796875, "learning_rate": 4.998300909059929e-06, "loss": 0.0016, "reward": 0.5269841551780701, "reward_std": 0.1777612417936325, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1269841343164444, "rewards/format_reward": 1.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 491.2857360839844, "epoch": 0.01397712833545108, "grad_norm": 1545.3717041015625, "kl": 23.0, "learning_rate": 4.997687453564198e-06, "loss": 0.2309, "reward": 0.5428571701049805, "reward_std": 0.6078847646713257, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 0.7142857313156128, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 503.5714416503906, "epoch": 0.014612452350698857, "grad_norm": 8.096309661865234, "kl": 0.69140625, "learning_rate": 4.9969796896045775e-06, "loss": 0.0069, "reward": 0.3142857253551483, "reward_std": 0.10690449923276901, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 604.1428833007812, "epoch": 0.015247776365946633, "grad_norm": 45758.625, "kl": 608.0, "learning_rate": 4.996177646877426e-06, "loss": 6.0704, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 605.1428833007812, "epoch": 0.01588310038119441, "grad_norm": 3.5074126720428467, "kl": 0.1083984375, "learning_rate": 4.995281359034851e-06, "loss": 0.0011, "reward": 0.05714286118745804, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.1428571492433548, "rewards/code_reward": 0.0, "rewards/format_reward": 0.1428571492433548, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 476.2857360839844, "epoch": 0.016518424396442185, "grad_norm": 9.928363800048828, "kl": 0.421875, "learning_rate": 4.994290863683296e-06, "loss": 0.0042, "reward": 0.11428572237491608, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.2857142984867096, "rewards/code_reward": 0.0, "rewards/format_reward": 0.2857142984867096, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 420.8571472167969, "epoch": 0.017153748411689963, "grad_norm": 30.34365463256836, "kl": 1.2890625, "learning_rate": 4.99320620238196e-06, "loss": 0.0129, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 618.7142944335938, "epoch": 0.017789072426937738, "grad_norm": 3.0826454162597656, "kl": 0.051513671875, "learning_rate": 4.99202742064106e-06, "loss": 0.0005, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 582.5714721679688, "epoch": 0.018424396442185513, "grad_norm": 3.330746650695801, "kl": 0.07666015625, "learning_rate": 4.990754567919917e-06, "loss": 0.0008, "reward": 0.4571428894996643, "reward_std": 0.35989415645599365, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.1714285910129547, "rewards/format_reward": 0.7142857313156128, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 440.2857360839844, "epoch": 0.01905972045743329, "grad_norm": 3.658856153488159, "kl": 0.1318359375, "learning_rate": 4.989387697624881e-06, "loss": 0.0013, "reward": 0.3086913228034973, "reward_std": 0.20198491215705872, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.02297702245414257, "rewards/format_reward": 0.7142857313156128, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 764.0000610351562, "epoch": 0.019695044472681066, "grad_norm": 2.7840328216552734, "kl": 0.078125, "learning_rate": 4.987926867107095e-06, "loss": 0.0008, "reward": 0.31462588906288147, "reward_std": 0.19902175664901733, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.028911566361784935, "rewards/format_reward": 0.7142857313156128, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 573.4285888671875, "epoch": 0.020330368487928845, "grad_norm": 3.031372308731079, "kl": 0.078125, "learning_rate": 4.986372137660078e-06, "loss": 0.0008, "reward": 0.485714316368103, "reward_std": 0.22677868604660034, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 460.14288330078125, "epoch": 0.02096569250317662, "grad_norm": 3.5374386310577393, "kl": 0.107421875, "learning_rate": 4.984723574517165e-06, "loss": 0.0011, "reward": 0.3362637758255005, "reward_std": 0.17483238875865936, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.021978024393320084, "rewards/format_reward": 0.8571429252624512, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 510.71429443359375, "epoch": 0.021601016518424398, "grad_norm": 3.355870246887207, "kl": 0.20703125, "learning_rate": 4.9829812468487655e-06, "loss": 0.0021, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 617.7142944335938, "epoch": 0.022236340533672173, "grad_norm": 2.325124979019165, "kl": 0.0849609375, "learning_rate": 4.981145227759457e-06, "loss": 0.0008, "reward": 0.5142857432365417, "reward_std": 0.39761197566986084, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 329.4285888671875, "epoch": 0.022871664548919948, "grad_norm": 4.9638495445251465, "kl": 0.224609375, "learning_rate": 4.979215594284924e-06, "loss": 0.0022, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 734.5714721679688, "epoch": 0.023506988564167726, "grad_norm": 0.06015148386359215, "kl": 0.08154296875, "learning_rate": 4.977192427388722e-06, "loss": 0.0011, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 481.5714416503906, "epoch": 0.0241423125794155, "grad_norm": 0.20115888118743896, "kl": 0.12109375, "learning_rate": 4.9750758119588824e-06, "loss": 0.0015, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 817.0000610351562, "epoch": 0.02477763659466328, "grad_norm": 2.3490183353424072, "kl": 0.0888671875, "learning_rate": 4.972865836804349e-06, "loss": 0.0009, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 622.1428833007812, "epoch": 0.025412960609911054, "grad_norm": 40.009159088134766, "kl": 2.34375, "learning_rate": 4.970562594651254e-06, "loss": 0.0234, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 498.71429443359375, "epoch": 0.026048284625158832, "grad_norm": 3.2044177055358887, "kl": 0.12109375, "learning_rate": 4.968166182139026e-06, "loss": 0.0012, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 480.4285888671875, "epoch": 0.026683608640406607, "grad_norm": 0.08217895030975342, "kl": 0.1484375, "learning_rate": 4.9656766998163306e-06, "loss": 0.0018, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 699.5714721679688, "epoch": 0.027318932655654382, "grad_norm": 3.2091763019561768, "kl": 0.0966796875, "learning_rate": 4.963094252136865e-06, "loss": 0.001, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 595.857177734375, "epoch": 0.02795425667090216, "grad_norm": 0.07719128578901291, "kl": 0.115234375, "learning_rate": 4.960418947454958e-06, "loss": 0.0014, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 598.0, "epoch": 0.028589580686149935, "grad_norm": 0.10212292522192001, "kl": 0.1298828125, "learning_rate": 4.957650898021038e-06, "loss": 0.0016, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 517.857177734375, "epoch": 0.029224904701397714, "grad_norm": 2.6977856159210205, "kl": 0.1337890625, "learning_rate": 4.954790219976915e-06, "loss": 0.0013, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 699.5714721679688, "epoch": 0.02986022871664549, "grad_norm": 0.09081219881772995, "kl": 0.10888671875, "learning_rate": 4.95183703335091e-06, "loss": 0.0014, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 375.5714416503906, "epoch": 0.030495552731893267, "grad_norm": 3.1593081951141357, "kl": 0.162109375, "learning_rate": 4.948791462052819e-06, "loss": 0.0016, "reward": 0.9428572654724121, "reward_std": 0.5740416646003723, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 605.5714721679688, "epoch": 0.031130876747141042, "grad_norm": 0.2238318771123886, "kl": 0.1396484375, "learning_rate": 4.945653633868716e-06, "loss": 0.0017, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 519.0, "epoch": 0.03176620076238882, "grad_norm": 3.9522361755371094, "kl": 0.1796875, "learning_rate": 4.942423680455584e-06, "loss": 0.0018, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 577.0, "epoch": 0.032401524777636595, "grad_norm": 3.095000743865967, "kl": 0.15234375, "learning_rate": 4.939101737335802e-06, "loss": 0.0015, "reward": 0.11428572237491608, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.2857142984867096, "rewards/code_reward": 0.0, "rewards/format_reward": 0.2857142984867096, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 651.2857666015625, "epoch": 0.03303684879288437, "grad_norm": 2.880178451538086, "kl": 0.14453125, "learning_rate": 4.935687943891447e-06, "loss": 0.0015, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 570.4285888671875, "epoch": 0.033672172808132145, "grad_norm": 6.103281021118164, "kl": 0.421875, "learning_rate": 4.932182443358458e-06, "loss": 0.0042, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 461.857177734375, "epoch": 0.03430749682337993, "grad_norm": 5.561570644378662, "kl": 0.283203125, "learning_rate": 4.928585382820616e-06, "loss": 0.0028, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 710.0000610351562, "epoch": 0.0349428208386277, "grad_norm": 0.06716505438089371, "kl": 0.1875, "learning_rate": 4.924896913203376e-06, "loss": 0.0022, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 725.5714721679688, "epoch": 0.035578144853875476, "grad_norm": 4.731691837310791, "kl": 0.130859375, "learning_rate": 4.921117189267535e-06, "loss": 0.0013, "reward": 0.3588235676288605, "reward_std": 0.43374723196029663, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.13025210797786713, "rewards/format_reward": 0.5714285969734192, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 665.2857666015625, "epoch": 0.03621346886912325, "grad_norm": 2.7444961071014404, "kl": 0.18359375, "learning_rate": 4.917246369602742e-06, "loss": 0.0018, "reward": 0.4285714626312256, "reward_std": 0.4680252969264984, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.7142857313156128, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 648.0, "epoch": 0.036848792884371026, "grad_norm": 3.6189987659454346, "kl": 0.20703125, "learning_rate": 4.9132846166208355e-06, "loss": 0.0021, "reward": 0.5142857432365417, "reward_std": 0.6309479475021362, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 0.5714285969734192, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 483.14288330078125, "epoch": 0.03748411689961881, "grad_norm": 0.15579625964164734, "kl": 0.26953125, "learning_rate": 4.9092320965490365e-06, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 599.857177734375, "epoch": 0.03811944091486658, "grad_norm": 3.2243692874908447, "kl": 0.154296875, "learning_rate": 4.905088979422971e-06, "loss": 0.0015, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 446.0000305175781, "epoch": 0.03875476493011436, "grad_norm": 3.187040090560913, "kl": 0.2392578125, "learning_rate": 4.900855439079536e-06, "loss": 0.0024, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 619.7142944335938, "epoch": 0.03939008894536213, "grad_norm": 3.501086473464966, "kl": 0.205078125, "learning_rate": 4.8965316531496055e-06, "loss": 0.0021, "reward": 0.3428571820259094, "reward_std": 0.09759001433849335, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 598.0, "epoch": 0.040025412960609914, "grad_norm": 4.5344557762146, "kl": 0.1484375, "learning_rate": 4.892117803050578e-06, "loss": 0.0015, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 507.0000305175781, "epoch": 0.04066073697585769, "grad_norm": 6.242559432983398, "kl": 0.392578125, "learning_rate": 4.887614073978761e-06, "loss": 0.0039, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 757.1428833007812, "epoch": 0.041296060991105464, "grad_norm": 10.64748764038086, "kl": 0.53515625, "learning_rate": 4.883020654901609e-06, "loss": 0.0054, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 585.4285888671875, "epoch": 0.04193138500635324, "grad_norm": 3.1377880573272705, "kl": 0.2119140625, "learning_rate": 4.878337738549785e-06, "loss": 0.0021, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 604.2857666015625, "epoch": 0.042566709021601014, "grad_norm": 0.17088104784488678, "kl": 0.2490234375, "learning_rate": 4.873565521409082e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 842.5714721679688, "epoch": 0.043202033036848796, "grad_norm": 3.577969551086426, "kl": 0.384765625, "learning_rate": 4.868704203712173e-06, "loss": 0.0038, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 841.0000610351562, "epoch": 0.04383735705209657, "grad_norm": 3.18711256980896, "kl": 0.2080078125, "learning_rate": 4.86375398943021e-06, "loss": 0.0021, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 525.4285888671875, "epoch": 0.044472681067344345, "grad_norm": 2.3753271102905273, "kl": 0.177734375, "learning_rate": 4.858715086264274e-06, "loss": 0.0018, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 406.8571472167969, "epoch": 0.04510800508259212, "grad_norm": 3.5183515548706055, "kl": 0.2138671875, "learning_rate": 4.853587705636646e-06, "loss": 0.0021, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 630.5714721679688, "epoch": 0.045743329097839895, "grad_norm": 2.6844465732574463, "kl": 0.103515625, "learning_rate": 4.84837206268195e-06, "loss": 0.001, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 464.14288330078125, "epoch": 0.04637865311308768, "grad_norm": 3.63313627243042, "kl": 0.1591796875, "learning_rate": 4.8430683762381195e-06, "loss": 0.0016, "reward": 0.2571428716182709, "reward_std": 0.19023796916007996, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 507.2857360839844, "epoch": 0.04701397712833545, "grad_norm": 2.707540988922119, "kl": 0.111328125, "learning_rate": 4.837676868837213e-06, "loss": 0.0011, "reward": 0.4023166298866272, "reward_std": 0.004252949263900518, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0023166025057435036, "rewards/format_reward": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 558.857177734375, "epoch": 0.04764930114358323, "grad_norm": 3.0911412239074707, "kl": 0.1474609375, "learning_rate": 4.832197766696085e-06, "loss": 0.0015, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 460.4285888671875, "epoch": 0.048284625158831, "grad_norm": 3.5635647773742676, "kl": 0.1357421875, "learning_rate": 4.826631299706887e-06, "loss": 0.0014, "reward": 0.5032258033752441, "reward_std": 0.2984292209148407, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.18894009292125702, "rewards/format_reward": 0.8571429252624512, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 382.2857360839844, "epoch": 0.04891994917407878, "grad_norm": 0.05187558755278587, "kl": 0.1611328125, "learning_rate": 4.820977701427424e-06, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 466.14288330078125, "epoch": 0.04955527318932656, "grad_norm": 0.04543861746788025, "kl": 0.130859375, "learning_rate": 4.81523720907136e-06, "loss": 0.0016, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 551.1428833007812, "epoch": 0.05019059720457433, "grad_norm": 0.45131492614746094, "kl": 0.28515625, "learning_rate": 4.809410063498254e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 379.5714416503906, "epoch": 0.05082592121982211, "grad_norm": 3.854992151260376, "kl": 0.1591796875, "learning_rate": 4.8034965092034656e-06, "loss": 0.0016, "reward": 1.2000001668930054, "reward_std": 0.3829708993434906, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.8571429252624512, "rewards/format_reward": 0.8571429252624512, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 566.857177734375, "epoch": 0.05146124523506988, "grad_norm": 41.30952453613281, "kl": 1.9921875, "learning_rate": 4.797496794307889e-06, "loss": 0.0199, "reward": 0.2489795982837677, "reward_std": 0.23831285536289215, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.020408164709806442, "rewards/format_reward": 0.5714285969734192, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 448.71429443359375, "epoch": 0.052096569250317665, "grad_norm": 4.601639270782471, "kl": 0.2412109375, "learning_rate": 4.791411170547545e-06, "loss": 0.0024, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 614.5714721679688, "epoch": 0.05273189326556544, "grad_norm": 4.209871768951416, "kl": 0.396484375, "learning_rate": 4.785239893263017e-06, "loss": 0.004, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 410.4285888671875, "epoch": 0.053367217280813214, "grad_norm": 4.049778938293457, "kl": 0.171875, "learning_rate": 4.778983221388742e-06, "loss": 0.0017, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 421.71429443359375, "epoch": 0.05400254129606099, "grad_norm": 5.517159461975098, "kl": 0.318359375, "learning_rate": 4.77264141744214e-06, "loss": 0.0032, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 405.8571472167969, "epoch": 0.054637865311308764, "grad_norm": 1.0127395391464233, "kl": 0.37109375, "learning_rate": 4.766214747512603e-06, "loss": 0.004, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 502.2857360839844, "epoch": 0.055273189326556546, "grad_norm": 5.1016316413879395, "kl": 0.625, "learning_rate": 4.759703481250331e-06, "loss": 0.0062, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 415.5714416503906, "epoch": 0.05590851334180432, "grad_norm": 3205.96337890625, "kl": 94.0, "learning_rate": 4.753107891855015e-06, "loss": 0.9386, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 162.0, "epoch": 0.056543837357052096, "grad_norm": 5.175131797790527, "kl": 0.349609375, "learning_rate": 4.746428256064375e-06, "loss": 0.0035, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 255.2857208251953, "epoch": 0.05717916137229987, "grad_norm": 4.350471496582031, "kl": 0.251953125, "learning_rate": 4.7396648541425534e-06, "loss": 0.0025, "reward": 0.9142858386039734, "reward_std": 0.6202918887138367, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 0.8571429252624512, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 515.7142944335938, "epoch": 0.05781448538754765, "grad_norm": 3.159823417663574, "kl": 0.1904296875, "learning_rate": 4.732817969868348e-06, "loss": 0.0019, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 504.5714416503906, "epoch": 0.05844980940279543, "grad_norm": 3.7896342277526855, "kl": 0.2353515625, "learning_rate": 4.7258878905233095e-06, "loss": 0.0023, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 355.5714416503906, "epoch": 0.0590851334180432, "grad_norm": 2.87099289894104, "kl": 0.1962890625, "learning_rate": 4.718874906879688e-06, "loss": 0.002, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 561.0, "epoch": 0.05972045743329098, "grad_norm": 3.082097053527832, "kl": 0.220703125, "learning_rate": 4.711779313188231e-06, "loss": 0.0022, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 430.0000305175781, "epoch": 0.06035578144853875, "grad_norm": 0.06256424635648727, "kl": 0.2138671875, "learning_rate": 4.70460140716584e-06, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 695.1428833007812, "epoch": 0.060991105463786534, "grad_norm": 3.0291473865509033, "kl": 0.412109375, "learning_rate": 4.697341489983076e-06, "loss": 0.0041, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 459.4285888671875, "epoch": 0.06162642947903431, "grad_norm": 3.461120843887329, "kl": 0.291015625, "learning_rate": 4.6899998662515215e-06, "loss": 0.0029, "reward": 0.4714285731315613, "reward_std": 0.06681530177593231, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0714285746216774, "rewards/format_reward": 1.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 626.4285888671875, "epoch": 0.062261753494282084, "grad_norm": 0.07474807649850845, "kl": 0.3203125, "learning_rate": 4.682576844011007e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 594.2857666015625, "epoch": 0.06289707750952986, "grad_norm": 0.3679976463317871, "kl": 0.28515625, "learning_rate": 4.675072734716678e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 627.1428833007812, "epoch": 0.06353240152477764, "grad_norm": 39.379425048828125, "kl": 1.5390625, "learning_rate": 4.667487853225931e-06, "loss": 0.0153, "reward": 0.485714316368103, "reward_std": 0.42983949184417725, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 636.0, "epoch": 0.06416772554002541, "grad_norm": 0.5123635530471802, "kl": 0.1962890625, "learning_rate": 4.659822517785203e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 623.7142944335938, "epoch": 0.06480304955527319, "grad_norm": 2.7022430896759033, "kl": 0.2314453125, "learning_rate": 4.6520770500166165e-06, "loss": 0.0023, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 338.4285888671875, "epoch": 0.06543837357052097, "grad_norm": 3.6665234565734863, "kl": 0.294921875, "learning_rate": 4.644251774904487e-06, "loss": 0.0029, "reward": 1.057142972946167, "reward_std": 0.5968170166015625, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.7142857313156128, "rewards/format_reward": 0.8571429252624512, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 563.5714721679688, "epoch": 0.06607369758576874, "grad_norm": 0.09322147816419601, "kl": 0.23828125, "learning_rate": 4.636347020781684e-06, "loss": 0.0027, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 734.1428833007812, "epoch": 0.06670902160101652, "grad_norm": 2.2183244228363037, "kl": 0.177734375, "learning_rate": 4.6283631193158605e-06, "loss": 0.0018, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 495.4285888671875, "epoch": 0.06734434561626429, "grad_norm": 3.992103338241577, "kl": 0.345703125, "learning_rate": 4.620300405495532e-06, "loss": 0.0035, "reward": 0.2571428716182709, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.4285714626312256, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 449.4285888671875, "epoch": 0.06797966963151207, "grad_norm": 4.281430244445801, "kl": 0.26953125, "learning_rate": 4.612159217616022e-06, "loss": 0.0027, "reward": 0.2571428716182709, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.4285714626312256, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 463.2857360839844, "epoch": 0.06861499364675985, "grad_norm": 3.0798134803771973, "kl": 0.2158203125, "learning_rate": 4.603939897265268e-06, "loss": 0.0022, "reward": 0.5067504644393921, "reward_std": 0.28243502974510193, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.10675039887428284, "rewards/format_reward": 1.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 534.7142944335938, "epoch": 0.06925031766200762, "grad_norm": 3.179516077041626, "kl": 0.236328125, "learning_rate": 4.595642789309492e-06, "loss": 0.0024, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 673.1428833007812, "epoch": 0.0698856416772554, "grad_norm": 2.751520872116089, "kl": 0.228515625, "learning_rate": 4.587268241878724e-06, "loss": 0.0023, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 506.71429443359375, "epoch": 0.07052096569250317, "grad_norm": 0.06256023049354553, "kl": 0.2490234375, "learning_rate": 4.578816606352205e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 705.5714721679688, "epoch": 0.07115628970775095, "grad_norm": 0.08651240170001984, "kl": 0.1826171875, "learning_rate": 4.570288237343632e-06, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 667.2857666015625, "epoch": 0.07179161372299873, "grad_norm": 1.87228262424469, "kl": 0.1552734375, "learning_rate": 4.561683492686289e-06, "loss": 0.0016, "reward": 0.37142857909202576, "reward_std": 0.335232675075531, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.5714285969734192, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 511.857177734375, "epoch": 0.0724269377382465, "grad_norm": 0.06886231899261475, "kl": 0.21875, "learning_rate": 4.5530027334180285e-06, "loss": 0.0025, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 746.857177734375, "epoch": 0.07306226175349428, "grad_norm": 2.801109552383423, "kl": 0.2060546875, "learning_rate": 4.544246323766122e-06, "loss": 0.0021, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 639.5714721679688, "epoch": 0.07369758576874205, "grad_norm": 2.9071028232574463, "kl": 0.2373046875, "learning_rate": 4.535414631131983e-06, "loss": 0.0024, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 760.2857666015625, "epoch": 0.07433290978398983, "grad_norm": 2.4393320083618164, "kl": 0.201171875, "learning_rate": 4.526508026075746e-06, "loss": 0.002, "reward": 0.2571428716182709, "reward_std": 0.19023796916007996, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 729.7142944335938, "epoch": 0.07496823379923762, "grad_norm": 7.918629169464111, "kl": 0.671875, "learning_rate": 4.517526882300721e-06, "loss": 0.0067, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 745.1428833007812, "epoch": 0.07560355781448538, "grad_norm": 2.229508876800537, "kl": 0.185546875, "learning_rate": 4.508471576637713e-06, "loss": 0.0019, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 508.4285888671875, "epoch": 0.07623888182973317, "grad_norm": 2.4816155433654785, "kl": 0.1845703125, "learning_rate": 4.499342489029211e-06, "loss": 0.0018, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 535.857177734375, "epoch": 0.07687420584498093, "grad_norm": 3.3153207302093506, "kl": 0.2490234375, "learning_rate": 4.490140002513449e-06, "loss": 0.0025, "reward": 0.43708792328834534, "reward_std": 0.0981253907084465, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.03708791360259056, "rewards/format_reward": 1.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 776.1428833007812, "epoch": 0.07750952986022872, "grad_norm": 2.2401111125946045, "kl": 0.1484375, "learning_rate": 4.48086450320833e-06, "loss": 0.0015, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 764.7142944335938, "epoch": 0.0781448538754765, "grad_norm": 2.0424013137817383, "kl": 0.1806640625, "learning_rate": 4.4715163802952266e-06, "loss": 0.0018, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 543.7142944335938, "epoch": 0.07878017789072427, "grad_norm": 0.1234215497970581, "kl": 0.25390625, "learning_rate": 4.462096026002655e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 741.5714721679688, "epoch": 0.07941550190597205, "grad_norm": 2.020581007003784, "kl": 0.1884765625, "learning_rate": 4.4526038355898144e-06, "loss": 0.0019, "reward": 0.36326533555984497, "reward_std": 0.16554531455039978, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.020408164709806442, "rewards/format_reward": 0.8571429252624512, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 593.857177734375, "epoch": 0.08005082592121983, "grad_norm": 2.3378539085388184, "kl": 0.2373046875, "learning_rate": 4.4430402073300035e-06, "loss": 0.0024, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 599.5714721679688, "epoch": 0.0806861499364676, "grad_norm": 0.06773251295089722, "kl": 0.234375, "learning_rate": 4.433405542493909e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 703.1428833007812, "epoch": 0.08132147395171538, "grad_norm": 3.0085108280181885, "kl": 0.1630859375, "learning_rate": 4.4237002453327734e-06, "loss": 0.0016, "reward": 0.41020408272743225, "reward_std": 0.026997461915016174, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.010204082354903221, "rewards/format_reward": 1.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 363.2857360839844, "epoch": 0.08195679796696315, "grad_norm": 3.266486644744873, "kl": 0.171875, "learning_rate": 4.4139247230614245e-06, "loss": 0.0017, "reward": 0.4520833492279053, "reward_std": 0.12870661914348602, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0520833320915699, "rewards/format_reward": 1.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 395.2857360839844, "epoch": 0.08259212198221093, "grad_norm": 3.562208414077759, "kl": 0.322265625, "learning_rate": 4.404079385841201e-06, "loss": 0.0032, "reward": 0.4571428894996643, "reward_std": 0.2507132887840271, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 367.0000305175781, "epoch": 0.08322744599745871, "grad_norm": 0.06771758943796158, "kl": 0.244140625, "learning_rate": 4.394164646762734e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 523.857177734375, "epoch": 0.08386277001270648, "grad_norm": 2.9015965461730957, "kl": 0.220703125, "learning_rate": 4.384180921828618e-06, "loss": 0.0022, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 519.2857666015625, "epoch": 0.08449809402795426, "grad_norm": 3.0210537910461426, "kl": 0.1689453125, "learning_rate": 4.374128629935955e-06, "loss": 0.0017, "reward": 0.9833123087882996, "reward_std": 0.4177902638912201, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5833122730255127, "rewards/format_reward": 1.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 358.5714416503906, "epoch": 0.08513341804320203, "grad_norm": 2.964980363845825, "kl": 0.1962890625, "learning_rate": 4.364008192858781e-06, "loss": 0.002, "reward": 0.4098522365093231, "reward_std": 0.016825860366225243, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.009852217510342598, "rewards/format_reward": 1.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 473.5714416503906, "epoch": 0.08576874205844981, "grad_norm": 2.856916666030884, "kl": 0.2119140625, "learning_rate": 4.353820035230366e-06, "loss": 0.0021, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 507.2857360839844, "epoch": 0.08640406607369759, "grad_norm": 3.613862991333008, "kl": 0.169921875, "learning_rate": 4.3435645845254e-06, "loss": 0.0017, "reward": 0.6571429371833801, "reward_std": 0.5126960277557373, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 451.4285888671875, "epoch": 0.08703939008894536, "grad_norm": 0.05980609729886055, "kl": 0.177734375, "learning_rate": 4.333242271042054e-06, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 515.7142944335938, "epoch": 0.08767471410419314, "grad_norm": 3.2973406314849854, "kl": 0.248046875, "learning_rate": 4.32285352788393e-06, "loss": 0.0025, "reward": 0.5261905193328857, "reward_std": 0.28607451915740967, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1547619104385376, "rewards/format_reward": 1.0, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 384.14288330078125, "epoch": 0.08831003811944091, "grad_norm": 2.6424343585968018, "kl": 0.31640625, "learning_rate": 4.312398790941882e-06, "loss": 0.0032, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 326.8571472167969, "epoch": 0.08894536213468869, "grad_norm": 3.2031986713409424, "kl": 0.1884765625, "learning_rate": 4.301878498875735e-06, "loss": 0.0019, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 366.71429443359375, "epoch": 0.08958068614993647, "grad_norm": 0.29011043906211853, "kl": 0.232421875, "learning_rate": 4.291293093095873e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 353.5714416503906, "epoch": 0.09021601016518424, "grad_norm": 0.05228522792458534, "kl": 0.1728515625, "learning_rate": 4.280643017744723e-06, "loss": 0.002, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 571.857177734375, "epoch": 0.09085133418043202, "grad_norm": 4.668937683105469, "kl": 0.255859375, "learning_rate": 4.269928719678117e-06, "loss": 0.0026, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 441.0000305175781, "epoch": 0.09148665819567979, "grad_norm": 3.7214486598968506, "kl": 0.42578125, "learning_rate": 4.2591506484465426e-06, "loss": 0.0042, "reward": 0.3428571820259094, "reward_std": 0.09759001433849335, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 493.14288330078125, "epoch": 0.09212198221092757, "grad_norm": 0.11755923926830292, "kl": 0.28515625, "learning_rate": 4.248309256276283e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 365.4285888671875, "epoch": 0.09275730622617535, "grad_norm": 2.491922378540039, "kl": 0.2099609375, "learning_rate": 4.23740499805044e-06, "loss": 0.0021, "reward": 0.38035714626312256, "reward_std": 0.0817723423242569, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.008928571827709675, "rewards/format_reward": 1.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 363.4285888671875, "epoch": 0.09339263024142312, "grad_norm": 3.340977191925049, "kl": 0.185546875, "learning_rate": 4.22643833128985e-06, "loss": 0.0018, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 244.57144165039062, "epoch": 0.0940279542566709, "grad_norm": 0.05019477382302284, "kl": 0.2255859375, "learning_rate": 4.215409716133885e-06, "loss": 0.0035, "reward": 1.4000002145767212, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 1.0, "rewards/format_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 372.2857360839844, "epoch": 0.09466327827191867, "grad_norm": 3.1343963146209717, "kl": 0.34375, "learning_rate": 4.204319615321151e-06, "loss": 0.0034, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 355.5714416503906, "epoch": 0.09529860228716645, "grad_norm": 3.144193172454834, "kl": 0.35546875, "learning_rate": 4.193168494170065e-06, "loss": 0.0035, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 438.857177734375, "epoch": 0.09593392630241424, "grad_norm": 3.4582438468933105, "kl": 0.27734375, "learning_rate": 4.181956820559339e-06, "loss": 0.0028, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 556.857177734375, "epoch": 0.096569250317662, "grad_norm": 2.9669277667999268, "kl": 0.259765625, "learning_rate": 4.170685064908342e-06, "loss": 0.0026, "reward": 0.4870130121707916, "reward_std": 0.23021474480628967, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.08701299130916595, "rewards/format_reward": 1.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 561.1428833007812, "epoch": 0.09720457433290978, "grad_norm": 0.06397932022809982, "kl": 0.265625, "learning_rate": 4.159353700157365e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 382.8571472167969, "epoch": 0.09783989834815757, "grad_norm": 0.12080562859773636, "kl": 0.369140625, "learning_rate": 4.14796320174778e-06, "loss": 0.004, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 583.7142944335938, "epoch": 0.09847522236340533, "grad_norm": 0.06819948554039001, "kl": 0.298828125, "learning_rate": 4.136514047602087e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 347.14288330078125, "epoch": 0.09911054637865312, "grad_norm": 0.07698381692171097, "kl": 0.3984375, "learning_rate": 4.1250067181038635e-06, "loss": 0.0043, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 692.1428833007812, "epoch": 0.09974587039390088, "grad_norm": 0.17737269401550293, "kl": 0.28125, "learning_rate": 4.113441696077608e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 341.5714416503906, "epoch": 0.10038119440914867, "grad_norm": 4.598702907562256, "kl": 0.4296875, "learning_rate": 4.101819466768484e-06, "loss": 0.0043, "reward": 0.8134453892707825, "reward_std": 0.18902148306369781, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4134454131126404, "rewards/format_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 455.2857360839844, "epoch": 0.10101651842439645, "grad_norm": 0.07675225287675858, "kl": 0.302734375, "learning_rate": 4.0901405178219535e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 752.2857666015625, "epoch": 0.10165184243964422, "grad_norm": 1.9616703987121582, "kl": 0.2080078125, "learning_rate": 4.078405339263326e-06, "loss": 0.0021, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 349.8571472167969, "epoch": 0.102287166454892, "grad_norm": 3.1100926399230957, "kl": 0.2451171875, "learning_rate": 4.06661442347719e-06, "loss": 0.0025, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 616.1428833007812, "epoch": 0.10292249047013977, "grad_norm": 2.4743812084198, "kl": 0.296875, "learning_rate": 4.054768265186758e-06, "loss": 0.003, "reward": 0.4129870533943176, "reward_std": 0.034360405057668686, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.012987013906240463, "rewards/format_reward": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 359.4285888671875, "epoch": 0.10355781448538755, "grad_norm": 3.092137336730957, "kl": 0.3046875, "learning_rate": 4.0428673614331036e-06, "loss": 0.0031, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 439.5714416503906, "epoch": 0.10419313850063533, "grad_norm": 1.3710603713989258, "kl": 0.302734375, "learning_rate": 4.030912211554316e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 477.5714416503906, "epoch": 0.1048284625158831, "grad_norm": 2.647684097290039, "kl": 0.1826171875, "learning_rate": 4.018903317164539e-06, "loss": 0.0018, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 545.857177734375, "epoch": 0.10546378653113088, "grad_norm": 0.07831098884344101, "kl": 0.1650390625, "learning_rate": 4.006841182132932e-06, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 517.857177734375, "epoch": 0.10609911054637865, "grad_norm": 0.6324580311775208, "kl": 0.296875, "learning_rate": 3.9947263125625195e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 451.5714416503906, "epoch": 0.10673443456162643, "grad_norm": 2.877990961074829, "kl": 0.166015625, "learning_rate": 3.982559216768967e-06, "loss": 0.0017, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 527.2857666015625, "epoch": 0.10736975857687421, "grad_norm": 0.31927597522735596, "kl": 0.310546875, "learning_rate": 3.970340405259245e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 545.7142944335938, "epoch": 0.10800508259212198, "grad_norm": 2.99507474899292, "kl": 0.28125, "learning_rate": 3.958070390710214e-06, "loss": 0.0028, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 430.71429443359375, "epoch": 0.10864040660736976, "grad_norm": 3.1845176219940186, "kl": 0.302734375, "learning_rate": 3.945749687947109e-06, "loss": 0.003, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 496.4285888671875, "epoch": 0.10927573062261753, "grad_norm": 2.674358606338501, "kl": 0.375, "learning_rate": 3.933378813921942e-06, "loss": 0.0037, "reward": 0.46666669845581055, "reward_std": 0.11547007411718369, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.06666667759418488, "rewards/format_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 722.1428833007812, "epoch": 0.10991105463786531, "grad_norm": 1.7992055416107178, "kl": 0.205078125, "learning_rate": 3.920958287691811e-06, "loss": 0.002, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 436.71429443359375, "epoch": 0.11054637865311309, "grad_norm": 0.03782209753990173, "kl": 0.1591796875, "learning_rate": 3.908488630397121e-06, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 446.14288330078125, "epoch": 0.11118170266836086, "grad_norm": 2.619063138961792, "kl": 0.1796875, "learning_rate": 3.8959703652397175e-06, "loss": 0.0018, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 705.5714721679688, "epoch": 0.11181702668360864, "grad_norm": 2.71985125541687, "kl": 0.267578125, "learning_rate": 3.883404017460935e-06, "loss": 0.0027, "reward": 0.2857142984867096, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 460.71429443359375, "epoch": 0.11245235069885642, "grad_norm": 3.090186595916748, "kl": 0.2109375, "learning_rate": 3.870790114319559e-06, "loss": 0.0021, "reward": 0.6571429371833801, "reward_std": 0.5126960277557373, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 482.71429443359375, "epoch": 0.11308767471410419, "grad_norm": 0.13239029049873352, "kl": 0.287109375, "learning_rate": 3.858129185069701e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 671.5714721679688, "epoch": 0.11372299872935197, "grad_norm": 2.1125059127807617, "kl": 0.27734375, "learning_rate": 3.845421760938597e-06, "loss": 0.0028, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 500.4285888671875, "epoch": 0.11435832274459974, "grad_norm": 8.883991241455078, "kl": 0.9453125, "learning_rate": 3.832668375104312e-06, "loss": 0.0095, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 519.0, "epoch": 0.11499364675984752, "grad_norm": 12.942805290222168, "kl": 1.375, "learning_rate": 3.8198695626733725e-06, "loss": 0.0141, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 667.2857666015625, "epoch": 0.1156289707750953, "grad_norm": 2.5961642265319824, "kl": 0.23828125, "learning_rate": 3.8070258606583156e-06, "loss": 0.0024, "reward": 0.3142857253551483, "reward_std": 0.10690449923276901, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 587.2857666015625, "epoch": 0.11626429479034307, "grad_norm": 3.12656831741333, "kl": 0.251953125, "learning_rate": 3.7941378079551544e-06, "loss": 0.0025, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 328.71429443359375, "epoch": 0.11689961880559085, "grad_norm": 2.924964427947998, "kl": 0.25, "learning_rate": 3.7812059453207677e-06, "loss": 0.0025, "reward": 0.6285714507102966, "reward_std": 0.5468525290489197, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 0.8571429252624512, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 486.2857360839844, "epoch": 0.11753494282083862, "grad_norm": 3.5567517280578613, "kl": 0.296875, "learning_rate": 3.768230815350213e-06, "loss": 0.003, "reward": 0.37142857909202576, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 554.1428833007812, "epoch": 0.1181702668360864, "grad_norm": 2.51031494140625, "kl": 0.30859375, "learning_rate": 3.7552129624539557e-06, "loss": 0.0031, "reward": 0.37142857909202576, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 594.5714721679688, "epoch": 0.11880559085133419, "grad_norm": 2.589846611022949, "kl": 0.26171875, "learning_rate": 3.7421529328350316e-06, "loss": 0.0026, "reward": 0.5284404158592224, "reward_std": 0.19612440466880798, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.12844036519527435, "rewards/format_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 595.857177734375, "epoch": 0.11944091486658195, "grad_norm": 2.4991862773895264, "kl": 0.27734375, "learning_rate": 3.7290512744661274e-06, "loss": 0.0028, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 552.0, "epoch": 0.12007623888182974, "grad_norm": 2.0362820625305176, "kl": 0.1845703125, "learning_rate": 3.715908537066589e-06, "loss": 0.0019, "reward": 0.485714316368103, "reward_std": 0.42983949184417725, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 698.5714721679688, "epoch": 0.1207115628970775, "grad_norm": 2.4349231719970703, "kl": 0.25, "learning_rate": 3.7027252720793538e-06, "loss": 0.0025, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 453.2857360839844, "epoch": 0.12134688691232529, "grad_norm": 3.212773323059082, "kl": 0.1796875, "learning_rate": 3.689502032647817e-06, "loss": 0.0018, "reward": 0.36014559864997864, "reward_std": 0.1179879903793335, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.017288444563746452, "rewards/format_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 671.1428833007812, "epoch": 0.12198221092757307, "grad_norm": 0.08436475694179535, "kl": 0.1767578125, "learning_rate": 3.6762393735926245e-06, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 651.2857666015625, "epoch": 0.12261753494282084, "grad_norm": 2.7841696739196777, "kl": 0.146484375, "learning_rate": 3.6629378513883852e-06, "loss": 0.0015, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 545.2857666015625, "epoch": 0.12325285895806862, "grad_norm": 3.3902151584625244, "kl": 0.205078125, "learning_rate": 3.6495980241403307e-06, "loss": 0.002, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 593.1428833007812, "epoch": 0.12388818297331639, "grad_norm": 2.5586631298065186, "kl": 0.197265625, "learning_rate": 3.636220451560896e-06, "loss": 0.002, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 589.7142944335938, "epoch": 0.12452350698856417, "grad_norm": 0.08236116170883179, "kl": 0.3203125, "learning_rate": 3.622805694946235e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 780.7142944335938, "epoch": 0.12515883100381195, "grad_norm": 0.9724776148796082, "kl": 0.390625, "learning_rate": 3.609354317152667e-06, "loss": 0.0042, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 921.5714721679688, "epoch": 0.12579415501905972, "grad_norm": 2.485982894897461, "kl": 0.263671875, "learning_rate": 3.595866882573063e-06, "loss": 0.0026, "reward": 0.20000001788139343, "reward_std": 0.20000001788139343, "rewards/code_format_reward": 0.4285714626312256, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 850.2857666015625, "epoch": 0.12642947903430748, "grad_norm": 2.52078914642334, "kl": 0.451171875, "learning_rate": 3.5823439571131675e-06, "loss": 0.0045, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 924.2857666015625, "epoch": 0.12706480304955528, "grad_norm": 2.445126533508301, "kl": 0.20703125, "learning_rate": 3.5687861081678477e-06, "loss": 0.0021, "reward": 0.22857144474983215, "reward_std": 0.21380899846553802, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.0, "rewards/format_reward": 0.5714285969734192, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 764.1428833007812, "epoch": 0.12770012706480305, "grad_norm": 2.1626222133636475, "kl": 0.3359375, "learning_rate": 3.555193904597291e-06, "loss": 0.0034, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 798.2857666015625, "epoch": 0.12833545108005082, "grad_norm": 2.010795831680298, "kl": 0.275390625, "learning_rate": 3.541567916703138e-06, "loss": 0.0027, "reward": 0.485714316368103, "reward_std": 0.42983949184417725, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 675.5714721679688, "epoch": 0.1289707750952986, "grad_norm": 2.5785133838653564, "kl": 0.2431640625, "learning_rate": 3.5279087162045517e-06, "loss": 0.0024, "reward": 0.6285714507102966, "reward_std": 0.5468525290489197, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 0.8571429252624512, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 696.4285888671875, "epoch": 0.12960609911054638, "grad_norm": 0.14385801553726196, "kl": 0.166015625, "learning_rate": 3.5142168762142265e-06, "loss": 0.002, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 673.5714721679688, "epoch": 0.13024142312579415, "grad_norm": 29.659584045410156, "kl": 2.109375, "learning_rate": 3.500492971214347e-06, "loss": 0.0212, "reward": 0.40446433424949646, "reward_std": 0.00819578766822815, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.004464285913854837, "rewards/format_reward": 1.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 724.857177734375, "epoch": 0.13087674714104194, "grad_norm": 3.008819580078125, "kl": 0.29296875, "learning_rate": 3.48673757703248e-06, "loss": 0.0029, "reward": 0.4012531638145447, "reward_std": 0.003315483685582876, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0012531329412013292, "rewards/format_reward": 1.0, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 670.0, "epoch": 0.1315120711562897, "grad_norm": 2.4119279384613037, "kl": 0.1904296875, "learning_rate": 3.472951270817418e-06, "loss": 0.0019, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 723.5714721679688, "epoch": 0.13214739517153748, "grad_norm": 0.09062952548265457, "kl": 0.26171875, "learning_rate": 3.4591346310149578e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 885.5714721679688, "epoch": 0.13278271918678525, "grad_norm": 2.048800468444824, "kl": 0.2109375, "learning_rate": 3.445288237343632e-06, "loss": 0.0021, "reward": 0.2857142984867096, "reward_std": 0.1864454597234726, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.02857143059372902, "rewards/format_reward": 0.7142857313156128, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 768.0000610351562, "epoch": 0.13341804320203304, "grad_norm": 0.07965610176324844, "kl": 0.279296875, "learning_rate": 3.4314126707703895e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 668.7142944335938, "epoch": 0.1340533672172808, "grad_norm": 2.6163687705993652, "kl": 0.259765625, "learning_rate": 3.4175085134862128e-06, "loss": 0.0026, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 846.5714721679688, "epoch": 0.13468869123252858, "grad_norm": 2.003486394882202, "kl": 0.1591796875, "learning_rate": 3.4035763488816953e-06, "loss": 0.0016, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 518.5714721679688, "epoch": 0.13532401524777637, "grad_norm": 2.321255683898926, "kl": 0.1865234375, "learning_rate": 3.3896167615225594e-06, "loss": 0.0019, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 736.4285888671875, "epoch": 0.13595933926302414, "grad_norm": 0.06602618098258972, "kl": 0.25, "learning_rate": 3.375630337125133e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 668.2857666015625, "epoch": 0.1365946632782719, "grad_norm": 2.6393556594848633, "kl": 0.18359375, "learning_rate": 3.361617662531772e-06, "loss": 0.0018, "reward": 0.5445378422737122, "reward_std": 0.2468453347682953, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.14453783631324768, "rewards/format_reward": 1.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 696.5714721679688, "epoch": 0.1372299872935197, "grad_norm": 0.06534316390752792, "kl": 0.1962890625, "learning_rate": 3.347579325686237e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 728.4285888671875, "epoch": 0.13786531130876747, "grad_norm": 2.1525208950042725, "kl": 0.162109375, "learning_rate": 3.333515915609027e-06, "loss": 0.0016, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 695.5714721679688, "epoch": 0.13850063532401524, "grad_norm": 2.821530342102051, "kl": 0.271484375, "learning_rate": 3.3194280223726616e-06, "loss": 0.0027, "reward": 0.41904765367507935, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0476190522313118, "rewards/format_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 558.4285888671875, "epoch": 0.13913595933926304, "grad_norm": 0.07845824211835861, "kl": 0.322265625, "learning_rate": 3.305316237076927e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 762.2857666015625, "epoch": 0.1397712833545108, "grad_norm": 2.1945505142211914, "kl": 0.15625, "learning_rate": 3.291181151824071e-06, "loss": 0.0016, "reward": 0.3142857253551483, "reward_std": 0.15735916793346405, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 534.0, "epoch": 0.14040660736975857, "grad_norm": 2.6735599040985107, "kl": 0.19921875, "learning_rate": 3.27702335969396e-06, "loss": 0.002, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 673.2857666015625, "epoch": 0.14104193138500634, "grad_norm": 0.16328755021095276, "kl": 0.3203125, "learning_rate": 3.2628434547191985e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 690.7142944335938, "epoch": 0.14167725540025414, "grad_norm": 0.05350850895047188, "kl": 0.28515625, "learning_rate": 3.2486420318601973e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 705.7142944335938, "epoch": 0.1423125794155019, "grad_norm": 0.05627477914094925, "kl": 0.255859375, "learning_rate": 3.2344196869802187e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 522.857177734375, "epoch": 0.14294790343074967, "grad_norm": 3.546515464782715, "kl": 0.1943359375, "learning_rate": 3.2201770168203694e-06, "loss": 0.0019, "reward": 0.37142857909202576, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 656.857177734375, "epoch": 0.14358322744599747, "grad_norm": 0.30215829610824585, "kl": 0.283203125, "learning_rate": 3.205914618974563e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 680.1428833007812, "epoch": 0.14421855146124524, "grad_norm": 2.333859443664551, "kl": 0.1982421875, "learning_rate": 3.1916330918644496e-06, "loss": 0.002, "reward": 0.290529727935791, "reward_std": 0.16188988089561462, "rewards/code_format_reward": 0.5714285969734192, "rewards/code_reward": 0.004815409425646067, "rewards/format_reward": 0.8571429252624512, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 702.7142944335938, "epoch": 0.144853875476493, "grad_norm": 0.06898491084575653, "kl": 0.26171875, "learning_rate": 3.177333034714303e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 699.7142944335938, "epoch": 0.1454891994917408, "grad_norm": 3.096179246902466, "kl": 0.26171875, "learning_rate": 3.1630150475258813e-06, "loss": 0.0026, "reward": 0.3428571820259094, "reward_std": 0.09759000688791275, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 704.4285888671875, "epoch": 0.14612452350698857, "grad_norm": 0.06554409861564636, "kl": 0.302734375, "learning_rate": 3.148679731053252e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 575.5714721679688, "epoch": 0.14675984752223634, "grad_norm": 2.8049392700195312, "kl": 0.2158203125, "learning_rate": 3.1343276867775805e-06, "loss": 0.0022, "reward": 0.4242587983608246, "reward_std": 0.04178621619939804, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.02425876073539257, "rewards/format_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 611.857177734375, "epoch": 0.1473951715374841, "grad_norm": 3.6352577209472656, "kl": 0.296875, "learning_rate": 3.1199595168819043e-06, "loss": 0.003, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 638.0, "epoch": 0.1480304955527319, "grad_norm": 2.4661004543304443, "kl": 0.298828125, "learning_rate": 3.105575824225852e-06, "loss": 0.003, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 599.857177734375, "epoch": 0.14866581956797967, "grad_norm": 2.9908604621887207, "kl": 0.20703125, "learning_rate": 3.091177212320363e-06, "loss": 0.0021, "reward": 0.37142857909202576, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 652.1428833007812, "epoch": 0.14930114358322744, "grad_norm": 0.04900944232940674, "kl": 0.1875, "learning_rate": 3.0767642853023538e-06, "loss": 0.0022, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 485.5714416503906, "epoch": 0.14993646759847523, "grad_norm": 2.6988017559051514, "kl": 0.216796875, "learning_rate": 3.062337647909376e-06, "loss": 0.0022, "reward": 0.37142857909202576, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 672.857177734375, "epoch": 0.150571791613723, "grad_norm": 2.6142776012420654, "kl": 0.22265625, "learning_rate": 3.04789790545424e-06, "loss": 0.0022, "reward": 0.4169172942638397, "reward_std": 0.04475894197821617, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.016917293891310692, "rewards/format_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 709.7142944335938, "epoch": 0.15120711562897077, "grad_norm": 0.06715590506792068, "kl": 0.1865234375, "learning_rate": 3.033445663799621e-06, "loss": 0.0022, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 642.2857666015625, "epoch": 0.15184243964421856, "grad_norm": 2.8286352157592773, "kl": 0.2197265625, "learning_rate": 3.018981529332633e-06, "loss": 0.0022, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 567.2857666015625, "epoch": 0.15247776365946633, "grad_norm": 0.07005994766950607, "kl": 0.21875, "learning_rate": 3.00450610893939e-06, "loss": 0.0025, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 678.5714721679688, "epoch": 0.1531130876747141, "grad_norm": 2.2401628494262695, "kl": 0.162109375, "learning_rate": 2.9900200099795396e-06, "loss": 0.0016, "reward": 0.40160515904426575, "reward_std": 0.004246791359037161, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.00160513655282557, "rewards/format_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 640.4285888671875, "epoch": 0.15374841168996187, "grad_norm": 2.341778039932251, "kl": 0.30078125, "learning_rate": 2.9755238402607826e-06, "loss": 0.003, "reward": 0.485714316368103, "reward_std": 0.22677868604660034, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 929.4285888671875, "epoch": 0.15438373570520966, "grad_norm": 2.2782864570617676, "kl": 0.193359375, "learning_rate": 2.961018208013367e-06, "loss": 0.0019, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 655.5714721679688, "epoch": 0.15501905972045743, "grad_norm": 0.0515868104994297, "kl": 0.265625, "learning_rate": 2.9465037218645694e-06, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 596.7142944335938, "epoch": 0.1556543837357052, "grad_norm": 0.06041451543569565, "kl": 0.271484375, "learning_rate": 2.9319809908131604e-06, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 623.5714721679688, "epoch": 0.156289707750953, "grad_norm": 0.11632593721151352, "kl": 0.33203125, "learning_rate": 2.917450624203847e-06, "loss": 0.0036, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 708.7142944335938, "epoch": 0.15692503176620076, "grad_norm": 2.282824993133545, "kl": 0.17578125, "learning_rate": 2.9029132317017118e-06, "loss": 0.0018, "reward": 0.6714285612106323, "reward_std": 0.27516230940818787, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.30000001192092896, "rewards/format_reward": 1.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 402.2857360839844, "epoch": 0.15756035578144853, "grad_norm": 0.05820649862289429, "kl": 0.263671875, "learning_rate": 2.888369423266629e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 559.1428833007812, "epoch": 0.15819567979669633, "grad_norm": 2.360961437225342, "kl": 0.2431640625, "learning_rate": 2.8738198091276712e-06, "loss": 0.0024, "reward": 0.771428644657135, "reward_std": 0.48205915093421936, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 0.8571429252624512, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 673.857177734375, "epoch": 0.1588310038119441, "grad_norm": 2.426175832748413, "kl": 0.2431640625, "learning_rate": 2.859264999757509e-06, "loss": 0.0024, "reward": 0.485714316368103, "reward_std": 0.42983946204185486, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 555.7142944335938, "epoch": 0.15946632782719186, "grad_norm": 1.7476675510406494, "kl": 0.2060546875, "learning_rate": 2.8447056058467928e-06, "loss": 0.0021, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 635.1428833007812, "epoch": 0.16010165184243966, "grad_norm": 0.10966142266988754, "kl": 0.296875, "learning_rate": 2.830142238278531e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 569.4285888671875, "epoch": 0.16073697585768743, "grad_norm": 0.07483859360218048, "kl": 0.306640625, "learning_rate": 2.81557550810246e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 696.1428833007812, "epoch": 0.1613722998729352, "grad_norm": 557283.4375, "kl": 19456.0, "learning_rate": 2.8010060265094026e-06, "loss": 194.2972, "reward": 0.485714316368103, "reward_std": 0.42983946204185486, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 572.1428833007812, "epoch": 0.16200762388818296, "grad_norm": 2.958310842514038, "kl": 0.203125, "learning_rate": 2.786434404805629e-06, "loss": 0.002, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 642.5714721679688, "epoch": 0.16264294790343076, "grad_norm": 2.9970836639404297, "kl": 0.359375, "learning_rate": 2.771861254387199e-06, "loss": 0.0036, "reward": 0.485714316368103, "reward_std": 0.42983949184417725, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 0.8571429252624512, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 626.5714721679688, "epoch": 0.16327827191867852, "grad_norm": 0.052250247448682785, "kl": 0.203125, "learning_rate": 2.7572871867143204e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 587.0, "epoch": 0.1639135959339263, "grad_norm": 0.049148622900247574, "kl": 0.1845703125, "learning_rate": 2.742712813285681e-06, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 413.71429443359375, "epoch": 0.1645489199491741, "grad_norm": 3.1558637619018555, "kl": 0.296875, "learning_rate": 2.7281387456128017e-06, "loss": 0.003, "reward": 0.6661654710769653, "reward_std": 0.2892994284629822, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.3233082890510559, "rewards/format_reward": 0.8571429252624512, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 599.1428833007812, "epoch": 0.16518424396442186, "grad_norm": 2.672922372817993, "kl": 0.333984375, "learning_rate": 2.7135655951943716e-06, "loss": 0.0033, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 638.2857666015625, "epoch": 0.16581956797966962, "grad_norm": 0.062102027237415314, "kl": 0.2890625, "learning_rate": 2.698993973490598e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 615.4285888671875, "epoch": 0.16645489199491742, "grad_norm": 2.032621145248413, "kl": 0.349609375, "learning_rate": 2.6844244918975416e-06, "loss": 0.0035, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 680.4285888671875, "epoch": 0.1670902160101652, "grad_norm": 0.07202436029911041, "kl": 0.25390625, "learning_rate": 2.66985776172147e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 607.0, "epoch": 0.16772554002541296, "grad_norm": 3.1766250133514404, "kl": 0.3046875, "learning_rate": 2.6552943941532088e-06, "loss": 0.003, "reward": 0.41587308049201965, "reward_std": 0.0419960655272007, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.01587301678955555, "rewards/format_reward": 1.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 661.5714721679688, "epoch": 0.16836086404066072, "grad_norm": 3.2823150157928467, "kl": 0.2392578125, "learning_rate": 2.6407350002424927e-06, "loss": 0.0024, "reward": 0.5868132710456848, "reward_std": 0.23857378959655762, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.18681320548057556, "rewards/format_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 598.857177734375, "epoch": 0.16899618805590852, "grad_norm": 0.08283974230289459, "kl": 0.2021484375, "learning_rate": 2.626180190872329e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 439.2857360839844, "epoch": 0.1696315120711563, "grad_norm": 3.3979873657226562, "kl": 0.234375, "learning_rate": 2.611630576733372e-06, "loss": 0.0023, "reward": 0.41260507702827454, "reward_std": 0.01228986494243145, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.01260504312813282, "rewards/format_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 584.1428833007812, "epoch": 0.17026683608640406, "grad_norm": 2.4819862842559814, "kl": 0.28125, "learning_rate": 2.5970867682982885e-06, "loss": 0.0028, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 458.14288330078125, "epoch": 0.17090216010165185, "grad_norm": 0.08856673538684845, "kl": 0.349609375, "learning_rate": 2.582549375796154e-06, "loss": 0.0038, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 468.71429443359375, "epoch": 0.17153748411689962, "grad_norm": 2.835487127304077, "kl": 0.32421875, "learning_rate": 2.568019009186841e-06, "loss": 0.0032, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 637.4285888671875, "epoch": 0.1721728081321474, "grad_norm": 0.054750654846429825, "kl": 0.162109375, "learning_rate": 2.5534962781354317e-06, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 757.2857666015625, "epoch": 0.17280813214739518, "grad_norm": 2.1099636554718018, "kl": 0.21484375, "learning_rate": 2.538981791986634e-06, "loss": 0.0021, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 425.2857360839844, "epoch": 0.17344345616264295, "grad_norm": 2.960906982421875, "kl": 0.251953125, "learning_rate": 2.524476159739218e-06, "loss": 0.0025, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 592.5714721679688, "epoch": 0.17407878017789072, "grad_norm": 2.5509631633758545, "kl": 0.26953125, "learning_rate": 2.5099799900204607e-06, "loss": 0.0027, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 436.8571472167969, "epoch": 0.17471410419313851, "grad_norm": 2.9435410499572754, "kl": 0.400390625, "learning_rate": 2.4954938910606108e-06, "loss": 0.004, "reward": 0.563265323638916, "reward_std": 0.37278667092323303, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.16326531767845154, "rewards/format_reward": 1.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 409.5714416503906, "epoch": 0.17534942820838628, "grad_norm": 4.025363922119141, "kl": 0.275390625, "learning_rate": 2.481018470667368e-06, "loss": 0.0028, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 467.14288330078125, "epoch": 0.17598475222363405, "grad_norm": 0.09415756165981293, "kl": 0.30078125, "learning_rate": 2.4665543362003802e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 438.0000305175781, "epoch": 0.17662007623888182, "grad_norm": 3.590571165084839, "kl": 0.2216796875, "learning_rate": 2.4521020945457615e-06, "loss": 0.0022, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 439.2857360839844, "epoch": 0.1772554002541296, "grad_norm": 2.6514501571655273, "kl": 0.1884765625, "learning_rate": 2.4376623520906255e-06, "loss": 0.0019, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 395.4285888671875, "epoch": 0.17789072426937738, "grad_norm": 3.0715246200561523, "kl": 0.396484375, "learning_rate": 2.4232357146976478e-06, "loss": 0.004, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 552.1428833007812, "epoch": 0.17852604828462515, "grad_norm": 0.04878819361329079, "kl": 0.25390625, "learning_rate": 2.408822787679637e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 470.0000305175781, "epoch": 0.17916137229987295, "grad_norm": 2.8764052391052246, "kl": 0.337890625, "learning_rate": 2.3944241757741475e-06, "loss": 0.0034, "reward": 0.7617021799087524, "reward_std": 0.39519527554512024, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.3617021143436432, "rewards/format_reward": 1.0, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 552.0, "epoch": 0.1797966963151207, "grad_norm": 3.4082202911376953, "kl": 0.279296875, "learning_rate": 2.380040483118097e-06, "loss": 0.0028, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 419.0000305175781, "epoch": 0.18043202033036848, "grad_norm": 3.0144095420837402, "kl": 0.236328125, "learning_rate": 2.365672313222419e-06, "loss": 0.0024, "reward": 0.800332248210907, "reward_std": 0.2776820659637451, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4003322720527649, "rewards/format_reward": 1.0, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 514.5714721679688, "epoch": 0.18106734434561628, "grad_norm": 0.1193128377199173, "kl": 0.35546875, "learning_rate": 2.351320268946749e-06, "loss": 0.0038, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 638.7142944335938, "epoch": 0.18170266836086404, "grad_norm": 0.050928860902786255, "kl": 0.25390625, "learning_rate": 2.336984952474119e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 496.857177734375, "epoch": 0.1823379923761118, "grad_norm": 0.12127784639596939, "kl": 0.310546875, "learning_rate": 2.322666965285697e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 515.857177734375, "epoch": 0.18297331639135958, "grad_norm": 0.04745308309793472, "kl": 0.302734375, "learning_rate": 2.3083669081355507e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 619.0, "epoch": 0.18360864040660738, "grad_norm": 0.04747169092297554, "kl": 0.2392578125, "learning_rate": 2.2940853810254377e-06, "loss": 0.0027, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 427.2857360839844, "epoch": 0.18424396442185514, "grad_norm": 0.04503343254327774, "kl": 0.2109375, "learning_rate": 2.2798229831796313e-06, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 350.8571472167969, "epoch": 0.1848792884371029, "grad_norm": 0.15933562815189362, "kl": 0.29296875, "learning_rate": 2.2655803130197816e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 492.4285888671875, "epoch": 0.1855146124523507, "grad_norm": 0.061849016696214676, "kl": 0.34375, "learning_rate": 2.2513579681398034e-06, "loss": 0.0037, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 447.5714416503906, "epoch": 0.18614993646759848, "grad_norm": 3.1712300777435303, "kl": 0.1982421875, "learning_rate": 2.237156545280803e-06, "loss": 0.002, "reward": 1.1142858266830444, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 530.7142944335938, "epoch": 0.18678526048284624, "grad_norm": 2.926269769668579, "kl": 0.27734375, "learning_rate": 2.2229766403060403e-06, "loss": 0.0028, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 370.14288330078125, "epoch": 0.18742058449809404, "grad_norm": 3.639002799987793, "kl": 0.412109375, "learning_rate": 2.2088188481759305e-06, "loss": 0.0041, "reward": 0.44044750928878784, "reward_std": 0.06550441682338715, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.040447503328323364, "rewards/format_reward": 1.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 411.8571472167969, "epoch": 0.1880559085133418, "grad_norm": 3.473376512527466, "kl": 0.337890625, "learning_rate": 2.194683762923073e-06, "loss": 0.0034, "reward": 0.8047619462013245, "reward_std": 0.28637492656707764, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4047619104385376, "rewards/format_reward": 1.0, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 537.1428833007812, "epoch": 0.18869123252858958, "grad_norm": 0.05299937725067139, "kl": 0.2373046875, "learning_rate": 2.1805719776273387e-06, "loss": 0.0027, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 499.0000305175781, "epoch": 0.18932655654383734, "grad_norm": 0.04631977900862694, "kl": 0.2265625, "learning_rate": 2.166484084390974e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 490.14288330078125, "epoch": 0.18996188055908514, "grad_norm": 0.08931510150432587, "kl": 0.244140625, "learning_rate": 2.1524206743137636e-06, "loss": 0.0027, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 461.2857360839844, "epoch": 0.1905972045743329, "grad_norm": 3.1879539489746094, "kl": 0.236328125, "learning_rate": 2.1383823374682287e-06, "loss": 0.0024, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 493.14288330078125, "epoch": 0.19123252858958067, "grad_norm": 2.5813148021698, "kl": 0.2294921875, "learning_rate": 2.124369662874868e-06, "loss": 0.0023, "reward": 0.44916945695877075, "reward_std": 0.2263808697462082, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.10631229728460312, "rewards/format_reward": 0.8571429252624512, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 467.857177734375, "epoch": 0.19186785260482847, "grad_norm": 2.3633549213409424, "kl": 0.251953125, "learning_rate": 2.110383238477441e-06, "loss": 0.0025, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 467.5714416503906, "epoch": 0.19250317662007624, "grad_norm": 3.2412614822387695, "kl": 0.2021484375, "learning_rate": 2.096423651118305e-06, "loss": 0.002, "reward": 0.41476020216941833, "reward_std": 0.03590288758277893, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.014760148711502552, "rewards/format_reward": 1.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 629.1428833007812, "epoch": 0.193138500635324, "grad_norm": 1.8208192586898804, "kl": 0.1845703125, "learning_rate": 2.082491486513788e-06, "loss": 0.0018, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 573.1428833007812, "epoch": 0.1937738246505718, "grad_norm": 0.06740770488977432, "kl": 0.326171875, "learning_rate": 2.0685873292296116e-06, "loss": 0.0036, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 469.0000305175781, "epoch": 0.19440914866581957, "grad_norm": 0.050011664628982544, "kl": 0.322265625, "learning_rate": 2.054711762656369e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 501.2857360839844, "epoch": 0.19504447268106734, "grad_norm": 2.7797482013702393, "kl": 0.1865234375, "learning_rate": 2.040865368985044e-06, "loss": 0.0019, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 483.4285888671875, "epoch": 0.19567979669631513, "grad_norm": 0.056970253586769104, "kl": 0.205078125, "learning_rate": 2.027048729182583e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 708.4285888671875, "epoch": 0.1963151207115629, "grad_norm": 1.9797154664993286, "kl": 0.2421875, "learning_rate": 2.0132624229675205e-06, "loss": 0.0024, "reward": 0.45210087299346924, "reward_std": 0.11403417587280273, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.10924370586872101, "rewards/format_reward": 0.8571429252624512, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 432.14288330078125, "epoch": 0.19695044472681067, "grad_norm": 0.0952640175819397, "kl": 0.40625, "learning_rate": 1.9995070287856546e-06, "loss": 0.0044, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 533.7142944335938, "epoch": 0.19758576874205844, "grad_norm": 3.1438472270965576, "kl": 0.30859375, "learning_rate": 1.985783123785774e-06, "loss": 0.0031, "reward": 0.49523812532424927, "reward_std": 0.11878278106451035, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0952381044626236, "rewards/format_reward": 1.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 536.5714721679688, "epoch": 0.19822109275730623, "grad_norm": 2.1929433345794678, "kl": 0.314453125, "learning_rate": 1.9720912837954486e-06, "loss": 0.0031, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 502.71429443359375, "epoch": 0.198856416772554, "grad_norm": 0.05079368129372597, "kl": 0.306640625, "learning_rate": 1.958432083296862e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 454.14288330078125, "epoch": 0.19949174078780177, "grad_norm": 0.06545651704072952, "kl": 0.2099609375, "learning_rate": 1.9448060954027093e-06, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 523.1428833007812, "epoch": 0.20012706480304956, "grad_norm": 2.852233409881592, "kl": 0.27734375, "learning_rate": 1.931213891832153e-06, "loss": 0.0028, "reward": 0.40317460894584656, "reward_std": 0.005421662237495184, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0031746034510433674, "rewards/format_reward": 1.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 518.7142944335938, "epoch": 0.20076238881829733, "grad_norm": 0.056687433272600174, "kl": 0.2158203125, "learning_rate": 1.9176560428868336e-06, "loss": 0.0025, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 651.857177734375, "epoch": 0.2013977128335451, "grad_norm": 2.1868667602539062, "kl": 0.1923828125, "learning_rate": 1.9041331174269373e-06, "loss": 0.0019, "reward": 0.3556329905986786, "reward_std": 0.16032202541828156, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.012775842100381851, "rewards/format_reward": 0.8571429252624512, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 627.7142944335938, "epoch": 0.2020330368487929, "grad_norm": 0.055125512182712555, "kl": 0.2578125, "learning_rate": 1.8906456828473341e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 585.4285888671875, "epoch": 0.20266836086404066, "grad_norm": 0.0708109587430954, "kl": 0.265625, "learning_rate": 1.8771943050537656e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 568.857177734375, "epoch": 0.20330368487928843, "grad_norm": 3.287763833999634, "kl": 0.1796875, "learning_rate": 1.8637795484391046e-06, "loss": 0.0018, "reward": 0.4266955256462097, "reward_std": 0.05106709897518158, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.026695528998970985, "rewards/format_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 356.71429443359375, "epoch": 0.2039390088945362, "grad_norm": 2.707819700241089, "kl": 0.271484375, "learning_rate": 1.8504019758596698e-06, "loss": 0.0027, "reward": 1.2571430206298828, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.8571429252624512, "rewards/format_reward": 1.0, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 470.4285888671875, "epoch": 0.204574332909784, "grad_norm": 2.942082405090332, "kl": 0.353515625, "learning_rate": 1.8370621486116163e-06, "loss": 0.0035, "reward": 0.41904765367507935, "reward_std": 0.05039527267217636, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.01904762163758278, "rewards/format_reward": 1.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 485.14288330078125, "epoch": 0.20520965692503176, "grad_norm": 0.06295392662286758, "kl": 0.330078125, "learning_rate": 1.823760626407377e-06, "loss": 0.0036, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 499.4285888671875, "epoch": 0.20584498094027953, "grad_norm": 2.458465099334717, "kl": 0.34375, "learning_rate": 1.8104979673521838e-06, "loss": 0.0034, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 526.2857666015625, "epoch": 0.20648030495552733, "grad_norm": 3.1990699768066406, "kl": 0.33203125, "learning_rate": 1.7972747279206482e-06, "loss": 0.0033, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 601.0, "epoch": 0.2071156289707751, "grad_norm": 0.05853183940052986, "kl": 0.2890625, "learning_rate": 1.7840914629334122e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 498.71429443359375, "epoch": 0.20775095298602286, "grad_norm": 0.07322244346141815, "kl": 0.3125, "learning_rate": 1.7709487255338731e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 553.857177734375, "epoch": 0.20838627700127066, "grad_norm": 0.17839957773685455, "kl": 0.228515625, "learning_rate": 1.7578470671649684e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 530.857177734375, "epoch": 0.20902160101651843, "grad_norm": 0.1131465956568718, "kl": 0.3359375, "learning_rate": 1.744787037546045e-06, "loss": 0.0037, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 564.1428833007812, "epoch": 0.2096569250317662, "grad_norm": 0.08917635679244995, "kl": 0.30078125, "learning_rate": 1.731769184649788e-06, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 401.4285888671875, "epoch": 0.210292249047014, "grad_norm": 0.15413963794708252, "kl": 0.36328125, "learning_rate": 1.7187940546792325e-06, "loss": 0.0039, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 535.0, "epoch": 0.21092757306226176, "grad_norm": 3.1670172214508057, "kl": 0.306640625, "learning_rate": 1.7058621920448465e-06, "loss": 0.0031, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 477.14288330078125, "epoch": 0.21156289707750953, "grad_norm": 0.07635504752397537, "kl": 0.333984375, "learning_rate": 1.6929741393416855e-06, "loss": 0.0036, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 405.71429443359375, "epoch": 0.2121982210927573, "grad_norm": 0.0835573673248291, "kl": 0.361328125, "learning_rate": 1.6801304373266286e-06, "loss": 0.0039, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 556.0, "epoch": 0.2128335451080051, "grad_norm": 0.17580975592136383, "kl": 0.310546875, "learning_rate": 1.667331624895689e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 382.14288330078125, "epoch": 0.21346886912325286, "grad_norm": 2.9555137157440186, "kl": 0.330078125, "learning_rate": 1.6545782390614037e-06, "loss": 0.0033, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 400.14288330078125, "epoch": 0.21410419313850063, "grad_norm": 0.08389069885015488, "kl": 0.2275390625, "learning_rate": 1.6418708149302992e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 425.2857360839844, "epoch": 0.21473951715374842, "grad_norm": 2.6030876636505127, "kl": 0.23046875, "learning_rate": 1.6292098856804423e-06, "loss": 0.0023, "reward": 1.2571430206298828, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.8571429252624512, "rewards/format_reward": 1.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 369.71429443359375, "epoch": 0.2153748411689962, "grad_norm": 0.09957047551870346, "kl": 0.255859375, "learning_rate": 1.6165959825390661e-06, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 374.5714416503906, "epoch": 0.21601016518424396, "grad_norm": 3.5869476795196533, "kl": 0.2216796875, "learning_rate": 1.604029634760284e-06, "loss": 0.0022, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 449.14288330078125, "epoch": 0.21664548919949175, "grad_norm": 0.04904744401574135, "kl": 0.208984375, "learning_rate": 1.59151136960288e-06, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 592.0, "epoch": 0.21728081321473952, "grad_norm": 0.07155793160200119, "kl": 0.29296875, "learning_rate": 1.5790417123081903e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 498.14288330078125, "epoch": 0.2179161372299873, "grad_norm": 0.07623915374279022, "kl": 0.34765625, "learning_rate": 1.5666211860780583e-06, "loss": 0.0038, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 439.2857360839844, "epoch": 0.21855146124523506, "grad_norm": 3.3097286224365234, "kl": 0.330078125, "learning_rate": 1.5542503120528918e-06, "loss": 0.0033, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 504.4285888671875, "epoch": 0.21918678526048285, "grad_norm": 3.0239760875701904, "kl": 0.3125, "learning_rate": 1.5419296092897866e-06, "loss": 0.0031, "reward": 0.5071429014205933, "reward_std": 0.14202801883220673, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.107142873108387, "rewards/format_reward": 1.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 656.857177734375, "epoch": 0.21982210927573062, "grad_norm": 0.04605603963136673, "kl": 0.279296875, "learning_rate": 1.529659594740755e-06, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 411.2857360839844, "epoch": 0.2204574332909784, "grad_norm": 0.05368569865822792, "kl": 0.373046875, "learning_rate": 1.5174407832310338e-06, "loss": 0.004, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 626.0, "epoch": 0.22109275730622618, "grad_norm": 2.683948040008545, "kl": 0.26171875, "learning_rate": 1.5052736874374815e-06, "loss": 0.0026, "reward": 0.4285714626312256, "reward_std": 0.04879499599337578, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.02857143059372902, "rewards/format_reward": 1.0, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 500.857177734375, "epoch": 0.22172808132147395, "grad_norm": 2.875629186630249, "kl": 0.203125, "learning_rate": 1.4931588178670695e-06, "loss": 0.002, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 259.5714416503906, "epoch": 0.22236340533672172, "grad_norm": 0.2272838056087494, "kl": 0.353515625, "learning_rate": 1.4810966828354605e-06, "loss": 0.0047, "reward": 1.4000002145767212, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 1.0, "rewards/format_reward": 1.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 479.71429443359375, "epoch": 0.22299872935196952, "grad_norm": 3.2993814945220947, "kl": 0.21484375, "learning_rate": 1.469087788445684e-06, "loss": 0.0021, "reward": 0.42500001192092896, "reward_std": 0.04564352706074715, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.02500000223517418, "rewards/format_reward": 1.0, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 498.5714416503906, "epoch": 0.22363405336721728, "grad_norm": 0.2275489717721939, "kl": 0.333984375, "learning_rate": 1.4571326385668965e-06, "loss": 0.0036, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 470.857177734375, "epoch": 0.22426937738246505, "grad_norm": 0.05804005637764931, "kl": 0.341796875, "learning_rate": 1.4452317348132434e-06, "loss": 0.0037, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 255.1428680419922, "epoch": 0.22490470139771285, "grad_norm": 0.30045151710510254, "kl": 0.3125, "learning_rate": 1.4333855765228104e-06, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 401.8571472167969, "epoch": 0.22554002541296062, "grad_norm": 0.1624024361371994, "kl": 0.36328125, "learning_rate": 1.421594660736675e-06, "loss": 0.0039, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 458.5714416503906, "epoch": 0.22617534942820838, "grad_norm": 0.06793898344039917, "kl": 0.2294921875, "learning_rate": 1.4098594821780476e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 354.2857360839844, "epoch": 0.22681067344345615, "grad_norm": 3.499160051345825, "kl": 0.2412109375, "learning_rate": 1.3981805332315174e-06, "loss": 0.0024, "reward": 1.057142972946167, "reward_std": 0.47207754850387573, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.7142857313156128, "rewards/format_reward": 0.8571429252624512, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 432.71429443359375, "epoch": 0.22744599745870395, "grad_norm": 3.1533403396606445, "kl": 0.30859375, "learning_rate": 1.3865583039223929e-06, "loss": 0.0031, "reward": 0.7381389141082764, "reward_std": 0.2022811770439148, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.3381389379501343, "rewards/format_reward": 1.0, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 287.8571472167969, "epoch": 0.22808132147395171, "grad_norm": 0.08542405813932419, "kl": 0.3203125, "learning_rate": 1.374993281896137e-06, "loss": 0.0044, "reward": 1.4000002145767212, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 1.0, "rewards/format_reward": 1.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 557.5714721679688, "epoch": 0.22871664548919948, "grad_norm": 1.56419038772583, "kl": 0.1650390625, "learning_rate": 1.3634859523979134e-06, "loss": 0.0016, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 356.4285888671875, "epoch": 0.22935196950444728, "grad_norm": 3.0232937335968018, "kl": 0.224609375, "learning_rate": 1.3520367982522208e-06, "loss": 0.0022, "reward": 1.1142858266830444, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 594.1428833007812, "epoch": 0.22998729351969505, "grad_norm": 3.2974321842193604, "kl": 0.1875, "learning_rate": 1.3406462998426358e-06, "loss": 0.0019, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 456.2857360839844, "epoch": 0.23062261753494281, "grad_norm": 3.04162859916687, "kl": 0.2119140625, "learning_rate": 1.3293149350916595e-06, "loss": 0.0021, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 532.4285888671875, "epoch": 0.2312579415501906, "grad_norm": 3.197749376296997, "kl": 0.1748046875, "learning_rate": 1.3180431794406623e-06, "loss": 0.0017, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 449.71429443359375, "epoch": 0.23189326556543838, "grad_norm": 3.0233240127563477, "kl": 0.2294921875, "learning_rate": 1.3068315058299358e-06, "loss": 0.0023, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 362.5714416503906, "epoch": 0.23252858958068615, "grad_norm": 0.06233400106430054, "kl": 0.28515625, "learning_rate": 1.2956803846788503e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 475.2857360839844, "epoch": 0.2331639135959339, "grad_norm": 0.04722089692950249, "kl": 0.193359375, "learning_rate": 1.284590283866116e-06, "loss": 0.0022, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 644.5714721679688, "epoch": 0.2337992376111817, "grad_norm": 2.4597692489624023, "kl": 0.265625, "learning_rate": 1.2735616687101518e-06, "loss": 0.0027, "reward": 0.4714285731315613, "reward_std": 0.18898221850395203, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0714285746216774, "rewards/format_reward": 1.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 353.8571472167969, "epoch": 0.23443456162642948, "grad_norm": 3.595822811126709, "kl": 0.255859375, "learning_rate": 1.2625950019495614e-06, "loss": 0.0026, "reward": 0.6106783151626587, "reward_std": 0.34680601954460144, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.21067820489406586, "rewards/format_reward": 1.0, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 585.5714721679688, "epoch": 0.23506988564167725, "grad_norm": 0.05966843292117119, "kl": 0.287109375, "learning_rate": 1.251690743723718e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 315.4285888671875, "epoch": 0.23570520965692504, "grad_norm": 3.2647478580474854, "kl": 0.26171875, "learning_rate": 1.2408493515534581e-06, "loss": 0.0026, "reward": 1.1142858266830444, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.7142857313156128, "rewards/format_reward": 1.0, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 503.0000305175781, "epoch": 0.2363405336721728, "grad_norm": 3.232701063156128, "kl": 0.1650390625, "learning_rate": 1.2300712803218834e-06, "loss": 0.0017, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 551.5714721679688, "epoch": 0.23697585768742058, "grad_norm": 0.05632919818162918, "kl": 0.2890625, "learning_rate": 1.2193569822552772e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 407.0000305175781, "epoch": 0.23761118170266837, "grad_norm": 0.06528866291046143, "kl": 0.228515625, "learning_rate": 1.2087069069041268e-06, "loss": 0.0026, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 378.8571472167969, "epoch": 0.23824650571791614, "grad_norm": 3.4633543491363525, "kl": 0.29296875, "learning_rate": 1.1981215011242654e-06, "loss": 0.0029, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 499.0000305175781, "epoch": 0.2388818297331639, "grad_norm": 0.06882923096418381, "kl": 0.2216796875, "learning_rate": 1.1876012090581184e-06, "loss": 0.0025, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 424.8571472167969, "epoch": 0.23951715374841168, "grad_norm": 0.048563580960035324, "kl": 0.1806640625, "learning_rate": 1.177146472116071e-06, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 556.1428833007812, "epoch": 0.24015247776365947, "grad_norm": 3.4067060947418213, "kl": 0.263671875, "learning_rate": 1.1667577289579462e-06, "loss": 0.0026, "reward": 0.43296706676483154, "reward_std": 0.06052277237176895, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.032967038452625275, "rewards/format_reward": 1.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 415.0000305175781, "epoch": 0.24078780177890724, "grad_norm": 2.6581125259399414, "kl": 0.3203125, "learning_rate": 1.1564354154746007e-06, "loss": 0.0032, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 473.0000305175781, "epoch": 0.241423125794155, "grad_norm": 0.12217168509960175, "kl": 0.2734375, "learning_rate": 1.146179964769635e-06, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 358.5714416503906, "epoch": 0.2420584498094028, "grad_norm": 0.07087297737598419, "kl": 0.337890625, "learning_rate": 1.1359918071412195e-06, "loss": 0.0037, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 666.7142944335938, "epoch": 0.24269377382465057, "grad_norm": 3.078494071960449, "kl": 0.1669921875, "learning_rate": 1.1258713700640456e-06, "loss": 0.0017, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 573.5714721679688, "epoch": 0.24332909783989834, "grad_norm": 0.0404311902821064, "kl": 0.25, "learning_rate": 1.115819078171383e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 508.857177734375, "epoch": 0.24396442185514614, "grad_norm": 2.985482931137085, "kl": 0.1787109375, "learning_rate": 1.1058353532372667e-06, "loss": 0.0018, "reward": 0.37460315227508545, "reward_std": 0.07708179205656052, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0031746032182127237, "rewards/format_reward": 1.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 499.857177734375, "epoch": 0.2445997458703939, "grad_norm": 2.4948058128356934, "kl": 0.251953125, "learning_rate": 1.0959206141587998e-06, "loss": 0.0025, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 400.2857360839844, "epoch": 0.24523506988564167, "grad_norm": 3.1324355602264404, "kl": 0.2119140625, "learning_rate": 1.0860752769385766e-06, "loss": 0.0021, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 425.14288330078125, "epoch": 0.24587039390088947, "grad_norm": 0.061597954481840134, "kl": 0.28515625, "learning_rate": 1.0762997546672279e-06, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 500.4285888671875, "epoch": 0.24650571791613723, "grad_norm": 0.047814078629016876, "kl": 0.2021484375, "learning_rate": 1.0665944575060914e-06, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 375.71429443359375, "epoch": 0.247141041931385, "grad_norm": 3.590963363647461, "kl": 0.22265625, "learning_rate": 1.056959792669997e-06, "loss": 0.0022, "reward": 0.5285714268684387, "reward_std": 0.11126972734928131, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.12857143580913544, "rewards/format_reward": 1.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 470.0000305175781, "epoch": 0.24777636594663277, "grad_norm": 0.10802248120307922, "kl": 0.376953125, "learning_rate": 1.0473961644101856e-06, "loss": 0.0041, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 475.14288330078125, "epoch": 0.24841168996188057, "grad_norm": 0.18473738431930542, "kl": 0.318359375, "learning_rate": 1.037903973997345e-06, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 455.5714416503906, "epoch": 0.24904701397712833, "grad_norm": 4.052010536193848, "kl": 0.349609375, "learning_rate": 1.0284836197047737e-06, "loss": 0.0035, "reward": 0.49629050493240356, "reward_std": 0.17879442870616913, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0962904542684555, "rewards/format_reward": 1.0, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 437.0000305175781, "epoch": 0.2496823379923761, "grad_norm": 2.7656362056732178, "kl": 0.2158203125, "learning_rate": 1.0191354967916712e-06, "loss": 0.0022, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 513.2857666015625, "epoch": 0.2503176620076239, "grad_norm": 0.08351174741983414, "kl": 0.275390625, "learning_rate": 1.0098599974865515e-06, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 429.4285888671875, "epoch": 0.25095298602287164, "grad_norm": 0.061210744082927704, "kl": 0.248046875, "learning_rate": 1.0006575109707898e-06, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 506.71429443359375, "epoch": 0.25158831003811943, "grad_norm": 0.06361155211925507, "kl": 0.2734375, "learning_rate": 9.915284233622877e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 431.4285888671875, "epoch": 0.25222363405336723, "grad_norm": 0.06987325847148895, "kl": 0.203125, "learning_rate": 9.824731176992796e-07, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 355.14288330078125, "epoch": 0.25285895806861497, "grad_norm": 0.1327361762523651, "kl": 0.30859375, "learning_rate": 9.734919739242543e-07, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 417.5714416503906, "epoch": 0.25349428208386277, "grad_norm": 3.1446924209594727, "kl": 0.333984375, "learning_rate": 9.645853688680177e-07, "loss": 0.0033, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 389.5714416503906, "epoch": 0.25412960609911056, "grad_norm": 0.048515211790800095, "kl": 0.1787109375, "learning_rate": 9.557536762338786e-07, "loss": 0.0021, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 399.14288330078125, "epoch": 0.2547649301143583, "grad_norm": 0.053919967263936996, "kl": 0.2109375, "learning_rate": 9.46997266581973e-07, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 408.2857360839844, "epoch": 0.2554002541296061, "grad_norm": 3.3761556148529053, "kl": 0.310546875, "learning_rate": 9.383165073137115e-07, "loss": 0.0031, "reward": 0.44395607709884644, "reward_std": 0.04111712798476219, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.04395604878664017, "rewards/format_reward": 1.0, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 423.8571472167969, "epoch": 0.2560355781448539, "grad_norm": 0.17396095395088196, "kl": 0.3515625, "learning_rate": 9.297117626563687e-07, "loss": 0.0038, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 589.7142944335938, "epoch": 0.25667090216010163, "grad_norm": 0.44405487179756165, "kl": 0.318359375, "learning_rate": 9.211833936477957e-07, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 416.0000305175781, "epoch": 0.25730622617534943, "grad_norm": 0.06627284735441208, "kl": 0.29296875, "learning_rate": 9.127317581212753e-07, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 381.0000305175781, "epoch": 0.2579415501905972, "grad_norm": 0.10693392902612686, "kl": 0.30078125, "learning_rate": 9.043572106905084e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 476.0000305175781, "epoch": 0.25857687420584496, "grad_norm": 0.04814046248793602, "kl": 0.26953125, "learning_rate": 8.960601027347321e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 390.4285888671875, "epoch": 0.25921219822109276, "grad_norm": 0.06620350480079651, "kl": 0.21484375, "learning_rate": 8.878407823839788e-07, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 444.4285888671875, "epoch": 0.25984752223634056, "grad_norm": 3.463074207305908, "kl": 0.376953125, "learning_rate": 8.796995945044689e-07, "loss": 0.0038, "reward": 0.4129870533943176, "reward_std": 0.034360405057668686, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.012987013906240463, "rewards/format_reward": 1.0, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 442.5714416503906, "epoch": 0.2604828462515883, "grad_norm": 2.6474926471710205, "kl": 0.287109375, "learning_rate": 8.716368806841405e-07, "loss": 0.0029, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 393.4285888671875, "epoch": 0.2611181702668361, "grad_norm": 3.3637120723724365, "kl": 0.203125, "learning_rate": 8.636529792183171e-07, "loss": 0.002, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 441.857177734375, "epoch": 0.2617534942820839, "grad_norm": 2.904768466949463, "kl": 0.2060546875, "learning_rate": 8.557482250955144e-07, "loss": 0.0021, "reward": 0.6987013220787048, "reward_std": 0.21078045666217804, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.29870128631591797, "rewards/format_reward": 1.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 399.14288330078125, "epoch": 0.2623888182973316, "grad_norm": 0.10355106741189957, "kl": 0.3359375, "learning_rate": 8.479229499833844e-07, "loss": 0.0037, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 456.0000305175781, "epoch": 0.2630241423125794, "grad_norm": 0.13930782675743103, "kl": 0.294921875, "learning_rate": 8.401774822147976e-07, "loss": 0.0032, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 443.4285888671875, "epoch": 0.2636594663278272, "grad_norm": 3.3897175788879395, "kl": 0.306640625, "learning_rate": 8.325121467740695e-07, "loss": 0.0031, "reward": 0.5224490165710449, "reward_std": 0.12853363156318665, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.12244899570941925, "rewards/format_reward": 1.0, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 550.1428833007812, "epoch": 0.26429479034307496, "grad_norm": 0.05424835905432701, "kl": 0.25390625, "learning_rate": 8.249272652833226e-07, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 482.14288330078125, "epoch": 0.26493011435832275, "grad_norm": 0.2863621115684509, "kl": 0.3203125, "learning_rate": 8.174231559889931e-07, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 486.5714416503906, "epoch": 0.2655654383735705, "grad_norm": 0.05610418692231178, "kl": 0.28515625, "learning_rate": 8.100001337484787e-07, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 466.2857360839844, "epoch": 0.2662007623888183, "grad_norm": 0.07580099999904633, "kl": 0.322265625, "learning_rate": 8.026585100169251e-07, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 478.71429443359375, "epoch": 0.2668360864040661, "grad_norm": 3.3928205966949463, "kl": 0.23046875, "learning_rate": 7.953985928341601e-07, "loss": 0.0023, "reward": 0.4655141234397888, "reward_std": 0.08343012630939484, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.06551411002874374, "rewards/format_reward": 1.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 494.0000305175781, "epoch": 0.2674714104193138, "grad_norm": 0.04618528112769127, "kl": 0.1669921875, "learning_rate": 7.882206868117693e-07, "loss": 0.002, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 441.5714416503906, "epoch": 0.2681067344345616, "grad_norm": 2.9898011684417725, "kl": 0.28515625, "learning_rate": 7.81125093120313e-07, "loss": 0.0028, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 462.4285888671875, "epoch": 0.2687420584498094, "grad_norm": 0.056119803339242935, "kl": 0.3046875, "learning_rate": 7.741121094766916e-07, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 312.2857360839844, "epoch": 0.26937738246505716, "grad_norm": 3.943939447402954, "kl": 0.314453125, "learning_rate": 7.671820301316532e-07, "loss": 0.0031, "reward": 0.48671239614486694, "reward_std": 0.044492121785879135, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.08671241253614426, "rewards/format_reward": 1.0, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 422.4285888671875, "epoch": 0.27001270648030495, "grad_norm": 0.04282496124505997, "kl": 0.16015625, "learning_rate": 7.603351458574474e-07, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 496.2857360839844, "epoch": 0.27064803049555275, "grad_norm": 0.059300344437360764, "kl": 0.26953125, "learning_rate": 7.535717439356255e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 476.14288330078125, "epoch": 0.2712833545108005, "grad_norm": 2.4999890327453613, "kl": 0.14453125, "learning_rate": 7.46892108144986e-07, "loss": 0.0014, "reward": 0.7171429395675659, "reward_std": 0.4510992467403412, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.31714287400245667, "rewards/format_reward": 1.0, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 488.71429443359375, "epoch": 0.2719186785260483, "grad_norm": 0.052872247993946075, "kl": 0.279296875, "learning_rate": 7.402965187496697e-07, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 475.857177734375, "epoch": 0.2725540025412961, "grad_norm": 2.7971818447113037, "kl": 0.1826171875, "learning_rate": 7.337852524873974e-07, "loss": 0.0018, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 647.1428833007812, "epoch": 0.2731893265565438, "grad_norm": 2.0566720962524414, "kl": 0.2255859375, "learning_rate": 7.273585825578608e-07, "loss": 0.0023, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 432.8571472167969, "epoch": 0.2738246505717916, "grad_norm": 2.9699957370758057, "kl": 0.27734375, "learning_rate": 7.21016778611259e-07, "loss": 0.0028, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 419.14288330078125, "epoch": 0.2744599745870394, "grad_norm": 0.10036282241344452, "kl": 0.31640625, "learning_rate": 7.147601067369835e-07, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 323.71429443359375, "epoch": 0.27509529860228715, "grad_norm": 3.8502118587493896, "kl": 0.275390625, "learning_rate": 7.085888294524561e-07, "loss": 0.0028, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 631.2857666015625, "epoch": 0.27573062261753495, "grad_norm": 0.05429592728614807, "kl": 0.23828125, "learning_rate": 7.025032056921117e-07, "loss": 0.0027, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 455.71429443359375, "epoch": 0.27636594663278274, "grad_norm": 3.2364959716796875, "kl": 0.29296875, "learning_rate": 6.965034907965349e-07, "loss": 0.0029, "reward": 0.5190476179122925, "reward_std": 0.09449110925197601, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1190476268529892, "rewards/format_reward": 1.0, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 419.0000305175781, "epoch": 0.2770012706480305, "grad_norm": 3.7509405612945557, "kl": 0.1748046875, "learning_rate": 6.905899365017462e-07, "loss": 0.0017, "reward": 0.4330357313156128, "reward_std": 0.0516289584338665, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.03303571790456772, "rewards/format_reward": 1.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 386.5714416503906, "epoch": 0.2776365946632783, "grad_norm": 0.05398930609226227, "kl": 0.314453125, "learning_rate": 6.847627909286409e-07, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 474.4285888671875, "epoch": 0.2782719186785261, "grad_norm": 2.960287094116211, "kl": 0.181640625, "learning_rate": 6.790222985725761e-07, "loss": 0.0018, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 492.4285888671875, "epoch": 0.2789072426937738, "grad_norm": 3.587923288345337, "kl": 0.2470703125, "learning_rate": 6.733687002931141e-07, "loss": 0.0025, "reward": 0.4714285731315613, "reward_std": 0.18898221850395203, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0714285746216774, "rewards/format_reward": 1.0, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 425.5714416503906, "epoch": 0.2795425667090216, "grad_norm": 2.850022077560425, "kl": 0.1650390625, "learning_rate": 6.678022333039158e-07, "loss": 0.0016, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 397.4285888671875, "epoch": 0.28017789072426935, "grad_norm": 0.045945875346660614, "kl": 0.265625, "learning_rate": 6.623231311627876e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 554.5714721679688, "epoch": 0.28081321473951715, "grad_norm": 0.06309759616851807, "kl": 0.265625, "learning_rate": 6.569316237618811e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 318.0, "epoch": 0.28144853875476494, "grad_norm": 3.4273765087127686, "kl": 0.234375, "learning_rate": 6.516279373180499e-07, "loss": 0.0023, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 551.4285888671875, "epoch": 0.2820838627700127, "grad_norm": 0.059127844870090485, "kl": 0.2490234375, "learning_rate": 6.464122943633543e-07, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 388.71429443359375, "epoch": 0.2827191867852605, "grad_norm": 0.058766093105077744, "kl": 0.302734375, "learning_rate": 6.412849137357271e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 484.2857360839844, "epoch": 0.2833545108005083, "grad_norm": 0.05756891146302223, "kl": 0.275390625, "learning_rate": 6.3624601056979e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 397.4285888671875, "epoch": 0.283989834815756, "grad_norm": 3.6736083030700684, "kl": 0.30859375, "learning_rate": 6.312957962878278e-07, "loss": 0.0031, "reward": 0.4039139151573181, "reward_std": 0.01035518478602171, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.003913894295692444, "rewards/format_reward": 1.0, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 528.857177734375, "epoch": 0.2846251588310038, "grad_norm": 0.047244369983673096, "kl": 0.259765625, "learning_rate": 6.264344785909181e-07, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 432.5714416503906, "epoch": 0.2852604828462516, "grad_norm": 0.06841956079006195, "kl": 0.298828125, "learning_rate": 6.216622614502149e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 538.1428833007812, "epoch": 0.28589580686149935, "grad_norm": 0.052464455366134644, "kl": 0.25, "learning_rate": 6.169793450983916e-07, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 490.71429443359375, "epoch": 0.28653113087674714, "grad_norm": 0.04570423439145088, "kl": 0.2470703125, "learning_rate": 6.123859260212393e-07, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 400.14288330078125, "epoch": 0.28716645489199494, "grad_norm": 0.05332702025771141, "kl": 0.306640625, "learning_rate": 6.07882196949423e-07, "loss": 0.0034, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 338.8571472167969, "epoch": 0.2878017789072427, "grad_norm": 3.625446081161499, "kl": 0.1982421875, "learning_rate": 6.034683468503948e-07, "loss": 0.002, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 361.14288330078125, "epoch": 0.2884371029224905, "grad_norm": 3.3959381580352783, "kl": 0.375, "learning_rate": 5.991445609204641e-07, "loss": 0.0037, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 404.2857360839844, "epoch": 0.28907242693773827, "grad_norm": 0.06705067306756973, "kl": 0.27734375, "learning_rate": 5.949110205770292e-07, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 368.14288330078125, "epoch": 0.289707750952986, "grad_norm": 0.0792422741651535, "kl": 0.30078125, "learning_rate": 5.90767903450964e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 356.0000305175781, "epoch": 0.2903430749682338, "grad_norm": 3.4945454597473145, "kl": 0.1982421875, "learning_rate": 5.867153833791652e-07, "loss": 0.002, "reward": 0.4571428596973419, "reward_std": 0.053452249616384506, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.05714286118745804, "rewards/format_reward": 1.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 371.5714416503906, "epoch": 0.2909783989834816, "grad_norm": 3.341874837875366, "kl": 0.3203125, "learning_rate": 5.827536303972587e-07, "loss": 0.0032, "reward": 0.44761908054351807, "reward_std": 0.12598817050457, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0476190522313118, "rewards/format_reward": 1.0, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 395.8571472167969, "epoch": 0.29161372299872934, "grad_norm": 0.0777309313416481, "kl": 0.30078125, "learning_rate": 5.78882810732465e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 461.857177734375, "epoch": 0.29224904701397714, "grad_norm": 0.0932033360004425, "kl": 0.3203125, "learning_rate": 5.75103086796625e-07, "loss": 0.0035, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 419.4285888671875, "epoch": 0.2928843710292249, "grad_norm": 3.0678720474243164, "kl": 0.1650390625, "learning_rate": 5.714146171793846e-07, "loss": 0.0016, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 438.71429443359375, "epoch": 0.2935196950444727, "grad_norm": 0.08228779584169388, "kl": 0.216796875, "learning_rate": 5.678175566415422e-07, "loss": 0.0025, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 443.71429443359375, "epoch": 0.29415501905972047, "grad_norm": 2.509739398956299, "kl": 0.349609375, "learning_rate": 5.643120561085528e-07, "loss": 0.0035, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 476.857177734375, "epoch": 0.2947903430749682, "grad_norm": 0.0598638616502285, "kl": 0.26171875, "learning_rate": 5.608982626641991e-07, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 508.0000305175781, "epoch": 0.295425667090216, "grad_norm": 0.04956334829330444, "kl": 0.2109375, "learning_rate": 5.575763195444166e-07, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 519.1428833007812, "epoch": 0.2960609911054638, "grad_norm": 3.209406614303589, "kl": 0.275390625, "learning_rate": 5.543463661312847e-07, "loss": 0.0027, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 440.857177734375, "epoch": 0.29669631512071154, "grad_norm": 0.05284934490919113, "kl": 0.26171875, "learning_rate": 5.512085379471808e-07, "loss": 0.0029, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 445.2857360839844, "epoch": 0.29733163913595934, "grad_norm": 3.3668439388275146, "kl": 0.29296875, "learning_rate": 5.481629666490903e-07, "loss": 0.0029, "reward": 0.485714316368103, "reward_std": 0.08997353911399841, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.08571429550647736, "rewards/format_reward": 1.0, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 541.0, "epoch": 0.29796696315120713, "grad_norm": 2.8946890830993652, "kl": 0.2216796875, "learning_rate": 5.452097800230853e-07, "loss": 0.0022, "reward": 0.3428571820259094, "reward_std": 0.1511857956647873, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 0.8571429252624512, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 425.4285888671875, "epoch": 0.29860228716645487, "grad_norm": 0.04910692945122719, "kl": 0.1669921875, "learning_rate": 5.423491019789623e-07, "loss": 0.002, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 238.2857208251953, "epoch": 0.29923761118170267, "grad_norm": 0.06361490488052368, "kl": 0.208984375, "learning_rate": 5.395810525450425e-07, "loss": 0.0033, "reward": 1.4000002145767212, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 1.0, "rewards/format_reward": 1.0, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 395.8571472167969, "epoch": 0.29987293519695046, "grad_norm": 0.04494641348719597, "kl": 0.2041015625, "learning_rate": 5.369057478631359e-07, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 438.0000305175781, "epoch": 0.3005082592121982, "grad_norm": 3.899231195449829, "kl": 0.2099609375, "learning_rate": 5.343233001836694e-07, "loss": 0.0021, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 566.1428833007812, "epoch": 0.301143583227446, "grad_norm": 2.999528646469116, "kl": 0.1689453125, "learning_rate": 5.318338178609754e-07, "loss": 0.0017, "reward": 0.42418450117111206, "reward_std": 0.06398611515760422, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.02418447658419609, "rewards/format_reward": 1.0, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 505.2857360839844, "epoch": 0.3017789072426938, "grad_norm": 0.056534409523010254, "kl": 0.2138671875, "learning_rate": 5.294374053487459e-07, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 376.71429443359375, "epoch": 0.30241423125794153, "grad_norm": 3.721620798110962, "kl": 0.212890625, "learning_rate": 5.271341631956511e-07, "loss": 0.0021, "reward": 0.5836734771728516, "reward_std": 0.3662114441394806, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.18367348611354828, "rewards/format_reward": 1.0, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 492.14288330078125, "epoch": 0.30304955527318933, "grad_norm": 0.05311813950538635, "kl": 0.251953125, "learning_rate": 5.249241880411181e-07, "loss": 0.0028, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 510.0000305175781, "epoch": 0.3036848792884371, "grad_norm": 3.715238332748413, "kl": 0.240234375, "learning_rate": 5.228075726112785e-07, "loss": 0.0024, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 455.0000305175781, "epoch": 0.30432020330368487, "grad_norm": 3.2647809982299805, "kl": 0.1640625, "learning_rate": 5.207844057150768e-07, "loss": 0.0016, "reward": 0.5428571701049805, "reward_std": 0.37796446681022644, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 481.71429443359375, "epoch": 0.30495552731893266, "grad_norm": 0.06345637142658234, "kl": 0.265625, "learning_rate": 5.188547722405437e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 439.14288330078125, "epoch": 0.30559085133418046, "grad_norm": 0.06628384441137314, "kl": 0.26953125, "learning_rate": 5.170187531512351e-07, "loss": 0.003, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 422.14288330078125, "epoch": 0.3062261753494282, "grad_norm": 2.7409112453460693, "kl": 0.1591796875, "learning_rate": 5.152764254828348e-07, "loss": 0.0016, "reward": 0.5428571701049805, "reward_std": 0.3779645562171936, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 573.1428833007812, "epoch": 0.306861499364676, "grad_norm": 0.041587937623262405, "kl": 0.140625, "learning_rate": 5.136278623399225e-07, "loss": 0.0017, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 375.0000305175781, "epoch": 0.30749682337992373, "grad_norm": 2.9096410274505615, "kl": 0.2001953125, "learning_rate": 5.120731328929058e-07, "loss": 0.002, "reward": 0.971428632736206, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.5714285969734192, "rewards/format_reward": 1.0, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 454.0000305175781, "epoch": 0.30813214739517153, "grad_norm": 3.31744122505188, "kl": 0.21484375, "learning_rate": 5.106123023751187e-07, "loss": 0.0021, "reward": 0.8511278033256531, "reward_std": 0.23666103184223175, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.451127827167511, "rewards/format_reward": 1.0, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 416.4285888671875, "epoch": 0.3087674714104193, "grad_norm": 2.8466956615448, "kl": 0.19140625, "learning_rate": 5.092454320800833e-07, "loss": 0.0019, "reward": 0.4338059425354004, "reward_std": 0.06710861623287201, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.03380589187145233, "rewards/format_reward": 1.0, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 466.4285888671875, "epoch": 0.30940279542566707, "grad_norm": 5.7825751304626465, "kl": 0.65234375, "learning_rate": 5.079725793589405e-07, "loss": 0.0068, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 358.4285888671875, "epoch": 0.31003811944091486, "grad_norm": 0.06072893738746643, "kl": 0.1953125, "learning_rate": 5.067937976180407e-07, "loss": 0.0022, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 504.2857360839844, "epoch": 0.31067344345616266, "grad_norm": 0.03951825201511383, "kl": 0.1630859375, "learning_rate": 5.057091363167046e-07, "loss": 0.0019, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 235.1428680419922, "epoch": 0.3113087674714104, "grad_norm": 3.270069122314453, "kl": 0.224609375, "learning_rate": 5.047186409651489e-07, "loss": 0.0022, "reward": 1.3428572416305542, "reward_std": 0.15118582546710968, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 1.0, "rewards/format_reward": 0.8571429252624512, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 509.0000305175781, "epoch": 0.3119440914866582, "grad_norm": 0.1327732652425766, "kl": 0.298828125, "learning_rate": 5.038223531225742e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 507.2857360839844, "epoch": 0.312579415501906, "grad_norm": 3.286302328109741, "kl": 0.2353515625, "learning_rate": 5.030203103954232e-07, "loss": 0.0024, "reward": 0.37142860889434814, "reward_std": 0.07559289783239365, "rewards/code_format_reward": 0.8571429252624512, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 433.71429443359375, "epoch": 0.31321473951715373, "grad_norm": 0.04749950394034386, "kl": 0.173828125, "learning_rate": 5.023125464358026e-07, "loss": 0.002, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 402.2857360839844, "epoch": 0.3138500635324015, "grad_norm": 3.375169515609741, "kl": 0.26171875, "learning_rate": 5.016990909400709e-07, "loss": 0.0026, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 491.857177734375, "epoch": 0.3144853875476493, "grad_norm": 0.05139714851975441, "kl": 0.296875, "learning_rate": 5.011799696475915e-07, "loss": 0.0033, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 323.5714416503906, "epoch": 0.31512071156289706, "grad_norm": 3.7574667930603027, "kl": 0.181640625, "learning_rate": 5.007552043396547e-07, "loss": 0.0018, "reward": 0.5428571701049805, "reward_std": 0.3779645264148712, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.1428571492433548, "rewards/format_reward": 1.0, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 534.857177734375, "epoch": 0.31575603557814486, "grad_norm": 2.853102445602417, "kl": 0.1806640625, "learning_rate": 5.004248128385618e-07, "loss": 0.0018, "reward": 0.6857143640518188, "reward_std": 0.4879501163959503, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.2857142984867096, "rewards/format_reward": 1.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 422.8571472167969, "epoch": 0.31639135959339265, "grad_norm": 3.089301824569702, "kl": 0.1923828125, "learning_rate": 5.001888090068784e-07, "loss": 0.0019, "reward": 0.8285714983940125, "reward_std": 0.534522533416748, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.4285714626312256, "rewards/format_reward": 1.0, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 451.2857360839844, "epoch": 0.3170266836086404, "grad_norm": 5.161578178405762, "kl": 0.57421875, "learning_rate": 5.000472027468528e-07, "loss": 0.0057, "reward": 0.2857142984867096, "reward_std": 0.1951800137758255, "rewards/code_format_reward": 0.7142857313156128, "rewards/code_reward": 0.0, "rewards/format_reward": 0.7142857313156128, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 611.4285888671875, "epoch": 0.3176620076238882, "grad_norm": 0.048863135278224945, "kl": 0.2060546875, "learning_rate": 5.000000000000001e-07, "loss": 0.0024, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 444.857177734375, "epoch": 0.318297331639136, "grad_norm": 0.047799259424209595, "kl": 0.19921875, "learning_rate": 5.000472027468528e-07, "loss": 0.0023, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 446.5714416503906, "epoch": 0.3189326556543837, "grad_norm": 0.15391913056373596, "kl": 0.283203125, "learning_rate": 5.001888090068784e-07, "loss": 0.0031, "reward": 0.40000003576278687, "reward_std": 0.0, "rewards/code_format_reward": 1.0, "rewards/code_reward": 0.0, "rewards/format_reward": 1.0, "step": 502 }, { "epoch": 0.3189326556543837, "step": 502, "total_flos": 0.0, "train_loss": 6.229176085843033e-06, "train_runtime": 154.2548, "train_samples_per_second": 22.69, "train_steps_per_second": 3.241 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }