diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" deleted file mode 100644--- "a/checkpoint-500/trainer_state.json" +++ /dev/null @@ -1,9034 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "episode": 8000, - "epoch": 0.14379695869432363, - "eval_steps": 500, - "global_step": 500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "episode": 16, - "epoch": 0.00028759391738864725, - "loss/policy_avg": 0.04147649183869362, - "lr": 1e-05, - "objective/entropy": 119.65733337402344, - "objective/kl": 15.623376846313477, - "objective/non_score_reward": -1.5623377561569214, - "objective/rlhf_reward": -3.849351099133491, - "objective/scores": 0.6, - "policy/approxkl_avg": 473.7090759277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7531497478485107, - "step": 0, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990334510803223 - }, - { - "episode": 32, - "epoch": 0.0005751878347772945, - "loss/policy_avg": 0.09634321182966232, - "lr": 9.999360940695298e-06, - "objective/entropy": -24.297130584716797, - "objective/kl": 11.720248222351074, - "objective/non_score_reward": -1.1720247268676758, - "objective/rlhf_reward": -3.2880991645157334, - "objective/scores": 0.35, - "policy/approxkl_avg": 233.3876953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6364185214042664, - "step": 1, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988316297531128 - }, - { - "episode": 48, - "epoch": 0.0008627817521659417, - "loss/policy_avg": 0.5879926681518555, - "lr": 9.998721881390595e-06, - "objective/entropy": -123.47531127929688, - "objective/kl": 7.935818672180176, - "objective/non_score_reward": -0.7935818433761597, - "objective/rlhf_reward": -0.25060838157055054, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 142.57273864746094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6163707971572876, - "step": 2, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999882459640503 - }, - { - "episode": 64, - "epoch": 0.001150375669554589, - "loss/policy_avg": 0.380592405796051, - "lr": 9.99808282208589e-06, - "objective/entropy": -117.48745727539062, - "objective/kl": 10.153940200805664, - "objective/non_score_reward": -1.0153939723968506, - "objective/rlhf_reward": -2.682973676411015, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 190.00497436523438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5329767465591431, - "step": 3, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998424053192139 - }, - { - "episode": 80, - "epoch": 0.001437969586943236, - "loss/policy_avg": 0.14582836627960205, - "lr": 9.997443762781187e-06, - "objective/entropy": -217.63848876953125, - "objective/kl": 10.502876281738281, - "objective/non_score_reward": -1.0502876043319702, - "objective/rlhf_reward": -2.777318381418554, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 221.2613067626953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6339143514633179, - "step": 4, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9996079206466675 - }, - { - "episode": 96, - "epoch": 0.0017255635043318834, - "loss/policy_avg": 0.12740007042884827, - "lr": 9.996804703476484e-06, - "objective/entropy": 398.1901550292969, - "objective/kl": 14.20137882232666, - "objective/non_score_reward": -1.420137882232666, - "objective/rlhf_reward": -3.28055148422718, - "objective/scores": 0.6, - "policy/approxkl_avg": 349.208740234375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 1.005652904510498, - "step": 5, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985274076461792 - }, - { - "episode": 112, - "epoch": 0.0020131574217205307, - "loss/policy_avg": 0.1509546935558319, - "lr": 9.99616564417178e-06, - "objective/entropy": -124.58861541748047, - "objective/kl": 8.397514343261719, - "objective/non_score_reward": -0.8397514224052429, - "objective/rlhf_reward": -1.9590056151151658, - "objective/scores": 0.35, - "policy/approxkl_avg": 87.99980163574219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7696092128753662, - "step": 6, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961457252502441 - }, - { - "episode": 128, - "epoch": 0.002300751339109178, - "loss/policy_avg": 0.07236729562282562, - "lr": 9.995526584867077e-06, - "objective/entropy": -62.749176025390625, - "objective/kl": 10.19581413269043, - "objective/non_score_reward": -1.0195814371109009, - "objective/rlhf_reward": -2.3449923555056253, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 151.23446655273438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7507286071777344, - "step": 7, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975416660308838 - }, - { - "episode": 144, - "epoch": 0.002588345256497825, - "loss/policy_avg": 0.1384029984474182, - "lr": 9.994887525562374e-06, - "objective/entropy": -143.49945068359375, - "objective/kl": 12.088400840759277, - "objective/non_score_reward": -1.2088401317596436, - "objective/rlhf_reward": -3.435360452532768, - "objective/scores": 0.35, - "policy/approxkl_avg": 150.72146606445312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7477531433105469, - "step": 8, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000879764556885 - }, - { - "episode": 160, - "epoch": 0.002875939173886472, - "loss/policy_avg": -0.009389623999595642, - "lr": 9.99424846625767e-06, - "objective/entropy": -8.538755416870117, - "objective/kl": 4.930829048156738, - "objective/non_score_reward": -0.4930829405784607, - "objective/rlhf_reward": -0.6306961386496122, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 21.575889587402344, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4435485005378723, - "step": 9, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00315523147583 - }, - { - "episode": 176, - "epoch": 0.0031635330912751195, - "loss/policy_avg": 0.09865772724151611, - "lr": 9.993609406952966e-06, - "objective/entropy": -17.656417846679688, - "objective/kl": 7.901223659515381, - "objective/non_score_reward": -0.790122389793396, - "objective/rlhf_reward": -1.213078211026128, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 61.98566436767578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7360714673995972, - "step": 10, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977571964263916 - }, - { - "episode": 192, - "epoch": 0.0034511270086637668, - "loss/policy_avg": -0.005021991208195686, - "lr": 9.992970347648263e-06, - "objective/entropy": -36.69260787963867, - "objective/kl": 10.859649658203125, - "objective/non_score_reward": -1.0859650373458862, - "objective/rlhf_reward": -1.9438601382076737, - "objective/scores": 0.6, - "policy/approxkl_avg": 145.91165161132812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4879041314125061, - "step": 11, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994498491287231 - }, - { - "episode": 208, - "epoch": 0.003738720926052414, - "loss/policy_avg": 0.35356682538986206, - "lr": 9.992331288343558e-06, - "objective/entropy": -69.72517395019531, - "objective/kl": 10.624967575073242, - "objective/non_score_reward": -1.0624967813491821, - "objective/rlhf_reward": -2.6937278797298223, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 142.52261352539062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6256821751594543, - "step": 12, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9967740774154663 - }, - { - "episode": 224, - "epoch": 0.004026314843441061, - "loss/policy_avg": 0.24467170238494873, - "lr": 9.991692229038855e-06, - "objective/entropy": -115.99034881591797, - "objective/kl": 11.337324142456055, - "objective/non_score_reward": -1.1337324380874634, - "objective/rlhf_reward": -2.8730703942185505, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 95.43186950683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5122163891792297, - "step": 13, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000202178955078 - }, - { - "episode": 240, - "epoch": 0.004313908760829708, - "loss/policy_avg": 0.36638143658638, - "lr": 9.991053169734152e-06, - "objective/entropy": 90.19092559814453, - "objective/kl": 8.482120513916016, - "objective/non_score_reward": -0.8482120633125305, - "objective/rlhf_reward": -1.5680194973674526, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 103.84627532958984, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45586448907852173, - "step": 14, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0012996196746826 - }, - { - "episode": 256, - "epoch": 0.004601502678218356, - "loss/policy_avg": 0.3564397394657135, - "lr": 9.990414110429449e-06, - "objective/entropy": 62.88275146484375, - "objective/kl": 8.093853950500488, - "objective/non_score_reward": -0.8093854188919067, - "objective/rlhf_reward": -1.7565889535502193, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 129.63275146484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5616360902786255, - "step": 15, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000332832336426 - }, - { - "episode": 272, - "epoch": 0.004889096595607003, - "loss/policy_avg": 0.731740415096283, - "lr": 9.989775051124744e-06, - "objective/entropy": 175.25027465820312, - "objective/kl": 13.653030395507812, - "objective/non_score_reward": -1.3653030395507812, - "objective/rlhf_reward": -4.037380088766185, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 197.69329833984375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6656568050384521, - "step": 16, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004138946533203 - }, - { - "episode": 288, - "epoch": 0.00517669051299565, - "loss/policy_avg": 0.0038209843914955854, - "lr": 9.989135991820041e-06, - "objective/entropy": 166.37741088867188, - "objective/kl": 11.93104362487793, - "objective/non_score_reward": -1.1931045055389404, - "objective/rlhf_reward": -3.3724178135395046, - "objective/scores": 0.35, - "policy/approxkl_avg": 123.38684844970703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4346945881843567, - "step": 17, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002516269683838 - }, - { - "episode": 304, - "epoch": 0.0054642844303842975, - "loss/policy_avg": 0.5328235626220703, - "lr": 9.988496932515338e-06, - "objective/entropy": -59.579795837402344, - "objective/kl": 14.574970245361328, - "objective/non_score_reward": -1.457497000694275, - "objective/rlhf_reward": -4.273728727307871, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 107.66255187988281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6419472098350525, - "step": 18, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002565383911133 - }, - { - "episode": 320, - "epoch": 0.005751878347772944, - "loss/policy_avg": 0.1068505123257637, - "lr": 9.987857873210635e-06, - "objective/entropy": 25.82529067993164, - "objective/kl": 7.757124900817871, - "objective/non_score_reward": -0.7757124900817871, - "objective/rlhf_reward": -1.702849841117859, - "objective/scores": 0.35, - "policy/approxkl_avg": 35.83104705810547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.37679600715637207, - "step": 19, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0017640590667725 - }, - { - "episode": 336, - "epoch": 0.006039472265161592, - "loss/policy_avg": 0.9153174757957458, - "lr": 9.987218813905932e-06, - "objective/entropy": 123.23423767089844, - "objective/kl": 15.62867546081543, - "objective/non_score_reward": -1.5628674030303955, - "objective/rlhf_reward": -4.770517232830882, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 175.58567810058594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6862951517105103, - "step": 20, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9951945543289185 - }, - { - "episode": 352, - "epoch": 0.006327066182550239, - "loss/policy_avg": 0.13535380363464355, - "lr": 9.986579754601228e-06, - "objective/entropy": 106.94303894042969, - "objective/kl": 14.264102935791016, - "objective/non_score_reward": -1.42641019821167, - "objective/rlhf_reward": -2.781921450735304, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 203.86151123046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4334957003593445, - "step": 21, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984745979309082 - }, - { - "episode": 368, - "epoch": 0.006614660099938887, - "loss/policy_avg": 0.08913514018058777, - "lr": 9.985940695296524e-06, - "objective/entropy": 86.8988037109375, - "objective/kl": 14.969903945922852, - "objective/non_score_reward": -1.4969902038574219, - "objective/rlhf_reward": -4.564128805597392, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 204.34201049804688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6533622741699219, - "step": 22, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993085861206055 - }, - { - "episode": 384, - "epoch": 0.0069022540173275335, - "loss/policy_avg": 0.4681934416294098, - "lr": 9.98530163599182e-06, - "objective/entropy": -86.89934539794922, - "objective/kl": 17.868688583374023, - "objective/non_score_reward": -1.7868685722351074, - "objective/rlhf_reward": -5.821961793929262, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 180.03530883789062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6652738451957703, - "step": 23, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9969769716262817 - }, - { - "episode": 400, - "epoch": 0.00718984793471618, - "loss/policy_avg": 0.05787897855043411, - "lr": 9.984662576687117e-06, - "objective/entropy": 217.01751708984375, - "objective/kl": 7.942338466644287, - "objective/non_score_reward": -0.7942339181900024, - "objective/rlhf_reward": -1.7769354641437531, - "objective/scores": 0.35, - "policy/approxkl_avg": 14.617660522460938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7284016609191895, - "step": 24, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0007760524749756 - }, - { - "episode": 416, - "epoch": 0.007477441852104828, - "loss/policy_avg": 0.17751406133174896, - "lr": 9.984023517382414e-06, - "objective/entropy": 79.38223266601562, - "objective/kl": 13.876078605651855, - "objective/non_score_reward": -1.3876079320907593, - "objective/rlhf_reward": -4.191181921695156, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 129.87246704101562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9167462587356567, - "step": 25, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985355138778687 - }, - { - "episode": 432, - "epoch": 0.007765035769493475, - "loss/policy_avg": 0.529009222984314, - "lr": 9.983384458077711e-06, - "objective/entropy": -31.18558120727539, - "objective/kl": 14.786969184875488, - "objective/non_score_reward": -1.4786969423294067, - "objective/rlhf_reward": -4.5361854816354334, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 125.92539978027344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.43768489360809326, - "step": 26, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984157085418701 - }, - { - "episode": 448, - "epoch": 0.008052629686882123, - "loss/policy_avg": 0.3665599822998047, - "lr": 9.982745398773006e-06, - "objective/entropy": 23.827144622802734, - "objective/kl": 13.60982894897461, - "objective/non_score_reward": -1.360982894897461, - "objective/rlhf_reward": -5.443931698799133, - "objective/scores": 0.0, - "policy/approxkl_avg": 127.14844512939453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5104779601097107, - "step": 27, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987092018127441 - }, - { - "episode": 464, - "epoch": 0.00834022360427077, - "loss/policy_avg": 0.32786238193511963, - "lr": 9.982106339468303e-06, - "objective/entropy": 128.9566650390625, - "objective/kl": 11.556554794311523, - "objective/non_score_reward": -1.1556555032730103, - "objective/rlhf_reward": -3.297109279662294, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 59.29738998413086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4545682668685913, - "step": 28, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0006556510925293 - }, - { - "episode": 480, - "epoch": 0.008627817521659416, - "loss/policy_avg": 0.2694750428199768, - "lr": 9.9814672801636e-06, - "objective/entropy": 78.4908447265625, - "objective/kl": 9.683059692382812, - "objective/non_score_reward": -0.9683058857917786, - "objective/rlhf_reward": -2.5139735278829765, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 102.18389892578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.659131646156311, - "step": 29, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980244636535645 - }, - { - "episode": 496, - "epoch": 0.008915411439048063, - "loss/policy_avg": -0.2861338257789612, - "lr": 9.980828220858897e-06, - "objective/entropy": -90.27975463867188, - "objective/kl": 7.361126899719238, - "objective/non_score_reward": -0.7361127138137817, - "objective/rlhf_reward": -1.1196221962300053, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 76.95925903320312, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5377018451690674, - "step": 30, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.006430149078369 - }, - { - "episode": 512, - "epoch": 0.009203005356436712, - "loss/policy_avg": 0.1336214542388916, - "lr": 9.980189161554194e-06, - "objective/entropy": 153.1845703125, - "objective/kl": 12.326415061950684, - "objective/non_score_reward": -1.232641577720642, - "objective/rlhf_reward": -3.5713163701042365, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 140.37075805664062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7196662425994873, - "step": 31, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995383024215698 - }, - { - "episode": 528, - "epoch": 0.009490599273825359, - "loss/policy_avg": -0.03590531647205353, - "lr": 9.97955010224949e-06, - "objective/entropy": -60.39399719238281, - "objective/kl": 7.551569938659668, - "objective/non_score_reward": -0.7551569938659668, - "objective/rlhf_reward": -1.6613781240925025, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 36.98230743408203, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.617447018623352, - "step": 32, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0055394172668457 - }, - { - "episode": 544, - "epoch": 0.009778193191214006, - "loss/policy_avg": 0.23507678508758545, - "lr": 9.978911042944786e-06, - "objective/entropy": -62.405269622802734, - "objective/kl": 12.254663467407227, - "objective/non_score_reward": -1.2254663705825806, - "objective/rlhf_reward": -3.386093669923481, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 28.730735778808594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5197240114212036, - "step": 33, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973214864730835 - }, - { - "episode": 560, - "epoch": 0.010065787108602653, - "loss/policy_avg": 0.16142824292182922, - "lr": 9.978271983640083e-06, - "objective/entropy": 63.909202575683594, - "objective/kl": 11.40770149230957, - "objective/non_score_reward": -1.1407701969146729, - "objective/rlhf_reward": -1.6393620713960855, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 103.13188171386719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6455787420272827, - "step": 34, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994690418243408 - }, - { - "episode": 576, - "epoch": 0.0103533810259913, - "loss/policy_avg": 0.10174459218978882, - "lr": 9.977632924335378e-06, - "objective/entropy": -30.112831115722656, - "objective/kl": 17.954376220703125, - "objective/non_score_reward": -1.7954376935958862, - "objective/rlhf_reward": -5.059044542089973, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 330.9220886230469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.519372820854187, - "step": 35, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978516101837158 - }, - { - "episode": 592, - "epoch": 0.010640974943379948, - "loss/policy_avg": 0.47705915570259094, - "lr": 9.976993865030675e-06, - "objective/entropy": 302.72314453125, - "objective/kl": 19.512754440307617, - "objective/non_score_reward": -1.9512755870819092, - "objective/rlhf_reward": -3.4051021099090573, - "objective/scores": 1.1, - "policy/approxkl_avg": 104.70938110351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8252858519554138, - "step": 36, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983307123184204 - }, - { - "episode": 608, - "epoch": 0.010928568860768595, - "loss/policy_avg": 0.3472205400466919, - "lr": 9.976354805725972e-06, - "objective/entropy": -59.4378662109375, - "objective/kl": 10.388540267944336, - "objective/non_score_reward": -1.0388540029525757, - "objective/rlhf_reward": -2.3305873229828586, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.55358123779297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6380844712257385, - "step": 37, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981659650802612 - }, - { - "episode": 624, - "epoch": 0.011216162778157242, - "loss/policy_avg": 0.44485026597976685, - "lr": 9.975715746421269e-06, - "objective/entropy": 76.74449157714844, - "objective/kl": 10.349222183227539, - "objective/non_score_reward": -1.0349223613739014, - "objective/rlhf_reward": -4.139689266681671, - "objective/scores": 0.0, - "policy/approxkl_avg": 78.09274291992188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6423808336257935, - "step": 38, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9971048831939697 - }, - { - "episode": 640, - "epoch": 0.011503756695545889, - "loss/policy_avg": 0.37319111824035645, - "lr": 9.975076687116566e-06, - "objective/entropy": -67.30467224121094, - "objective/kl": 19.358768463134766, - "objective/non_score_reward": -1.9358769655227661, - "objective/rlhf_reward": -6.227735841067966, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 161.26229858398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6262432336807251, - "step": 39, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987332820892334 - }, - { - "episode": 656, - "epoch": 0.011791350612934537, - "loss/policy_avg": 0.389024943113327, - "lr": 9.97443762781186e-06, - "objective/entropy": 210.994384765625, - "objective/kl": 11.99485969543457, - "objective/non_score_reward": -1.1994858980178833, - "objective/rlhf_reward": -4.797943651676178, - "objective/scores": 0.0, - "policy/approxkl_avg": 79.62628173828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7361236810684204, - "step": 40, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987305402755737 - }, - { - "episode": 672, - "epoch": 0.012078944530323184, - "loss/policy_avg": 0.4818825125694275, - "lr": 9.973798568507158e-06, - "objective/entropy": 280.91552734375, - "objective/kl": 17.216154098510742, - "objective/non_score_reward": -1.7216153144836426, - "objective/rlhf_reward": -5.435863117785797, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 82.88700866699219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7085442543029785, - "step": 41, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000746488571167 - }, - { - "episode": 688, - "epoch": 0.012366538447711831, - "loss/policy_avg": 0.7192404270172119, - "lr": 9.973159509202454e-06, - "objective/entropy": 89.66543579101562, - "objective/kl": 12.255132675170898, - "objective/non_score_reward": -1.2255134582519531, - "objective/rlhf_reward": -0.5020534753799435, - "objective/scores": 1.1, - "policy/approxkl_avg": 79.93511199951172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5533976554870605, - "step": 42, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999737739562988 - }, - { - "episode": 704, - "epoch": 0.012654132365100478, - "loss/policy_avg": 0.0401420071721077, - "lr": 9.972520449897751e-06, - "objective/entropy": 175.6131591796875, - "objective/kl": 12.72716999053955, - "objective/non_score_reward": -1.272716999053955, - "objective/rlhf_reward": -2.167148996831152, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 138.8005828857422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8866395354270935, - "step": 43, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959650039672852 - }, - { - "episode": 720, - "epoch": 0.012941726282489125, - "loss/policy_avg": 0.5428536534309387, - "lr": 9.971881390593048e-06, - "objective/entropy": 122.98509216308594, - "objective/kl": 14.87851619720459, - "objective/non_score_reward": -1.487851619720459, - "objective/rlhf_reward": -4.435634577068027, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 57.47890853881836, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7034124135971069, - "step": 44, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998563528060913 - }, - { - "episode": 736, - "epoch": 0.013229320199877773, - "loss/policy_avg": 1.027585744857788, - "lr": 9.971242331288345e-06, - "objective/entropy": 119.49530792236328, - "objective/kl": 18.71068572998047, - "objective/non_score_reward": -1.8710683584213257, - "objective/rlhf_reward": -5.659444983276437, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 190.9130859375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9129630923271179, - "step": 45, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993374347686768 - }, - { - "episode": 752, - "epoch": 0.01351691411726642, - "loss/policy_avg": -0.013658525422215462, - "lr": 9.97060327198364e-06, - "objective/entropy": 10.491897583007812, - "objective/kl": 13.526758193969727, - "objective/non_score_reward": -1.3526759147644043, - "objective/rlhf_reward": -1.0107036590576168, - "objective/scores": 1.1, - "policy/approxkl_avg": 102.07904815673828, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.482522189617157, - "step": 46, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015859603881836 - }, - { - "episode": 768, - "epoch": 0.013804508034655067, - "loss/policy_avg": 0.18925166130065918, - "lr": 9.969964212678937e-06, - "objective/entropy": -118.86809539794922, - "objective/kl": 10.978793144226074, - "objective/non_score_reward": -1.097879409790039, - "objective/rlhf_reward": -3.0498822390133435, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 104.2835693359375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6507794857025146, - "step": 47, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996565341949463 - }, - { - "episode": 784, - "epoch": 0.014092101952043714, - "loss/policy_avg": 0.5690521597862244, - "lr": 9.969325153374234e-06, - "objective/entropy": -175.16403198242188, - "objective/kl": 19.28797149658203, - "objective/non_score_reward": -1.9287970066070557, - "objective/rlhf_reward": -6.158928765860155, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 288.65631103515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6812887787818909, - "step": 48, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999005794525146 - }, - { - "episode": 800, - "epoch": 0.01437969586943236, - "loss/policy_avg": 0.5041743516921997, - "lr": 9.968686094069531e-06, - "objective/entropy": 318.1710205078125, - "objective/kl": 17.975252151489258, - "objective/non_score_reward": -1.79752516746521, - "objective/rlhf_reward": -5.84846519520822, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 390.5566101074219, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8034517765045166, - "step": 49, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9982198476791382 - }, - { - "episode": 816, - "epoch": 0.01466728978682101, - "loss/policy_avg": 0.21048909425735474, - "lr": 9.968047034764828e-06, - "objective/entropy": 16.597354888916016, - "objective/kl": 22.140174865722656, - "objective/non_score_reward": -2.214017629623413, - "objective/rlhf_reward": -6.733364107386146, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 327.35992431640625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4595518708229065, - "step": 50, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974133968353271 - }, - { - "episode": 832, - "epoch": 0.014954883704209656, - "loss/policy_avg": 0.6745895147323608, - "lr": 9.967407975460123e-06, - "objective/entropy": 26.577850341796875, - "objective/kl": 15.099103927612305, - "objective/non_score_reward": -1.5099103450775146, - "objective/rlhf_reward": -4.435521040026265, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 52.37441635131836, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5440022945404053, - "step": 51, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997567057609558 - }, - { - "episode": 848, - "epoch": 0.015242477621598303, - "loss/policy_avg": 1.5183483362197876, - "lr": 9.96676891615542e-06, - "objective/entropy": -133.34732055664062, - "objective/kl": 15.838411331176758, - "objective/non_score_reward": -1.58384108543396, - "objective/rlhf_reward": -4.9937288074785755, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 181.23886108398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.491477906703949, - "step": 52, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999549388885498 - }, - { - "episode": 864, - "epoch": 0.01553007153898695, - "loss/policy_avg": 1.0412685871124268, - "lr": 9.966129856850717e-06, - "objective/entropy": -12.032562255859375, - "objective/kl": 13.811055183410645, - "objective/non_score_reward": -1.3811054229736328, - "objective/rlhf_reward": -4.008649730476078, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 197.14422607421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5553240776062012, - "step": 53, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998724341392517 - }, - { - "episode": 880, - "epoch": 0.0158176654563756, - "loss/policy_avg": 0.11221161484718323, - "lr": 9.965490797546014e-06, - "objective/entropy": 239.8121795654297, - "objective/kl": 14.03902816772461, - "objective/non_score_reward": -1.4039026498794556, - "objective/rlhf_reward": -3.790781910690378, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 54.992515563964844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8224223852157593, - "step": 54, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998995304107666 - }, - { - "episode": 896, - "epoch": 0.016105259373764245, - "loss/policy_avg": 0.27755433320999146, - "lr": 9.96485173824131e-06, - "objective/entropy": 311.885009765625, - "objective/kl": 21.855777740478516, - "objective/non_score_reward": -2.185577630996704, - "objective/rlhf_reward": -8.742310643196106, - "objective/scores": 0.0, - "policy/approxkl_avg": 196.02963256835938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7535547018051147, - "step": 55, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981474876403809 - }, - { - "episode": 912, - "epoch": 0.016392853291152892, - "loss/policy_avg": 0.23765933513641357, - "lr": 9.964212678936606e-06, - "objective/entropy": -135.26939392089844, - "objective/kl": 17.994558334350586, - "objective/non_score_reward": -1.7994558811187744, - "objective/rlhf_reward": -5.464490548769633, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 143.15103149414062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7061681747436523, - "step": 56, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001481771469116 - }, - { - "episode": 928, - "epoch": 0.01668044720854154, - "loss/policy_avg": -0.09936670958995819, - "lr": 9.963573619631903e-06, - "objective/entropy": 273.41107177734375, - "objective/kl": 17.296648025512695, - "objective/non_score_reward": -1.7296650409698486, - "objective/rlhf_reward": -5.559410088990612, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 170.04476928710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.664265513420105, - "step": 57, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996645212173462 - }, - { - "episode": 944, - "epoch": 0.016968041125930186, - "loss/policy_avg": -0.40471351146698, - "lr": 9.9629345603272e-06, - "objective/entropy": 91.30682373046875, - "objective/kl": 9.7944974899292, - "objective/non_score_reward": -0.9794497489929199, - "objective/rlhf_reward": -2.361539780107096, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 46.471343994140625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6433554887771606, - "step": 58, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0122342109680176 - }, - { - "episode": 960, - "epoch": 0.017255635043318833, - "loss/policy_avg": 0.758541464805603, - "lr": 9.962295501022495e-06, - "objective/entropy": -28.099227905273438, - "objective/kl": 15.942657470703125, - "objective/non_score_reward": -1.5942658185958862, - "objective/rlhf_reward": -4.2543568483748775, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 199.81590270996094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6792592406272888, - "step": 59, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988605976104736 - }, - { - "episode": 976, - "epoch": 0.01754322896070748, - "loss/policy_avg": 0.11890214681625366, - "lr": 9.961656441717792e-06, - "objective/entropy": 48.058135986328125, - "objective/kl": 16.837148666381836, - "objective/non_score_reward": -1.6837148666381836, - "objective/rlhf_reward": -5.130739483896809, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 123.93780517578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8319634199142456, - "step": 60, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987045526504517 - }, - { - "episode": 992, - "epoch": 0.017830822878096127, - "loss/policy_avg": -0.3681066036224365, - "lr": 9.961017382413088e-06, - "objective/entropy": -12.798896789550781, - "objective/kl": 13.068469047546387, - "objective/non_score_reward": -1.3068468570709229, - "objective/rlhf_reward": -3.8857517450148156, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 29.726402282714844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.425646036863327, - "step": 61, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.004940986633301 - }, - { - "episode": 1008, - "epoch": 0.018118416795484777, - "loss/policy_avg": 0.6650391221046448, - "lr": 9.960378323108385e-06, - "objective/entropy": 172.82774353027344, - "objective/kl": 19.25320053100586, - "objective/non_score_reward": -1.9253199100494385, - "objective/rlhf_reward": -5.753868768887456, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 61.55193328857422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.707613468170166, - "step": 62, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9996411800384521 - }, - { - "episode": 1024, - "epoch": 0.018406010712873424, - "loss/policy_avg": 0.006029143929481506, - "lr": 9.959739263803682e-06, - "objective/entropy": 84.45201110839844, - "objective/kl": 9.024871826171875, - "objective/non_score_reward": -0.9024871587753296, - "objective/rlhf_reward": -2.1593505545571894, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 54.238502502441406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7926748991012573, - "step": 63, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0010342597961426 - }, - { - "episode": 1040, - "epoch": 0.01869360463026207, - "loss/policy_avg": 0.43513649702072144, - "lr": 9.959100204498979e-06, - "objective/entropy": 271.2078857421875, - "objective/kl": 19.053577423095703, - "objective/non_score_reward": -1.905357837677002, - "objective/rlhf_reward": -6.2959189749061295, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 146.22186279296875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 1.0022022724151611, - "step": 64, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983612298965454 - }, - { - "episode": 1056, - "epoch": 0.018981198547650718, - "loss/policy_avg": 0.207576721906662, - "lr": 9.958461145194274e-06, - "objective/entropy": 61.16169738769531, - "objective/kl": 12.455079078674316, - "objective/non_score_reward": -1.2455079555511475, - "objective/rlhf_reward": -3.1572031929817905, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 90.93212890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8675985336303711, - "step": 65, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000087022781372 - }, - { - "episode": 1072, - "epoch": 0.019268792465039365, - "loss/policy_avg": 0.036766890436410904, - "lr": 9.957822085889571e-06, - "objective/entropy": -109.95204162597656, - "objective/kl": 18.774991989135742, - "objective/non_score_reward": -1.8774993419647217, - "objective/rlhf_reward": -5.776663676897684, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 143.13140869140625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6979721188545227, - "step": 66, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986827373504639 - }, - { - "episode": 1088, - "epoch": 0.01955638638242801, - "loss/policy_avg": 0.2812209725379944, - "lr": 9.957183026584868e-06, - "objective/entropy": 347.89093017578125, - "objective/kl": 18.375164031982422, - "objective/non_score_reward": -1.8375165462493896, - "objective/rlhf_reward": -5.688206379831421, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 156.17637634277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8508192300796509, - "step": 67, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0011837482452393 - }, - { - "episode": 1104, - "epoch": 0.019843980299816658, - "loss/policy_avg": 0.4886673092842102, - "lr": 9.956543967280165e-06, - "objective/entropy": 386.39532470703125, - "objective/kl": 21.181537628173828, - "objective/non_score_reward": -2.1181535720825195, - "objective/rlhf_reward": -8.472614765167236, - "objective/scores": 0.0, - "policy/approxkl_avg": 175.6392822265625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9587714672088623, - "step": 68, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981954097747803 - }, - { - "episode": 1120, - "epoch": 0.020131574217205305, - "loss/policy_avg": 0.20124448835849762, - "lr": 9.955904907975462e-06, - "objective/entropy": -67.81639099121094, - "objective/kl": 18.70073127746582, - "objective/non_score_reward": -1.8700731992721558, - "objective/rlhf_reward": -6.05646069784936, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 92.5486831665039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6175429224967957, - "step": 69, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00050950050354 - }, - { - "episode": 1136, - "epoch": 0.020419168134593952, - "loss/policy_avg": -0.021352097392082214, - "lr": 9.955265848670757e-06, - "objective/entropy": -17.604766845703125, - "objective/kl": 21.45330810546875, - "objective/non_score_reward": -2.1453306674957275, - "objective/rlhf_reward": -6.919463162839996, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 400.26580810546875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9343768358230591, - "step": 70, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998741149902344 - }, - { - "episode": 1152, - "epoch": 0.0207067620519826, - "loss/policy_avg": 1.1225731372833252, - "lr": 9.954626789366054e-06, - "objective/entropy": 25.78099822998047, - "objective/kl": 14.004438400268555, - "objective/non_score_reward": -1.4004437923431396, - "objective/rlhf_reward": -3.776946599754404, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 111.43013000488281, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5702972412109375, - "step": 71, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973070621490479 - }, - { - "episode": 1168, - "epoch": 0.02099435596937125, - "loss/policy_avg": 0.25385117530822754, - "lr": 9.95398773006135e-06, - "objective/entropy": -119.72091674804688, - "objective/kl": 15.869585037231445, - "objective/non_score_reward": -1.586958408355713, - "objective/rlhf_reward": -4.400422762112553, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 68.1309814453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6043493151664734, - "step": 72, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9978251457214355 - }, - { - "episode": 1184, - "epoch": 0.021281949886759896, - "loss/policy_avg": 1.0585708618164062, - "lr": 9.953348670756648e-06, - "objective/entropy": 175.019775390625, - "objective/kl": 22.812929153442383, - "objective/non_score_reward": -2.2812929153442383, - "objective/rlhf_reward": -7.7996585703193375, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 238.55490112304688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.764386773109436, - "step": 73, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998070478439331 - }, - { - "episode": 1200, - "epoch": 0.021569543804148543, - "loss/policy_avg": 0.3202959895133972, - "lr": 9.952709611451944e-06, - "objective/entropy": -55.03008270263672, - "objective/kl": 20.011316299438477, - "objective/non_score_reward": -2.001131772994995, - "objective/rlhf_reward": -5.60452709197998, - "objective/scores": 0.6, - "policy/approxkl_avg": 218.61663818359375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7646293640136719, - "step": 74, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000119686126709 - }, - { - "episode": 1216, - "epoch": 0.02185713772153719, - "loss/policy_avg": 0.07566210627555847, - "lr": 9.952070552147241e-06, - "objective/entropy": -71.89826965332031, - "objective/kl": 18.70985984802246, - "objective/non_score_reward": -1.8709862232208252, - "objective/rlhf_reward": -5.536533663945134, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 150.85037231445312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.622381865978241, - "step": 75, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000347137451172 - }, - { - "episode": 1232, - "epoch": 0.022144731638925837, - "loss/policy_avg": 1.0919684171676636, - "lr": 9.951431492842536e-06, - "objective/entropy": 52.233760833740234, - "objective/kl": 16.872692108154297, - "objective/non_score_reward": -1.6872694492340088, - "objective/rlhf_reward": -5.1449576354661755, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 106.15821838378906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5982179641723633, - "step": 76, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982967376708984 - }, - { - "episode": 1248, - "epoch": 0.022432325556314484, - "loss/policy_avg": 0.08637362718582153, - "lr": 9.950792433537833e-06, - "objective/entropy": -41.915985107421875, - "objective/kl": 13.70026969909668, - "objective/non_score_reward": -1.37002694606781, - "objective/rlhf_reward": -3.81824833673297, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 65.06179809570312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7901297807693481, - "step": 77, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979660511016846 - }, - { - "episode": 1264, - "epoch": 0.02271991947370313, - "loss/policy_avg": 0.25904232263565063, - "lr": 9.950153374233129e-06, - "objective/entropy": 80.24528503417969, - "objective/kl": 11.508593559265137, - "objective/non_score_reward": -1.1508593559265137, - "objective/rlhf_reward": -3.1224849848107095, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 80.6361312866211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5768579244613647, - "step": 78, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997675895690918 - }, - { - "episode": 1280, - "epoch": 0.023007513391091777, - "loss/policy_avg": 1.0851349830627441, - "lr": 9.949514314928425e-06, - "objective/entropy": 179.42474365234375, - "objective/kl": 17.690536499023438, - "objective/non_score_reward": -1.7690538167953491, - "objective/rlhf_reward": -5.128804068045552, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 140.53465270996094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5955301523208618, - "step": 79, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972807168960571 - }, - { - "episode": 1296, - "epoch": 0.023295107308480424, - "loss/policy_avg": 0.10646107792854309, - "lr": 9.948875255623722e-06, - "objective/entropy": 192.84939575195312, - "objective/kl": 20.0378360748291, - "objective/non_score_reward": -2.003783702850342, - "objective/rlhf_reward": -8.015134632587433, - "objective/scores": 0.0, - "policy/approxkl_avg": 228.60391235351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5153179168701172, - "step": 80, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999485969543457 - }, - { - "episode": 1312, - "epoch": 0.023582701225869074, - "loss/policy_avg": 0.4094586968421936, - "lr": 9.94823619631902e-06, - "objective/entropy": -145.0159912109375, - "objective/kl": 16.018333435058594, - "objective/non_score_reward": -1.6018333435058594, - "objective/rlhf_reward": -6.407333076000214, - "objective/scores": 0.0, - "policy/approxkl_avg": 113.58265686035156, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6427879333496094, - "step": 81, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979770183563232 - }, - { - "episode": 1328, - "epoch": 0.02387029514325772, - "loss/policy_avg": 0.5852205753326416, - "lr": 9.947597137014316e-06, - "objective/entropy": -188.32510375976562, - "objective/kl": 16.71861457824707, - "objective/non_score_reward": -1.6718615293502808, - "objective/rlhf_reward": -5.328196370337887, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 47.16347122192383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4781952202320099, - "step": 82, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979345798492432 - }, - { - "episode": 1344, - "epoch": 0.024157889060646368, - "loss/policy_avg": 0.7273481488227844, - "lr": 9.946958077709611e-06, - "objective/entropy": -261.5775146484375, - "objective/kl": 17.21273422241211, - "objective/non_score_reward": -1.721273422241211, - "objective/rlhf_reward": -5.280973825518208, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 91.5807113647461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7938202619552612, - "step": 83, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998490333557129 - }, - { - "episode": 1360, - "epoch": 0.024445482978035015, - "loss/policy_avg": 0.6384750604629517, - "lr": 9.946319018404908e-06, - "objective/entropy": -149.5916290283203, - "objective/kl": 21.390371322631836, - "objective/non_score_reward": -2.1390371322631836, - "objective/rlhf_reward": -7.230635855227632, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 152.96697998046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.733663022518158, - "step": 84, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9965767860412598 - }, - { - "episode": 1376, - "epoch": 0.024733076895423662, - "loss/policy_avg": 1.4870085716247559, - "lr": 9.945679959100205e-06, - "objective/entropy": 279.02581787109375, - "objective/kl": 13.598295211791992, - "objective/non_score_reward": -1.3598296642303467, - "objective/rlhf_reward": -4.015486557682125, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 75.3885269165039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9894934296607971, - "step": 85, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0009098052978516 - }, - { - "episode": 1392, - "epoch": 0.02502067081281231, - "loss/policy_avg": 0.34267449378967285, - "lr": 9.945040899795502e-06, - "objective/entropy": -9.687551498413086, - "objective/kl": 17.537944793701172, - "objective/non_score_reward": -1.7537946701049805, - "objective/rlhf_reward": -4.091459666134092, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 86.10018920898438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7380132675170898, - "step": 86, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9966371059417725 - }, - { - "episode": 1408, - "epoch": 0.025308264730200956, - "loss/policy_avg": 0.662402868270874, - "lr": 9.944401840490799e-06, - "objective/entropy": 182.38612365722656, - "objective/kl": 17.891094207763672, - "objective/non_score_reward": -1.789109230041504, - "objective/rlhf_reward": -5.705838660807952, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 27.472164154052734, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6948930025100708, - "step": 87, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972063302993774 - }, - { - "episode": 1424, - "epoch": 0.025595858647589603, - "loss/policy_avg": 2.4642419815063477, - "lr": 9.943762781186096e-06, - "objective/entropy": 9.746139526367188, - "objective/kl": 17.692127227783203, - "objective/non_score_reward": -1.7692127227783203, - "objective/rlhf_reward": -5.735215237646727, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 221.2765350341797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6335443258285522, - "step": 88, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997534990310669 - }, - { - "episode": 1440, - "epoch": 0.02588345256497825, - "loss/policy_avg": 0.4348924160003662, - "lr": 9.94312372188139e-06, - "objective/entropy": 134.22723388671875, - "objective/kl": 27.34999656677246, - "objective/non_score_reward": -2.734999895095825, - "objective/rlhf_reward": -9.278140073240387, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 201.6007080078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8156861662864685, - "step": 89, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973480701446533 - }, - { - "episode": 1456, - "epoch": 0.026171046482366896, - "loss/policy_avg": 0.1291811764240265, - "lr": 9.942484662576688e-06, - "objective/entropy": 112.80955505371094, - "objective/kl": 15.703033447265625, - "objective/non_score_reward": -1.5703033208847046, - "objective/rlhf_reward": -4.333801994996007, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 155.86367797851562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5453826189041138, - "step": 90, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969561100006104 - }, - { - "episode": 1472, - "epoch": 0.026458640399755547, - "loss/policy_avg": 0.5085259675979614, - "lr": 9.941845603271985e-06, - "objective/entropy": 178.85531616210938, - "objective/kl": 17.638357162475586, - "objective/non_score_reward": -1.7638356685638428, - "objective/rlhf_reward": -5.574390235360026, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 101.44283294677734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42108362913131714, - "step": 91, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9999537467956543 - }, - { - "episode": 1488, - "epoch": 0.026746234317144194, - "loss/policy_avg": 0.6299684643745422, - "lr": 9.941206543967281e-06, - "objective/entropy": -44.030662536621094, - "objective/kl": 20.842021942138672, - "objective/non_score_reward": -2.0842020511627197, - "objective/rlhf_reward": -6.977558457587643, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 154.17724609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8085545301437378, - "step": 92, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997336506843567 - }, - { - "episode": 1504, - "epoch": 0.02703382823453284, - "loss/policy_avg": 0.334034264087677, - "lr": 9.940567484662578e-06, - "objective/entropy": 137.08668518066406, - "objective/kl": 12.822792053222656, - "objective/non_score_reward": -1.2822792530059814, - "objective/rlhf_reward": -5.129117101430893, - "objective/scores": 0.0, - "policy/approxkl_avg": 28.62427520751953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5771763920783997, - "step": 93, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001514196395874 - }, - { - "episode": 1520, - "epoch": 0.027321422151921487, - "loss/policy_avg": 0.5848271250724792, - "lr": 9.939928425357874e-06, - "objective/entropy": -91.07750701904297, - "objective/kl": 12.661925315856934, - "objective/non_score_reward": -1.2661924362182617, - "objective/rlhf_reward": -3.239941294464182, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 20.221342086791992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7387726306915283, - "step": 94, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965746402740479 - }, - { - "episode": 1536, - "epoch": 0.027609016069310134, - "loss/policy_avg": 0.45774608850479126, - "lr": 9.93928936605317e-06, - "objective/entropy": 61.91606903076172, - "objective/kl": 13.879372596740723, - "objective/non_score_reward": -1.387937307357788, - "objective/rlhf_reward": -4.151749169826507, - "objective/scores": 0.35, - "policy/approxkl_avg": 77.81513977050781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4593799114227295, - "step": 95, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001408576965332 - }, - { - "episode": 1552, - "epoch": 0.02789660998669878, - "loss/policy_avg": 0.15000438690185547, - "lr": 9.938650306748467e-06, - "objective/entropy": 105.67562866210938, - "objective/kl": 19.344045639038086, - "objective/non_score_reward": -1.9344044923782349, - "objective/rlhf_reward": -6.287019650550231, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 139.15414428710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7023290991783142, - "step": 96, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000124454498291 - }, - { - "episode": 1568, - "epoch": 0.028184203904087428, - "loss/policy_avg": 0.15950141847133636, - "lr": 9.938011247443764e-06, - "objective/entropy": 163.98699951171875, - "objective/kl": 27.47066307067871, - "objective/non_score_reward": -2.7470664978027344, - "objective/rlhf_reward": -8.865559878126655, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 212.09678649902344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8216714859008789, - "step": 97, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999010562896729 - }, - { - "episode": 1584, - "epoch": 0.028471797821476075, - "loss/policy_avg": 0.08940532058477402, - "lr": 9.937372188139061e-06, - "objective/entropy": -107.25084686279297, - "objective/kl": 20.170251846313477, - "objective/non_score_reward": -2.0170252323150635, - "objective/rlhf_reward": -5.144382034183714, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 45.363121032714844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7300325632095337, - "step": 98, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9954917430877686 - }, - { - "episode": 1600, - "epoch": 0.02875939173886472, - "loss/policy_avg": 0.1482250690460205, - "lr": 9.936733128834358e-06, - "objective/entropy": 225.79022216796875, - "objective/kl": 18.687984466552734, - "objective/non_score_reward": -1.8687984943389893, - "objective/rlhf_reward": -6.024595717997894, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 224.1140594482422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.834899365901947, - "step": 99, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977468252182007 - }, - { - "episode": 1616, - "epoch": 0.029046985656253372, - "loss/policy_avg": 0.1928468644618988, - "lr": 9.936094069529653e-06, - "objective/entropy": 8.741950988769531, - "objective/kl": 15.258420944213867, - "objective/non_score_reward": -1.5258420705795288, - "objective/rlhf_reward": -4.652770261378631, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 31.214942932128906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5689660310745239, - "step": 100, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975221157073975 - }, - { - "episode": 1632, - "epoch": 0.02933457957364202, - "loss/policy_avg": -0.0598021075129509, - "lr": 9.93545501022495e-06, - "objective/entropy": -18.486255645751953, - "objective/kl": 25.29681396484375, - "objective/non_score_reward": -2.5296812057495117, - "objective/rlhf_reward": -7.718725478649139, - "objective/scores": 0.6, - "policy/approxkl_avg": 111.83526611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5892493724822998, - "step": 101, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977436065673828 - }, - { - "episode": 1648, - "epoch": 0.029622173491030666, - "loss/policy_avg": 0.47538769245147705, - "lr": 9.934815950920245e-06, - "objective/entropy": -110.20201873779297, - "objective/kl": 22.853384017944336, - "objective/non_score_reward": -2.2853384017944336, - "objective/rlhf_reward": -7.690755407424316, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 73.58186340332031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7267776727676392, - "step": 102, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981677532196045 - }, - { - "episode": 1664, - "epoch": 0.029909767408419313, - "loss/policy_avg": -0.00016094697639346123, - "lr": 9.934176891615542e-06, - "objective/entropy": 144.46910095214844, - "objective/kl": 16.481285095214844, - "objective/non_score_reward": -1.6481282711029053, - "objective/rlhf_reward": -5.26700029882781, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 115.55538177490234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8190140724182129, - "step": 103, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999662160873413 - }, - { - "episode": 1680, - "epoch": 0.03019736132580796, - "loss/policy_avg": 0.3124885559082031, - "lr": 9.933537832310839e-06, - "objective/entropy": 82.22334289550781, - "objective/kl": 23.603931427001953, - "objective/non_score_reward": -2.3603932857513428, - "objective/rlhf_reward": -8.04157326221466, - "objective/scores": 0.35, - "policy/approxkl_avg": 249.64776611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6342385411262512, - "step": 104, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999876022338867 - }, - { - "episode": 1696, - "epoch": 0.030484955243196606, - "loss/policy_avg": 0.5430713891983032, - "lr": 9.932898773006136e-06, - "objective/entropy": 120.59968566894531, - "objective/kl": 16.078868865966797, - "objective/non_score_reward": -1.6078869104385376, - "objective/rlhf_reward": -4.87528809806402, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 28.921520233154297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7989763021469116, - "step": 105, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993932247161865 - }, - { - "episode": 1712, - "epoch": 0.030772549160585253, - "loss/policy_avg": 0.40486371517181396, - "lr": 9.932259713701433e-06, - "objective/entropy": -83.70709228515625, - "objective/kl": 21.060504913330078, - "objective/non_score_reward": -2.106050491333008, - "objective/rlhf_reward": -6.908430480750736, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 57.908729553222656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.8018752932548523, - "step": 106, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995449781417847 - }, - { - "episode": 1728, - "epoch": 0.0310601430779739, - "loss/policy_avg": 0.4627416133880615, - "lr": 9.931620654396728e-06, - "objective/entropy": -27.708335876464844, - "objective/kl": 19.676761627197266, - "objective/non_score_reward": -1.9676761627197266, - "objective/rlhf_reward": -7.870704412460327, - "objective/scores": 0.0, - "policy/approxkl_avg": 89.03375244140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7063722014427185, - "step": 107, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978171586990356 - }, - { - "episode": 1744, - "epoch": 0.03134773699536255, - "loss/policy_avg": 0.24644207954406738, - "lr": 9.930981595092025e-06, - "objective/entropy": 84.95053100585938, - "objective/kl": 17.334156036376953, - "objective/non_score_reward": -1.7334158420562744, - "objective/rlhf_reward": -5.417891824039158, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 59.91339111328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5948917269706726, - "step": 108, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983779191970825 - }, - { - "episode": 1760, - "epoch": 0.0316353309127512, - "loss/policy_avg": 0.10573781281709671, - "lr": 9.930342535787322e-06, - "objective/entropy": 37.63609313964844, - "objective/kl": 19.209318161010742, - "objective/non_score_reward": -1.9209318161010742, - "objective/rlhf_reward": -5.56102097250608, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 49.033843994140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.550654411315918, - "step": 109, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999387264251709 - }, - { - "episode": 1776, - "epoch": 0.031922924830139844, - "loss/policy_avg": 1.9212778806686401, - "lr": 9.929703476482619e-06, - "objective/entropy": -80.58729553222656, - "objective/kl": 21.281909942626953, - "objective/non_score_reward": -2.1281909942626953, - "objective/rlhf_reward": -6.779431001345316, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 400.0589599609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7861940860748291, - "step": 110, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973957538604736 - }, - { - "episode": 1792, - "epoch": 0.03221051874752849, - "loss/policy_avg": 0.275944322347641, - "lr": 9.929064417177915e-06, - "objective/entropy": -208.78277587890625, - "objective/kl": 19.070934295654297, - "objective/non_score_reward": -1.9070935249328613, - "objective/rlhf_reward": -3.228373861312866, - "objective/scores": 1.1, - "policy/approxkl_avg": 49.866607666015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6659146547317505, - "step": 111, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001814842224121 - }, - { - "episode": 1808, - "epoch": 0.03249811266491714, - "loss/policy_avg": 0.01928192749619484, - "lr": 9.928425357873212e-06, - "objective/entropy": -55.08910369873047, - "objective/kl": 13.248590469360352, - "objective/non_score_reward": -1.3248591423034668, - "objective/rlhf_reward": -3.940187000964565, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 120.32939147949219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7349205017089844, - "step": 112, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983385801315308 - }, - { - "episode": 1824, - "epoch": 0.032785706582305785, - "loss/policy_avg": 0.46047085523605347, - "lr": 9.927786298568507e-06, - "objective/entropy": 62.72016906738281, - "objective/kl": 23.932682037353516, - "objective/non_score_reward": -2.39326810836792, - "objective/rlhf_reward": -5.173073148727417, - "objective/scores": 1.1, - "policy/approxkl_avg": 117.28857421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7454954385757446, - "step": 113, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001185894012451 - }, - { - "episode": 1840, - "epoch": 0.03307330049969443, - "loss/policy_avg": 0.45565682649612427, - "lr": 9.927147239263804e-06, - "objective/entropy": 91.39878845214844, - "objective/kl": 23.617773056030273, - "objective/non_score_reward": -2.3617773056030273, - "objective/rlhf_reward": -8.121596846610231, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 150.77520751953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6033438444137573, - "step": 114, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999167919158936 - }, - { - "episode": 1856, - "epoch": 0.03336089441708308, - "loss/policy_avg": 0.7295475602149963, - "lr": 9.926508179959101e-06, - "objective/entropy": 207.79177856445312, - "objective/kl": 27.30187225341797, - "objective/non_score_reward": -2.73018741607666, - "objective/rlhf_reward": -9.187415854136148, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 190.16458129882812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.674660861492157, - "step": 115, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992830753326416 - }, - { - "episode": 1872, - "epoch": 0.033648488334471725, - "loss/policy_avg": 0.6712960600852966, - "lr": 9.925869120654398e-06, - "objective/entropy": -117.77911376953125, - "objective/kl": 24.611560821533203, - "objective/non_score_reward": -2.461156129837036, - "objective/rlhf_reward": -8.485374891494198, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 180.31971740722656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8039132356643677, - "step": 116, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9965860843658447 - }, - { - "episode": 1888, - "epoch": 0.03393608225186037, - "loss/policy_avg": 0.34298884868621826, - "lr": 9.925230061349695e-06, - "objective/entropy": -77.61781311035156, - "objective/kl": 19.15323829650879, - "objective/non_score_reward": -1.9153238534927368, - "objective/rlhf_reward": -5.2612955331802365, - "objective/scores": 0.6, - "policy/approxkl_avg": 146.52737426757812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5704625248908997, - "step": 117, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994972944259644 - }, - { - "episode": 1904, - "epoch": 0.03422367616924902, - "loss/policy_avg": 0.038467422127723694, - "lr": 9.92459100204499e-06, - "objective/entropy": 49.699798583984375, - "objective/kl": 14.500207901000977, - "objective/non_score_reward": -1.4500207901000977, - "objective/rlhf_reward": -4.3762512101727395, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 53.40303421020508, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7844617366790771, - "step": 118, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998944878578186 - }, - { - "episode": 1920, - "epoch": 0.034511270086637666, - "loss/policy_avg": 0.21085724234580994, - "lr": 9.923951942740287e-06, - "objective/entropy": 171.3436279296875, - "objective/kl": 26.085678100585938, - "objective/non_score_reward": -2.608567714691162, - "objective/rlhf_reward": -10.434271335601807, - "objective/scores": 0.0, - "policy/approxkl_avg": 62.63068389892578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7893345355987549, - "step": 119, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0008397102355957 - }, - { - "episode": 1936, - "epoch": 0.03479886400402631, - "loss/policy_avg": 2.211275100708008, - "lr": 9.923312883435584e-06, - "objective/entropy": -61.354347229003906, - "objective/kl": 20.615779876708984, - "objective/non_score_reward": -2.0615780353546143, - "objective/rlhf_reward": -6.82247974415597, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 51.41122055053711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6323862075805664, - "step": 120, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0031816959381104 - }, - { - "episode": 1952, - "epoch": 0.03508645792141496, - "loss/policy_avg": 0.09407002478837967, - "lr": 9.92267382413088e-06, - "objective/entropy": 61.494049072265625, - "objective/kl": 16.42398452758789, - "objective/non_score_reward": -1.6423983573913574, - "objective/rlhf_reward": -5.053821527751621, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 42.54444122314453, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6436998844146729, - "step": 121, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003204345703125 - }, - { - "episode": 1968, - "epoch": 0.03537405183880361, - "loss/policy_avg": 4.1326775550842285, - "lr": 9.922034764826178e-06, - "objective/entropy": 182.5974884033203, - "objective/kl": 23.137367248535156, - "objective/non_score_reward": -2.313736915588379, - "objective/rlhf_reward": -7.895697408650799, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 43.000953674316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4477632939815521, - "step": 122, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996920824050903 - }, - { - "episode": 1984, - "epoch": 0.03566164575619225, - "loss/policy_avg": 0.6063717603683472, - "lr": 9.921395705521473e-06, - "objective/entropy": 374.73065185546875, - "objective/kl": 19.553401947021484, - "objective/non_score_reward": -1.9553401470184326, - "objective/rlhf_reward": -6.4213602304458615, - "objective/scores": 0.35, - "policy/approxkl_avg": 66.70773315429688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8944922685623169, - "step": 123, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997408151626587 - }, - { - "episode": 2000, - "epoch": 0.03594923967358091, - "loss/policy_avg": -0.24504688382148743, - "lr": 9.92075664621677e-06, - "objective/entropy": 61.65036392211914, - "objective/kl": 24.3512020111084, - "objective/non_score_reward": -2.435120105743408, - "objective/rlhf_reward": -8.39884465029779, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 52.655914306640625, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.7538488507270813, - "step": 124, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.018387794494629 - }, - { - "episode": 2016, - "epoch": 0.036236833590969554, - "loss/policy_avg": 0.5044976472854614, - "lr": 9.920117586912067e-06, - "objective/entropy": 252.68862915039062, - "objective/kl": 31.767620086669922, - "objective/non_score_reward": -3.176762104034424, - "objective/rlhf_reward": -11.34779831144659, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 143.51535034179688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6717317700386047, - "step": 125, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9978291988372803 - }, - { - "episode": 2032, - "epoch": 0.0365244275083582, - "loss/policy_avg": 0.5120067596435547, - "lr": 9.919478527607362e-06, - "objective/entropy": -164.51651000976562, - "objective/kl": 27.349618911743164, - "objective/non_score_reward": -2.734961986541748, - "objective/rlhf_reward": -9.539847826957702, - "objective/scores": 0.35, - "policy/approxkl_avg": 40.27062225341797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7697643041610718, - "step": 126, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998326301574707 - }, - { - "episode": 2048, - "epoch": 0.03681202142574685, - "loss/policy_avg": 0.6887588500976562, - "lr": 9.918839468302659e-06, - "objective/entropy": 69.27521514892578, - "objective/kl": 22.67069435119629, - "objective/non_score_reward": -2.2670693397521973, - "objective/rlhf_reward": -7.6896756077683985, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 162.61952209472656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6613045930862427, - "step": 127, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999122977256775 - }, - { - "episode": 2064, - "epoch": 0.037099615343135495, - "loss/policy_avg": 0.3447116017341614, - "lr": 9.918200408997956e-06, - "objective/entropy": 212.85906982421875, - "objective/kl": 26.792640686035156, - "objective/non_score_reward": -2.6792640686035156, - "objective/rlhf_reward": -8.983723298708597, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 97.52276611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9090495109558105, - "step": 128, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001721382141113 - }, - { - "episode": 2080, - "epoch": 0.03738720926052414, - "loss/policy_avg": 0.48409304022789, - "lr": 9.917561349693252e-06, - "objective/entropy": 190.03709411621094, - "objective/kl": 18.16942596435547, - "objective/non_score_reward": -1.8169424533843994, - "objective/rlhf_reward": -5.145064058081184, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 45.51705551147461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.46248874068260193, - "step": 129, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9967193603515625 - }, - { - "episode": 2096, - "epoch": 0.03767480317791279, - "loss/policy_avg": 0.5664651989936829, - "lr": 9.91692229038855e-06, - "objective/entropy": 192.7556610107422, - "objective/kl": 20.14044952392578, - "objective/non_score_reward": -2.014044761657715, - "objective/rlhf_reward": -6.231350894245217, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 29.526737213134766, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8081471920013428, - "step": 130, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9979431629180908 - }, - { - "episode": 2112, - "epoch": 0.037962397095301435, - "loss/policy_avg": 0.41585665941238403, - "lr": 9.916283231083844e-06, - "objective/entropy": -49.55967712402344, - "objective/kl": 34.61396789550781, - "objective/non_score_reward": -3.4613966941833496, - "objective/rlhf_reward": -12.466984846679072, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 176.34915161132812, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7800864577293396, - "step": 131, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994279146194458 - }, - { - "episode": 2128, - "epoch": 0.03824999101269008, - "loss/policy_avg": 0.11256570369005203, - "lr": 9.915644171779141e-06, - "objective/entropy": 156.05429077148438, - "objective/kl": 29.602603912353516, - "objective/non_score_reward": -2.9602606296539307, - "objective/rlhf_reward": -7.441042518615722, - "objective/scores": 1.1, - "policy/approxkl_avg": 77.59361267089844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8300012350082397, - "step": 132, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990795850753784 - }, - { - "episode": 2144, - "epoch": 0.03853758493007873, - "loss/policy_avg": 0.5523125529289246, - "lr": 9.915005112474438e-06, - "objective/entropy": -9.20687484741211, - "objective/kl": 18.338748931884766, - "objective/non_score_reward": -1.8338749408721924, - "objective/rlhf_reward": -5.510670955452035, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.425865173339844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6365004777908325, - "step": 133, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000248908996582 - }, - { - "episode": 2160, - "epoch": 0.038825178847467376, - "loss/policy_avg": -0.049435317516326904, - "lr": 9.914366053169735e-06, - "objective/entropy": -86.67240142822266, - "objective/kl": 31.985502243041992, - "objective/non_score_reward": -3.198550224304199, - "objective/rlhf_reward": -11.237941711154535, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 199.85411071777344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7536939382553101, - "step": 134, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000448703765869 - }, - { - "episode": 2176, - "epoch": 0.03911277276485602, - "loss/policy_avg": 2.6105287075042725, - "lr": 9.913726993865032e-06, - "objective/entropy": -58.581077575683594, - "objective/kl": 16.238283157348633, - "objective/non_score_reward": -1.6238282918930054, - "objective/rlhf_reward": -6.495313286781311, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.216068267822266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5845435857772827, - "step": 135, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.019744396209717 - }, - { - "episode": 2192, - "epoch": 0.03940036668224467, - "loss/policy_avg": 0.24281054735183716, - "lr": 9.913087934560329e-06, - "objective/entropy": -94.54998779296875, - "objective/kl": 24.751571655273438, - "objective/non_score_reward": -2.4751572608947754, - "objective/rlhf_reward": -8.384857022556004, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 124.94600677490234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6612954139709473, - "step": 136, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9992079734802246 - }, - { - "episode": 2208, - "epoch": 0.039687960599633317, - "loss/policy_avg": 0.3692808449268341, - "lr": 9.912448875255624e-06, - "objective/entropy": 131.55032348632812, - "objective/kl": 33.53261184692383, - "objective/non_score_reward": -3.3532609939575195, - "objective/rlhf_reward": -10.489325438381407, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.712066650390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6237879395484924, - "step": 137, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970109462738037 - }, - { - "episode": 2224, - "epoch": 0.03997555451702196, - "loss/policy_avg": 0.7208542823791504, - "lr": 9.911809815950921e-06, - "objective/entropy": -9.526227951049805, - "objective/kl": 23.76919937133789, - "objective/non_score_reward": -2.376919984817505, - "objective/rlhf_reward": -9.507680296897888, - "objective/scores": 0.0, - "policy/approxkl_avg": 324.29388427734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5352585315704346, - "step": 138, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988393783569336 - }, - { - "episode": 2240, - "epoch": 0.04026314843441061, - "loss/policy_avg": 0.24535533785820007, - "lr": 9.911170756646218e-06, - "objective/entropy": -117.0184555053711, - "objective/kl": 12.411272048950195, - "objective/non_score_reward": -1.2411272525787354, - "objective/rlhf_reward": -3.5139104529336542, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 86.15127563476562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6268381476402283, - "step": 139, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967679977416992 - }, - { - "episode": 2256, - "epoch": 0.04055074235179926, - "loss/policy_avg": 1.7714985609054565, - "lr": 9.910531697341515e-06, - "objective/entropy": -78.9849853515625, - "objective/kl": 19.77252197265625, - "objective/non_score_reward": -1.9772523641586304, - "objective/rlhf_reward": -7.9090094566345215, - "objective/scores": 0.0, - "policy/approxkl_avg": 51.44104766845703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48759615421295166, - "step": 140, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998913288116455 - }, - { - "episode": 2272, - "epoch": 0.040838336269187904, - "loss/policy_avg": 0.42394816875457764, - "lr": 9.909892638036812e-06, - "objective/entropy": -52.96235656738281, - "objective/kl": 23.680797576904297, - "objective/non_score_reward": -2.368079662322998, - "objective/rlhf_reward": -7.647490258487771, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 46.429683685302734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7907830476760864, - "step": 141, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9969936609268188 - }, - { - "episode": 2288, - "epoch": 0.04112593018657655, - "loss/policy_avg": 0.42378103733062744, - "lr": 9.909253578732107e-06, - "objective/entropy": -55.83934783935547, - "objective/kl": 19.108665466308594, - "objective/non_score_reward": -1.910866618156433, - "objective/rlhf_reward": -4.7197478159677715, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 87.44278717041016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7084353566169739, - "step": 142, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.99775230884552 - }, - { - "episode": 2304, - "epoch": 0.0414135241039652, - "loss/policy_avg": 0.8798868656158447, - "lr": 9.908614519427404e-06, - "objective/entropy": 397.6534729003906, - "objective/kl": 35.663856506347656, - "objective/non_score_reward": -3.5663862228393555, - "objective/rlhf_reward": -12.318133900837836, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 403.22802734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9414247274398804, - "step": 143, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968068599700928 - }, - { - "episode": 2320, - "epoch": 0.04170111802135385, - "loss/policy_avg": 0.8013919591903687, - "lr": 9.9079754601227e-06, - "objective/entropy": 92.8883285522461, - "objective/kl": 23.82331657409668, - "objective/non_score_reward": -2.382331609725952, - "objective/rlhf_reward": -8.170076781247538, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 73.349609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6135187149047852, - "step": 144, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0004608631134033 - }, - { - "episode": 2336, - "epoch": 0.0419887119387425, - "loss/policy_avg": 2.381652593612671, - "lr": 9.907336400817996e-06, - "objective/entropy": -49.03077697753906, - "objective/kl": 20.151020050048828, - "objective/non_score_reward": -2.0151021480560303, - "objective/rlhf_reward": -6.5041489293247015, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 37.75957489013672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7855414152145386, - "step": 145, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0004804134368896 - }, - { - "episode": 2352, - "epoch": 0.042276305856131145, - "loss/policy_avg": 10.612133026123047, - "lr": 9.906697341513293e-06, - "objective/entropy": 31.498912811279297, - "objective/kl": 22.87250518798828, - "objective/non_score_reward": -2.2872507572174072, - "objective/rlhf_reward": -7.487143372715103, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 67.64070892333984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6964052319526672, - "step": 146, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002523422241211 - }, - { - "episode": 2368, - "epoch": 0.04256389977351979, - "loss/policy_avg": 1.218620777130127, - "lr": 9.90605828220859e-06, - "objective/entropy": -186.7276153564453, - "objective/kl": 17.879009246826172, - "objective/non_score_reward": -1.7879009246826172, - "objective/rlhf_reward": -2.751603758335113, - "objective/scores": 1.1, - "policy/approxkl_avg": 62.54943084716797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9248688220977783, - "step": 147, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999516248703003 - }, - { - "episode": 2384, - "epoch": 0.04285149369090844, - "loss/policy_avg": 0.08095124363899231, - "lr": 9.905419222903886e-06, - "objective/entropy": -79.130859375, - "objective/kl": 17.005231857299805, - "objective/non_score_reward": -1.7005233764648438, - "objective/rlhf_reward": -5.378261108596889, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 14.491683959960938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5301992893218994, - "step": 148, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981908798217773 - }, - { - "episode": 2400, - "epoch": 0.043139087608297086, - "loss/policy_avg": 0.9632350206375122, - "lr": 9.904780163599183e-06, - "objective/entropy": 78.35763549804688, - "objective/kl": 20.501068115234375, - "objective/non_score_reward": -2.050107002258301, - "objective/rlhf_reward": -3.8004277706146237, - "objective/scores": 1.1, - "policy/approxkl_avg": 42.78419494628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6539649963378906, - "step": 149, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986085891723633 - }, - { - "episode": 2416, - "epoch": 0.04342668152568573, - "loss/policy_avg": 0.2573207914829254, - "lr": 9.904141104294478e-06, - "objective/entropy": 139.81314086914062, - "objective/kl": 15.294164657592773, - "objective/non_score_reward": -1.5294163227081299, - "objective/rlhf_reward": -4.739063062754971, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 44.518585205078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9615079760551453, - "step": 150, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004477500915527 - }, - { - "episode": 2432, - "epoch": 0.04371427544307438, - "loss/policy_avg": 0.1424349546432495, - "lr": 9.903502044989775e-06, - "objective/entropy": -200.99420166015625, - "objective/kl": 23.035701751708984, - "objective/non_score_reward": -2.30357027053833, - "objective/rlhf_reward": -7.835678675261837, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 147.28530883789062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5872718691825867, - "step": 151, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000128746032715 - }, - { - "episode": 2448, - "epoch": 0.044001869360463026, - "loss/policy_avg": 0.2018139809370041, - "lr": 9.902862985685072e-06, - "objective/entropy": 265.19927978515625, - "objective/kl": 22.482166290283203, - "objective/non_score_reward": -2.2482166290283203, - "objective/rlhf_reward": -7.65123014739099, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 45.75090408325195, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6457411050796509, - "step": 152, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9972870349884033 - }, - { - "episode": 2464, - "epoch": 0.04428946327785167, - "loss/policy_avg": 1.6731089353561401, - "lr": 9.902223926380369e-06, - "objective/entropy": 88.4649658203125, - "objective/kl": 23.56682586669922, - "objective/non_score_reward": -2.356682300567627, - "objective/rlhf_reward": -7.479318688588078, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 46.715396881103516, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5394150614738464, - "step": 153, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9959262609481812 - }, - { - "episode": 2480, - "epoch": 0.04457705719524032, - "loss/policy_avg": 0.28844547271728516, - "lr": 9.901584867075666e-06, - "objective/entropy": 153.28744506835938, - "objective/kl": 25.101318359375, - "objective/non_score_reward": -2.5101318359375, - "objective/rlhf_reward": -8.484268038478449, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 65.64702606201172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48713332414627075, - "step": 154, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9966403245925903 - }, - { - "episode": 2496, - "epoch": 0.04486465111262897, - "loss/policy_avg": 0.5096696615219116, - "lr": 9.900945807770961e-06, - "objective/entropy": 155.93728637695312, - "objective/kl": 19.505783081054688, - "objective/non_score_reward": -1.9505780935287476, - "objective/rlhf_reward": -6.3517142339662165, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 54.11931610107422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5123778581619263, - "step": 155, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984791278839111 - }, - { - "episode": 2512, - "epoch": 0.045152245030017614, - "loss/policy_avg": 0.06939780712127686, - "lr": 9.900306748466258e-06, - "objective/entropy": 18.287960052490234, - "objective/kl": 17.8365421295166, - "objective/non_score_reward": -1.7836542129516602, - "objective/rlhf_reward": -5.792981436758667, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 29.395599365234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5711311101913452, - "step": 156, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997730255126953 - }, - { - "episode": 2528, - "epoch": 0.04543983894740626, - "loss/policy_avg": 0.6630462408065796, - "lr": 9.899667689161555e-06, - "objective/entropy": 63.43556213378906, - "objective/kl": 22.572792053222656, - "objective/non_score_reward": -2.2572789192199707, - "objective/rlhf_reward": -7.687480738669066, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 46.135581970214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5037804841995239, - "step": 157, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967372417449951 - }, - { - "episode": 2544, - "epoch": 0.04572743286479491, - "loss/policy_avg": 0.9545019865036011, - "lr": 9.899028629856852e-06, - "objective/entropy": 78.440673828125, - "objective/kl": 24.54024314880371, - "objective/non_score_reward": -2.454024314880371, - "objective/rlhf_reward": -8.43749509104858, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 15.233887672424316, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5358680486679077, - "step": 158, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002801418304443 - }, - { - "episode": 2560, - "epoch": 0.046015026782183555, - "loss/policy_avg": 0.7431780099868774, - "lr": 9.898389570552149e-06, - "objective/entropy": 140.66659545898438, - "objective/kl": 15.743444442749023, - "objective/non_score_reward": -1.574344515800476, - "objective/rlhf_reward": -4.938128196929378, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.185384511947632, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.35770517587661743, - "step": 159, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985697269439697 - }, - { - "episode": 2576, - "epoch": 0.0463026206995722, - "loss/policy_avg": 0.5926499366760254, - "lr": 9.897750511247446e-06, - "objective/entropy": 142.6728973388672, - "objective/kl": 26.324443817138672, - "objective/non_score_reward": -2.6324446201324463, - "objective/rlhf_reward": -10.529778361320496, - "objective/scores": 0.0, - "policy/approxkl_avg": 163.95303344726562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6692796945571899, - "step": 160, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0011281967163086 - }, - { - "episode": 2592, - "epoch": 0.04659021461696085, - "loss/policy_avg": 0.5165088772773743, - "lr": 9.89711145194274e-06, - "objective/entropy": -49.34636688232422, - "objective/kl": 23.45389175415039, - "objective/non_score_reward": -2.3453893661499023, - "objective/rlhf_reward": -7.865785443576511, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 21.34342384338379, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6546250581741333, - "step": 161, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961342811584473 - }, - { - "episode": 2608, - "epoch": 0.046877808534349495, - "loss/policy_avg": 0.06444612145423889, - "lr": 9.896472392638038e-06, - "objective/entropy": 3.992961883544922, - "objective/kl": 22.590843200683594, - "objective/non_score_reward": -2.259084463119507, - "objective/rlhf_reward": -7.636337852478027, - "objective/scores": 0.35, - "policy/approxkl_avg": 78.1329345703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6085139513015747, - "step": 162, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983890056610107 - }, - { - "episode": 2624, - "epoch": 0.04716540245173815, - "loss/policy_avg": -0.5813350081443787, - "lr": 9.895833333333334e-06, - "objective/entropy": 129.43629455566406, - "objective/kl": 18.305377960205078, - "objective/non_score_reward": -1.8305377960205078, - "objective/rlhf_reward": -5.841198506768107, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 35.31721496582031, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.5479708313941956, - "step": 163, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.006561756134033 - }, - { - "episode": 2640, - "epoch": 0.047452996369126796, - "loss/policy_avg": 0.771713376045227, - "lr": 9.895194274028631e-06, - "objective/entropy": -7.5702056884765625, - "objective/kl": 26.34789276123047, - "objective/non_score_reward": -2.634789228439331, - "objective/rlhf_reward": -9.023385488780672, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 189.376708984375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7600178122520447, - "step": 164, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973194599151611 - }, - { - "episode": 2656, - "epoch": 0.04774059028651544, - "loss/policy_avg": 0.3846855163574219, - "lr": 9.894555214723928e-06, - "objective/entropy": 208.37188720703125, - "objective/kl": 24.167360305786133, - "objective/non_score_reward": -2.416736125946045, - "objective/rlhf_reward": -8.307694399093075, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 50.201210021972656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8115335702896118, - "step": 165, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998011589050293 - }, - { - "episode": 2672, - "epoch": 0.04802818420390409, - "loss/policy_avg": 0.1327829658985138, - "lr": 9.893916155419225e-06, - "objective/entropy": 137.91152954101562, - "objective/kl": 16.795686721801758, - "objective/non_score_reward": -1.6795687675476074, - "objective/rlhf_reward": -5.202503287585911, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 83.59907531738281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6950229406356812, - "step": 166, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975529909133911 - }, - { - "episode": 2688, - "epoch": 0.048315778121292736, - "loss/policy_avg": 0.1259589046239853, - "lr": 9.89327709611452e-06, - "objective/entropy": -62.611698150634766, - "objective/kl": 23.73765754699707, - "objective/non_score_reward": -2.373765707015991, - "objective/rlhf_reward": -8.095063066482544, - "objective/scores": 0.35, - "policy/approxkl_avg": 30.360702514648438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4680527448654175, - "step": 167, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0010440349578857 - }, - { - "episode": 2704, - "epoch": 0.04860337203868138, - "loss/policy_avg": 1.4625483751296997, - "lr": 9.892638036809815e-06, - "objective/entropy": 191.9486083984375, - "objective/kl": 29.09479331970215, - "objective/non_score_reward": -2.9094796180725098, - "objective/rlhf_reward": -9.976058488309967, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 96.90817260742188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4573523998260498, - "step": 168, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999511957168579 - }, - { - "episode": 2720, - "epoch": 0.04889096595607003, - "loss/policy_avg": 0.39898326992988586, - "lr": 9.891998977505112e-06, - "objective/entropy": 76.32606506347656, - "objective/kl": 25.170101165771484, - "objective/non_score_reward": -2.51701021194458, - "objective/rlhf_reward": -8.58708846848762, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 109.47344970703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7625200748443604, - "step": 169, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9968773126602173 - }, - { - "episode": 2736, - "epoch": 0.04917855987345868, - "loss/policy_avg": 1.2032477855682373, - "lr": 9.89135991820041e-06, - "objective/entropy": -85.850830078125, - "objective/kl": 23.681184768676758, - "objective/non_score_reward": -2.3681185245513916, - "objective/rlhf_reward": -7.916214673724726, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.655494689941406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7028899788856506, - "step": 170, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0086746215820312 - }, - { - "episode": 2752, - "epoch": 0.049466153790847324, - "loss/policy_avg": 0.2989116311073303, - "lr": 9.890720858895706e-06, - "objective/entropy": -244.5385284423828, - "objective/kl": 25.30226707458496, - "objective/non_score_reward": -2.530226707458496, - "objective/rlhf_reward": -8.74230466136108, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 80.04335021972656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7547532320022583, - "step": 171, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984104633331299 - }, - { - "episode": 2768, - "epoch": 0.04975374770823597, - "loss/policy_avg": 0.15676680207252502, - "lr": 9.890081799591003e-06, - "objective/entropy": -46.70368576049805, - "objective/kl": 23.916561126708984, - "objective/non_score_reward": -2.3916563987731934, - "objective/rlhf_reward": -8.142793853481379, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 98.60531616210938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4980742335319519, - "step": 172, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000648021697998 - }, - { - "episode": 2784, - "epoch": 0.05004134162562462, - "loss/policy_avg": 2.26867413520813, - "lr": 9.8894427402863e-06, - "objective/entropy": -103.40653991699219, - "objective/kl": 13.933716773986816, - "objective/non_score_reward": -1.39337158203125, - "objective/rlhf_reward": -3.969366837207394, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 48.939292907714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5038570165634155, - "step": 173, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992566108703613 - }, - { - "episode": 2800, - "epoch": 0.050328935543013265, - "loss/policy_avg": 2.38254714012146, - "lr": 9.888803680981595e-06, - "objective/entropy": 186.43983459472656, - "objective/kl": 22.118637084960938, - "objective/non_score_reward": -2.2118635177612305, - "objective/rlhf_reward": -7.5219414568244645, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 46.10081481933594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5202944278717041, - "step": 174, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9985566139221191 - }, - { - "episode": 2816, - "epoch": 0.05061652946040191, - "loss/policy_avg": 0.7353519201278687, - "lr": 9.888164621676892e-06, - "objective/entropy": 40.153011322021484, - "objective/kl": 24.53411865234375, - "objective/non_score_reward": -2.453411817550659, - "objective/rlhf_reward": -8.413647150993347, - "objective/scores": 0.35, - "policy/approxkl_avg": 73.58890533447266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5293036699295044, - "step": 175, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974313974380493 - }, - { - "episode": 2832, - "epoch": 0.05090412337779056, - "loss/policy_avg": 0.09056591242551804, - "lr": 9.887525562372189e-06, - "objective/entropy": 271.1833190917969, - "objective/kl": 18.394960403442383, - "objective/non_score_reward": -1.83949613571167, - "objective/rlhf_reward": -5.8770319251374, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 106.73664855957031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.683214545249939, - "step": 176, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000105619430542 - }, - { - "episode": 2848, - "epoch": 0.051191717295179205, - "loss/policy_avg": 1.4375742673873901, - "lr": 9.886886503067486e-06, - "objective/entropy": 21.05878448486328, - "objective/kl": 19.652141571044922, - "objective/non_score_reward": -1.9652140140533447, - "objective/rlhf_reward": -5.913445363717015, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 19.185291290283203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5441437363624573, - "step": 177, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982903003692627 - }, - { - "episode": 2864, - "epoch": 0.05147931121256785, - "loss/policy_avg": 0.9233704209327698, - "lr": 9.886247443762783e-06, - "objective/entropy": 79.99378204345703, - "objective/kl": 30.930316925048828, - "objective/non_score_reward": -3.093031644821167, - "objective/rlhf_reward": -10.42471582718366, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 71.49031829833984, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4192318320274353, - "step": 178, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994785785675049 - }, - { - "episode": 2880, - "epoch": 0.0517669051299565, - "loss/policy_avg": 0.8243035078048706, - "lr": 9.88560838445808e-06, - "objective/entropy": -79.08426666259766, - "objective/kl": 21.69200897216797, - "objective/non_score_reward": -2.169200897216797, - "objective/rlhf_reward": -7.072683844629841, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 111.395263671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5561103820800781, - "step": 179, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990267753601074 - }, - { - "episode": 2896, - "epoch": 0.052054499047345146, - "loss/policy_avg": 0.20029950141906738, - "lr": 9.884969325153375e-06, - "objective/entropy": 29.943138122558594, - "objective/kl": 23.813167572021484, - "objective/non_score_reward": -2.381316661834717, - "objective/rlhf_reward": -9.525267362594604, - "objective/scores": 0.0, - "policy/approxkl_avg": 42.68115997314453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6918896436691284, - "step": 180, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9963979721069336 - }, - { - "episode": 2912, - "epoch": 0.05234209296473379, - "loss/policy_avg": 0.014301195740699768, - "lr": 9.884330265848671e-06, - "objective/entropy": 164.71829223632812, - "objective/kl": 24.91703224182129, - "objective/non_score_reward": -2.4917030334472656, - "objective/rlhf_reward": -7.84410613991407, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 178.1289520263672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.43327000737190247, - "step": 181, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9986697435379028 - }, - { - "episode": 2928, - "epoch": 0.052629686882122446, - "loss/policy_avg": 0.9095668792724609, - "lr": 9.883691206543968e-06, - "objective/entropy": 107.3670883178711, - "objective/kl": 29.20984649658203, - "objective/non_score_reward": -2.920984983444214, - "objective/rlhf_reward": -10.079820070330221, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 77.46437072753906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6489860415458679, - "step": 182, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9962208271026611 - }, - { - "episode": 2944, - "epoch": 0.05291728079951109, - "loss/policy_avg": 0.2735748589038849, - "lr": 9.883052147239265e-06, - "objective/entropy": 22.112346649169922, - "objective/kl": 20.8614444732666, - "objective/non_score_reward": -2.08614444732666, - "objective/rlhf_reward": -6.682718520582306, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 121.54977416992188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5333126187324524, - "step": 183, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991374015808105 - }, - { - "episode": 2960, - "epoch": 0.05320487471689974, - "loss/policy_avg": 0.32464680075645447, - "lr": 9.882413087934562e-06, - "objective/entropy": 188.4505615234375, - "objective/kl": 20.14493179321289, - "objective/non_score_reward": -2.014493227005005, - "objective/rlhf_reward": -6.542201125415501, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 49.658721923828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7087539434432983, - "step": 184, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0017189979553223 - }, - { - "episode": 2976, - "epoch": 0.05349246863428839, - "loss/policy_avg": 0.6234300136566162, - "lr": 9.881774028629857e-06, - "objective/entropy": -28.301137924194336, - "objective/kl": 22.407699584960938, - "objective/non_score_reward": -2.240769863128662, - "objective/rlhf_reward": -7.406820564475611, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.418500900268555, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48261120915412903, - "step": 185, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975056648254395 - }, - { - "episode": 2992, - "epoch": 0.053780062551677034, - "loss/policy_avg": 0.1302778124809265, - "lr": 9.881134969325154e-06, - "objective/entropy": 163.4349365234375, - "objective/kl": 31.70010757446289, - "objective/non_score_reward": -3.170010566711426, - "objective/rlhf_reward": -10.280042505264282, - "objective/scores": 0.6, - "policy/approxkl_avg": 74.95491790771484, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.584516167640686, - "step": 186, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998785138130188 - }, - { - "episode": 3008, - "epoch": 0.05406765646906568, - "loss/policy_avg": 0.22809255123138428, - "lr": 9.880495910020451e-06, - "objective/entropy": 209.2935333251953, - "objective/kl": 20.395681381225586, - "objective/non_score_reward": -2.0395681858062744, - "objective/rlhf_reward": -6.816636970549254, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 63.878265380859375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6551488637924194, - "step": 187, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995040893554688 - }, - { - "episode": 3024, - "epoch": 0.05435525038645433, - "loss/policy_avg": 0.03126790001988411, - "lr": 9.879856850715748e-06, - "objective/entropy": 4.618324279785156, - "objective/kl": 24.382261276245117, - "objective/non_score_reward": -2.4382262229919434, - "objective/rlhf_reward": -8.237132870944675, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.476547718048096, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5421229600906372, - "step": 188, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999804496765137 - }, - { - "episode": 3040, - "epoch": 0.054642844303842975, - "loss/policy_avg": 0.3424449563026428, - "lr": 9.879217791411043e-06, - "objective/entropy": -136.56385803222656, - "objective/kl": 33.151222229003906, - "objective/non_score_reward": -3.315122127532959, - "objective/rlhf_reward": -11.901238882277887, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 49.403472900390625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5513160228729248, - "step": 189, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999922752380371 - }, - { - "episode": 3056, - "epoch": 0.05493043822123162, - "loss/policy_avg": 0.8403773307800293, - "lr": 9.87857873210634e-06, - "objective/entropy": 322.4634704589844, - "objective/kl": 26.295257568359375, - "objective/non_score_reward": -2.629525661468506, - "objective/rlhf_reward": -9.192590031653566, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 115.08587646484375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7622972130775452, - "step": 190, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968273639678955 - }, - { - "episode": 3072, - "epoch": 0.05521803213862027, - "loss/policy_avg": 0.19895562529563904, - "lr": 9.877939672801637e-06, - "objective/entropy": -226.75164794921875, - "objective/kl": 16.52016830444336, - "objective/non_score_reward": -1.6520167589187622, - "objective/rlhf_reward": -4.78323804882438, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 43.135128021240234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48229825496673584, - "step": 191, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003657341003418 - }, - { - "episode": 3088, - "epoch": 0.055505626056008915, - "loss/policy_avg": 0.132611945271492, - "lr": 9.877300613496934e-06, - "objective/entropy": 138.95777893066406, - "objective/kl": 28.13532257080078, - "objective/non_score_reward": -2.8135323524475098, - "objective/rlhf_reward": -9.830297310550776, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.711568832397461, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6496654748916626, - "step": 192, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994869232177734 - }, - { - "episode": 3104, - "epoch": 0.05579321997339756, - "loss/policy_avg": 0.13647544384002686, - "lr": 9.876661554192229e-06, - "objective/entropy": 228.907958984375, - "objective/kl": 20.958343505859375, - "objective/non_score_reward": -2.095834255218506, - "objective/rlhf_reward": -6.435925911145146, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 77.2052230834961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5815694332122803, - "step": 193, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998384714126587 - }, - { - "episode": 3120, - "epoch": 0.05608081389078621, - "loss/policy_avg": 0.3623042702674866, - "lr": 9.876022494887526e-06, - "objective/entropy": 101.04753112792969, - "objective/kl": 28.680049896240234, - "objective/non_score_reward": -2.8680050373077393, - "objective/rlhf_reward": -10.021421532245025, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 129.59266662597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5827709436416626, - "step": 194, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990119934082031 - }, - { - "episode": 3136, - "epoch": 0.056368407808174856, - "loss/policy_avg": -0.07424932718276978, - "lr": 9.875383435582823e-06, - "objective/entropy": 245.4013671875, - "objective/kl": 20.346391677856445, - "objective/non_score_reward": -2.034639358520508, - "objective/rlhf_reward": -6.01585108257917, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 31.92734718322754, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6253474950790405, - "step": 195, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984948635101318 - }, - { - "episode": 3152, - "epoch": 0.0566560017255635, - "loss/policy_avg": 0.1401011198759079, - "lr": 9.87474437627812e-06, - "objective/entropy": 375.89263916015625, - "objective/kl": 21.685848236083984, - "objective/non_score_reward": -2.1685848236083984, - "objective/rlhf_reward": -6.274339175224304, - "objective/scores": 0.6, - "policy/approxkl_avg": 131.50494384765625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.9226024746894836, - "step": 196, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0007243156433105 - }, - { - "episode": 3168, - "epoch": 0.05694359564295215, - "loss/policy_avg": -0.030730588361620903, - "lr": 9.874105316973416e-06, - "objective/entropy": 140.85540771484375, - "objective/kl": 20.57616424560547, - "objective/non_score_reward": -2.0576162338256836, - "objective/rlhf_reward": -6.779867152781829, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 33.19469451904297, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7271202206611633, - "step": 197, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000385284423828 - }, - { - "episode": 3184, - "epoch": 0.057231189560340796, - "loss/policy_avg": 2.7618093490600586, - "lr": 9.873466257668712e-06, - "objective/entropy": 179.97198486328125, - "objective/kl": 28.560035705566406, - "objective/non_score_reward": -2.856003761291504, - "objective/rlhf_reward": -9.599186535152505, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 32.35374450683594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.653136134147644, - "step": 198, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997452974319458 - }, - { - "episode": 3200, - "epoch": 0.05751878347772944, - "loss/policy_avg": 0.6520799398422241, - "lr": 9.872827198364009e-06, - "objective/entropy": 110.88057708740234, - "objective/kl": 31.026592254638672, - "objective/non_score_reward": -3.1026594638824463, - "objective/rlhf_reward": -11.06900267890039, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 33.437408447265625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5983192324638367, - "step": 199, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984240531921387 - }, - { - "episode": 3216, - "epoch": 0.05780637739511809, - "loss/policy_avg": 0.19128543138504028, - "lr": 9.872188139059305e-06, - "objective/entropy": 234.22332763671875, - "objective/kl": 33.926361083984375, - "objective/non_score_reward": -3.3926358222961426, - "objective/rlhf_reward": -12.119944910617217, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 190.18153381347656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9237314462661743, - "step": 200, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989691972732544 - }, - { - "episode": 3232, - "epoch": 0.058093971312506744, - "loss/policy_avg": 0.04767340421676636, - "lr": 9.871549079754602e-06, - "objective/entropy": -7.490440368652344, - "objective/kl": 16.179231643676758, - "objective/non_score_reward": -1.6179232597351074, - "objective/rlhf_reward": -4.348986568228279, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 3.6666202545166016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5502414703369141, - "step": 201, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998970031738281 - }, - { - "episode": 3248, - "epoch": 0.05838156522989539, - "loss/policy_avg": 0.009956400841474533, - "lr": 9.8709100204499e-06, - "objective/entropy": 184.87599182128906, - "objective/kl": 30.518714904785156, - "objective/non_score_reward": -3.0518715381622314, - "objective/rlhf_reward": -10.865850737600951, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 144.38037109375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46582168340682983, - "step": 202, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0004916191101074 - }, - { - "episode": 3264, - "epoch": 0.05866915914728404, - "loss/policy_avg": 0.8650859594345093, - "lr": 9.870270961145196e-06, - "objective/entropy": 60.665279388427734, - "objective/kl": 30.722930908203125, - "objective/non_score_reward": -3.0722928047180176, - "objective/rlhf_reward": -10.80821907799995, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 197.62728881835938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.45192649960517883, - "step": 203, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9978991746902466 - }, - { - "episode": 3280, - "epoch": 0.058956753064672685, - "loss/policy_avg": 0.7753949165344238, - "lr": 9.869631901840491e-06, - "objective/entropy": 224.53439331054688, - "objective/kl": 35.50615692138672, - "objective/non_score_reward": -3.5506153106689453, - "objective/rlhf_reward": -9.80246195793152, - "objective/scores": 1.1, - "policy/approxkl_avg": 207.07131958007812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625594973564148, - "step": 204, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.003218650817871 - }, - { - "episode": 3296, - "epoch": 0.05924434698206133, - "loss/policy_avg": 0.12218689173460007, - "lr": 9.868992842535788e-06, - "objective/entropy": 160.02056884765625, - "objective/kl": 20.542434692382812, - "objective/non_score_reward": -2.054243326187134, - "objective/rlhf_reward": -8.216973185539246, - "objective/scores": 0.0, - "policy/approxkl_avg": 27.70839500427246, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5086226463317871, - "step": 205, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990867376327515 - }, - { - "episode": 3312, - "epoch": 0.05953194089944998, - "loss/policy_avg": 0.16328184306621552, - "lr": 9.868353783231085e-06, - "objective/entropy": -178.42849731445312, - "objective/kl": 12.709222793579102, - "objective/non_score_reward": -1.270922303199768, - "objective/rlhf_reward": -0.6836892724037167, - "objective/scores": 1.1, - "policy/approxkl_avg": 175.85543823242188, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6555081605911255, - "step": 206, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003648281097412 - }, - { - "episode": 3328, - "epoch": 0.059819534816838625, - "loss/policy_avg": 0.3191947340965271, - "lr": 9.867714723926382e-06, - "objective/entropy": 113.40653991699219, - "objective/kl": 23.92019271850586, - "objective/non_score_reward": -2.392019271850586, - "objective/rlhf_reward": -8.011817782130793, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 120.01425170898438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6426678895950317, - "step": 207, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998692274093628 - }, - { - "episode": 3344, - "epoch": 0.06010712873422727, - "loss/policy_avg": 0.8106866478919983, - "lr": 9.867075664621679e-06, - "objective/entropy": -161.32217407226562, - "objective/kl": 20.696407318115234, - "objective/non_score_reward": -2.069640874862671, - "objective/rlhf_reward": -6.899961569396359, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 60.63603973388672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.424863338470459, - "step": 208, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001417636871338 - }, - { - "episode": 3360, - "epoch": 0.06039472265161592, - "loss/policy_avg": 0.5404326915740967, - "lr": 9.866436605316974e-06, - "objective/entropy": 136.03414916992188, - "objective/kl": 24.140501022338867, - "objective/non_score_reward": -2.414050340652466, - "objective/rlhf_reward": -5.256201243400573, - "objective/scores": 1.1, - "policy/approxkl_avg": 77.51190948486328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6469956040382385, - "step": 209, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992618560791016 - }, - { - "episode": 3376, - "epoch": 0.060682316569004566, - "loss/policy_avg": -0.004204496741294861, - "lr": 9.86579754601227e-06, - "objective/entropy": -205.11416625976562, - "objective/kl": 22.115215301513672, - "objective/non_score_reward": -2.211521625518799, - "objective/rlhf_reward": -7.3954881235078425, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 130.94525146484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7604430913925171, - "step": 210, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9986162185668945 - }, - { - "episode": 3392, - "epoch": 0.06096991048639321, - "loss/policy_avg": 0.10069486498832703, - "lr": 9.865158486707568e-06, - "objective/entropy": 38.40431213378906, - "objective/kl": 21.107707977294922, - "objective/non_score_reward": -2.1107707023620605, - "objective/rlhf_reward": -6.962130549366831, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 48.98419189453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.43749940395355225, - "step": 211, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9993665218353271 - }, - { - "episode": 3408, - "epoch": 0.06125750440378186, - "loss/policy_avg": 0.013450137339532375, - "lr": 9.864519427402863e-06, - "objective/entropy": 97.07965087890625, - "objective/kl": 26.950225830078125, - "objective/non_score_reward": -2.6950225830078125, - "objective/rlhf_reward": -8.955262298854898, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 44.33604431152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5165296792984009, - "step": 212, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978928565979004 - }, - { - "episode": 3424, - "epoch": 0.061545098321170506, - "loss/policy_avg": 0.4735873341560364, - "lr": 9.86388036809816e-06, - "objective/entropy": -1.7870521545410156, - "objective/kl": 27.062910079956055, - "objective/non_score_reward": -2.7062911987304688, - "objective/rlhf_reward": -9.465914988253992, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 72.81141662597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6805305480957031, - "step": 213, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998960018157959 - }, - { - "episode": 3440, - "epoch": 0.06183269223855915, - "loss/policy_avg": 0.09523998200893402, - "lr": 9.863241308793457e-06, - "objective/entropy": 32.18935012817383, - "objective/kl": 9.85006046295166, - "objective/non_score_reward": -0.9850060939788818, - "objective/rlhf_reward": -2.5161924554901995, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.0238242149353027, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3841710090637207, - "step": 214, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986772537231445 - }, - { - "episode": 3456, - "epoch": 0.0621202861559478, - "loss/policy_avg": 0.720879316329956, - "lr": 9.862602249488753e-06, - "objective/entropy": 276.2146301269531, - "objective/kl": 28.97698974609375, - "objective/non_score_reward": -2.8976993560791016, - "objective/rlhf_reward": -9.928937201917755, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 191.53884887695312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7353510856628418, - "step": 215, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9999027252197266 - }, - { - "episode": 3472, - "epoch": 0.06240788007333645, - "loss/policy_avg": 0.5507330894470215, - "lr": 9.86196319018405e-06, - "objective/entropy": 250.835693359375, - "objective/kl": 29.98652458190918, - "objective/non_score_reward": -2.998652219772339, - "objective/rlhf_reward": -10.332749491155731, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 85.02761840820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6071260571479797, - "step": 216, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968819618225098 - }, - { - "episode": 3488, - "epoch": 0.0626954739907251, - "loss/policy_avg": 0.9385891556739807, - "lr": 9.861324130879346e-06, - "objective/entropy": 82.53084564208984, - "objective/kl": 26.54790687561035, - "objective/non_score_reward": -2.6547906398773193, - "objective/rlhf_reward": -8.496456088797125, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.03960609436035, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4867916703224182, - "step": 217, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992834329605103 - }, - { - "episode": 3504, - "epoch": 0.06298306790811374, - "loss/policy_avg": 0.3534790575504303, - "lr": 9.860685071574642e-06, - "objective/entropy": 230.29193115234375, - "objective/kl": 21.73017120361328, - "objective/non_score_reward": -2.1730172634124756, - "objective/rlhf_reward": -6.867240424427102, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 81.97232055664062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.558746337890625, - "step": 218, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9974663257598877 - }, - { - "episode": 3520, - "epoch": 0.0632706618255024, - "loss/policy_avg": -0.15977555513381958, - "lr": 9.86004601226994e-06, - "objective/entropy": 113.71033477783203, - "objective/kl": 17.67473030090332, - "objective/non_score_reward": -1.7674732208251953, - "objective/rlhf_reward": -4.947186651007209, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 94.58512115478516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5372532606124878, - "step": 219, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001951217651367 - }, - { - "episode": 3536, - "epoch": 0.06355825574289103, - "loss/policy_avg": 0.9559342265129089, - "lr": 9.859406952965236e-06, - "objective/entropy": 173.58860778808594, - "objective/kl": 33.72608947753906, - "objective/non_score_reward": -3.3726086616516113, - "objective/rlhf_reward": -11.367727937475713, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 6.491452217102051, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6636508703231812, - "step": 220, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9996795654296875 - }, - { - "episode": 3552, - "epoch": 0.06384584966027969, - "loss/policy_avg": -0.854604184627533, - "lr": 9.858767893660533e-06, - "objective/entropy": -67.233154296875, - "objective/kl": 13.420427322387695, - "objective/non_score_reward": -1.3420426845550537, - "objective/rlhf_reward": -2.4444515898239345, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 72.4083251953125, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.8086908459663391, - "step": 221, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0127267837524414 - }, - { - "episode": 3568, - "epoch": 0.06413344357766833, - "loss/policy_avg": 0.5410902500152588, - "lr": 9.858128834355828e-06, - "objective/entropy": 175.63470458984375, - "objective/kl": 35.907081604003906, - "objective/non_score_reward": -3.5907082557678223, - "objective/rlhf_reward": -13.037319932013673, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 99.65482330322266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6278276443481445, - "step": 222, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999873399734497 - }, - { - "episode": 3584, - "epoch": 0.06442103749505698, - "loss/policy_avg": 0.3871188163757324, - "lr": 9.857489775051125e-06, - "objective/entropy": -161.75840759277344, - "objective/kl": 18.288314819335938, - "objective/non_score_reward": -1.8288315534591675, - "objective/rlhf_reward": -5.799554431232151, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 110.67633056640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6733036041259766, - "step": 223, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991552829742432 - }, - { - "episode": 3600, - "epoch": 0.06470863141244562, - "loss/policy_avg": -0.573300838470459, - "lr": 9.856850715746422e-06, - "objective/entropy": 6.650520324707031, - "objective/kl": 26.58426284790039, - "objective/non_score_reward": -2.658426284790039, - "objective/rlhf_reward": -7.709986720920774, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 44.56170654296875, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.38651180267333984, - "step": 224, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0037808418273926 - }, - { - "episode": 3616, - "epoch": 0.06499622532983428, - "loss/policy_avg": 0.5340808629989624, - "lr": 9.856211656441719e-06, - "objective/entropy": 59.36520004272461, - "objective/kl": 28.841266632080078, - "objective/non_score_reward": -2.884126901626587, - "objective/rlhf_reward": -9.874647860944854, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 53.19476318359375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6743514537811279, - "step": 225, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9957829713821411 - }, - { - "episode": 3632, - "epoch": 0.06528381924722292, - "loss/policy_avg": 0.5914766192436218, - "lr": 9.855572597137016e-06, - "objective/entropy": 228.81517028808594, - "objective/kl": 30.393442153930664, - "objective/non_score_reward": -3.039344310760498, - "objective/rlhf_reward": -10.209965775685246, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 85.6346435546875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5778177976608276, - "step": 226, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9993802309036255 - }, - { - "episode": 3648, - "epoch": 0.06557141316461157, - "loss/policy_avg": -0.05053609609603882, - "lr": 9.854933537832313e-06, - "objective/entropy": 13.725364685058594, - "objective/kl": 25.695791244506836, - "objective/non_score_reward": -2.5695791244506836, - "objective/rlhf_reward": -8.330905745701726, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 135.800048828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3210442066192627, - "step": 227, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997122287750244 - }, - { - "episode": 3664, - "epoch": 0.06585900708200021, - "loss/policy_avg": 0.4539129137992859, - "lr": 9.854294478527608e-06, - "objective/entropy": 93.42439270019531, - "objective/kl": 30.396175384521484, - "objective/non_score_reward": -3.0396177768707275, - "objective/rlhf_reward": -10.779868462172846, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 55.461158752441406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6131182909011841, - "step": 228, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976425170898438 - }, - { - "episode": 3680, - "epoch": 0.06614660099938886, - "loss/policy_avg": 0.228049173951149, - "lr": 9.853655419222905e-06, - "objective/entropy": -28.055843353271484, - "objective/kl": 23.269084930419922, - "objective/non_score_reward": -2.326908588409424, - "objective/rlhf_reward": -6.383914981723997, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 143.5833740234375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5752028822898865, - "step": 229, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000626802444458 - }, - { - "episode": 3696, - "epoch": 0.06643419491677752, - "loss/policy_avg": 0.10666107386350632, - "lr": 9.853016359918202e-06, - "objective/entropy": 74.64518737792969, - "objective/kl": 32.4399528503418, - "objective/non_score_reward": -3.243995189666748, - "objective/rlhf_reward": -11.525382618518218, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 181.31935119628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5801441669464111, - "step": 230, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995744228363037 - }, - { - "episode": 3712, - "epoch": 0.06672178883416616, - "loss/policy_avg": 2.4466023445129395, - "lr": 9.852377300613498e-06, - "objective/entropy": 244.4732666015625, - "objective/kl": 27.413360595703125, - "objective/non_score_reward": -2.7413363456726074, - "objective/rlhf_reward": -9.14051639583976, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 85.86346435546875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8622180223464966, - "step": 231, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987142086029053 - }, - { - "episode": 3728, - "epoch": 0.06700938275155481, - "loss/policy_avg": 0.8113258481025696, - "lr": 9.851738241308795e-06, - "objective/entropy": 56.00733947753906, - "objective/kl": 21.946327209472656, - "objective/non_score_reward": -2.1946325302124023, - "objective/rlhf_reward": -7.419280850623531, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 29.368534088134766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45428696274757385, - "step": 232, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980647563934326 - }, - { - "episode": 3744, - "epoch": 0.06729697666894345, - "loss/policy_avg": 0.2869613766670227, - "lr": 9.85109918200409e-06, - "objective/entropy": 128.71649169921875, - "objective/kl": 21.821929931640625, - "objective/non_score_reward": -2.182192802429199, - "objective/rlhf_reward": -7.278173069568023, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 72.65187072753906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8492765426635742, - "step": 233, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982428550720215 - }, - { - "episode": 3760, - "epoch": 0.0675845705863321, - "loss/policy_avg": 1.1545510292053223, - "lr": 9.850460122699387e-06, - "objective/entropy": -46.38230895996094, - "objective/kl": 28.68572235107422, - "objective/non_score_reward": -2.868572235107422, - "objective/rlhf_reward": -10.050457079609004, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 29.78200912475586, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.661322295665741, - "step": 234, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991018772125244 - }, - { - "episode": 3776, - "epoch": 0.06787216450372074, - "loss/policy_avg": 0.7958990335464478, - "lr": 9.849821063394683e-06, - "objective/entropy": 157.34841918945312, - "objective/kl": 28.915939331054688, - "objective/non_score_reward": -2.8915936946868896, - "objective/rlhf_reward": -10.240862164527101, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 46.19620895385742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6968529224395752, - "step": 235, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994633197784424 - }, - { - "episode": 3792, - "epoch": 0.0681597584211094, - "loss/policy_avg": 0.6319503784179688, - "lr": 9.84918200408998e-06, - "objective/entropy": 356.89532470703125, - "objective/kl": 28.920034408569336, - "objective/non_score_reward": -2.8920035362243652, - "objective/rlhf_reward": -10.144182522495356, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 15.02867317199707, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.926424503326416, - "step": 236, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9992319345474243 - }, - { - "episode": 3808, - "epoch": 0.06844735233849804, - "loss/policy_avg": 0.29689115285873413, - "lr": 9.848542944785276e-06, - "objective/entropy": -114.8179931640625, - "objective/kl": 22.912490844726562, - "objective/non_score_reward": -2.2912492752075195, - "objective/rlhf_reward": -7.649225318225559, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.519531726837158, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4792546033859253, - "step": 237, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990766048431396 - }, - { - "episode": 3824, - "epoch": 0.06873494625588669, - "loss/policy_avg": 0.6142581701278687, - "lr": 9.847903885480573e-06, - "objective/entropy": 42.130271911621094, - "objective/kl": 30.74860382080078, - "objective/non_score_reward": -3.0748605728149414, - "objective/rlhf_reward": -10.920839407531124, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 37.97405242919922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4455175995826721, - "step": 238, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975383281707764 - }, - { - "episode": 3840, - "epoch": 0.06902254017327533, - "loss/policy_avg": 0.03958883881568909, - "lr": 9.84726482617587e-06, - "objective/entropy": 148.7663116455078, - "objective/kl": 24.86724853515625, - "objective/non_score_reward": -2.486724615097046, - "objective/rlhf_reward": -8.56829617270599, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 20.696613311767578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7744324207305908, - "step": 239, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0034701824188232 - }, - { - "episode": 3856, - "epoch": 0.06931013409066399, - "loss/policy_avg": -0.12924179434776306, - "lr": 9.846625766871167e-06, - "objective/entropy": 13.191347122192383, - "objective/kl": 36.86333465576172, - "objective/non_score_reward": -3.686333179473877, - "objective/rlhf_reward": -11.821614180446836, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 93.72460174560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7017860412597656, - "step": 240, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996403455734253 - }, - { - "episode": 3872, - "epoch": 0.06959772800805263, - "loss/policy_avg": 0.6671891212463379, - "lr": 9.845986707566462e-06, - "objective/entropy": 144.81239318847656, - "objective/kl": 25.728496551513672, - "objective/non_score_reward": -2.572849750518799, - "objective/rlhf_reward": -8.168692888990913, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.799148559570312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5449861288070679, - "step": 241, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000767707824707 - }, - { - "episode": 3888, - "epoch": 0.06988532192544128, - "loss/policy_avg": 1.4478445053100586, - "lr": 9.845347648261759e-06, - "objective/entropy": -13.714214324951172, - "objective/kl": 31.57904052734375, - "objective/non_score_reward": -3.1579039096832275, - "objective/rlhf_reward": -11.207783777912226, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 7.07413387298584, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5770883560180664, - "step": 242, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997456669807434 - }, - { - "episode": 3904, - "epoch": 0.07017291584282992, - "loss/policy_avg": -0.1629352867603302, - "lr": 9.844708588957056e-06, - "objective/entropy": 150.56808471679688, - "objective/kl": 22.077739715576172, - "objective/non_score_reward": -2.2077741622924805, - "objective/rlhf_reward": -8.831096112728119, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.039865970611572, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4642740786075592, - "step": 243, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.013947010040283 - }, - { - "episode": 3920, - "epoch": 0.07046050976021857, - "loss/policy_avg": 4.2705912590026855, - "lr": 9.844069529652353e-06, - "objective/entropy": -73.61671447753906, - "objective/kl": 27.2436580657959, - "objective/non_score_reward": -2.724365711212158, - "objective/rlhf_reward": -9.072634573253701, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.7233123779296875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5502868294715881, - "step": 244, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990230798721313 - }, - { - "episode": 3936, - "epoch": 0.07074810367760721, - "loss/policy_avg": 0.09502686560153961, - "lr": 9.84343047034765e-06, - "objective/entropy": 38.153350830078125, - "objective/kl": 25.953601837158203, - "objective/non_score_reward": -2.595360040664673, - "objective/rlhf_reward": -8.434029231743748, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 183.2377471923828, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8148726224899292, - "step": 245, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989858865737915 - }, - { - "episode": 3952, - "epoch": 0.07103569759499587, - "loss/policy_avg": 0.36105144023895264, - "lr": 9.842791411042945e-06, - "objective/entropy": 46.69014358520508, - "objective/kl": 24.270606994628906, - "objective/non_score_reward": -2.427060842514038, - "objective/rlhf_reward": -8.329641216484408, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 23.915287017822266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4709934592247009, - "step": 246, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001595973968506 - }, - { - "episode": 3968, - "epoch": 0.0713232915123845, - "loss/policy_avg": 0.3951423168182373, - "lr": 9.842152351738242e-06, - "objective/entropy": 0.16453170776367188, - "objective/kl": 27.542736053466797, - "objective/non_score_reward": -2.7542738914489746, - "objective/rlhf_reward": -11.017095446586609, - "objective/scores": 0.0, - "policy/approxkl_avg": 11.038375854492188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6403580904006958, - "step": 247, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978313446044922 - }, - { - "episode": 3984, - "epoch": 0.07161088542977316, - "loss/policy_avg": 0.2933734655380249, - "lr": 9.841513292433539e-06, - "objective/entropy": -41.10125732421875, - "objective/kl": 25.373741149902344, - "objective/non_score_reward": -2.5373740196228027, - "objective/rlhf_reward": -8.770894267646176, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.429267883300781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6396682262420654, - "step": 248, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0005202293395996 - }, - { - "episode": 4000, - "epoch": 0.07189847934716181, - "loss/policy_avg": 0.24670132994651794, - "lr": 9.840874233128836e-06, - "objective/entropy": -35.8713264465332, - "objective/kl": 30.457420349121094, - "objective/non_score_reward": -3.0457420349121094, - "objective/rlhf_reward": -10.449634091059366, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 79.78580474853516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41042375564575195, - "step": 249, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9961113929748535 - }, - { - "episode": 4016, - "epoch": 0.07218607326455045, - "loss/policy_avg": 0.017466381192207336, - "lr": 9.840235173824132e-06, - "objective/entropy": 87.24893188476562, - "objective/kl": 17.873748779296875, - "objective/non_score_reward": -1.7873749732971191, - "objective/rlhf_reward": -5.026793541685615, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 83.72406005859375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7392317056655884, - "step": 250, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985733032226562 - }, - { - "episode": 4032, - "epoch": 0.07247366718193911, - "loss/policy_avg": 0.209593266248703, - "lr": 9.83959611451943e-06, - "objective/entropy": -10.21453857421875, - "objective/kl": 26.26023292541504, - "objective/non_score_reward": -2.626023292541504, - "objective/rlhf_reward": -9.104092931747438, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.64996337890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5523943901062012, - "step": 251, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995850324630737 - }, - { - "episode": 4048, - "epoch": 0.07276126109932775, - "loss/policy_avg": 0.5933290719985962, - "lr": 9.838957055214724e-06, - "objective/entropy": -18.139259338378906, - "objective/kl": 29.199474334716797, - "objective/non_score_reward": -2.9199471473693848, - "objective/rlhf_reward": -10.27978894710541, - "objective/scores": 0.35, - "policy/approxkl_avg": 50.652503967285156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5432610511779785, - "step": 252, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9982495307922363 - }, - { - "episode": 4064, - "epoch": 0.0730488550167164, - "loss/policy_avg": 1.320284366607666, - "lr": 9.838317995910021e-06, - "objective/entropy": -10.506271362304688, - "objective/kl": 28.47583770751953, - "objective/non_score_reward": -2.847583532333374, - "objective/rlhf_reward": -9.44292337723249, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 83.18882751464844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5443971157073975, - "step": 253, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997657299041748 - }, - { - "episode": 4080, - "epoch": 0.07333644893410504, - "loss/policy_avg": -0.02555149793624878, - "lr": 9.837678936605318e-06, - "objective/entropy": -81.56509399414062, - "objective/kl": 15.26602840423584, - "objective/non_score_reward": -1.526602864265442, - "objective/rlhf_reward": -4.281582649025034, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 6.358033657073975, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6404141187667847, - "step": 254, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0027146339416504 - }, - { - "episode": 4096, - "epoch": 0.0736240428514937, - "loss/policy_avg": 0.4154921770095825, - "lr": 9.837039877300615e-06, - "objective/entropy": -86.56658935546875, - "objective/kl": 15.54503059387207, - "objective/non_score_reward": -1.5545029640197754, - "objective/rlhf_reward": -4.393183226856302, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 36.390655517578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9074146747589111, - "step": 255, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9989664554595947 - }, - { - "episode": 4112, - "epoch": 0.07391163676888234, - "loss/policy_avg": -0.2038569152355194, - "lr": 9.83640081799591e-06, - "objective/entropy": -80.65778350830078, - "objective/kl": 20.036571502685547, - "objective/non_score_reward": -2.003657102584839, - "objective/rlhf_reward": -6.189799661907266, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 28.666210174560547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7191000580787659, - "step": 256, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999082088470459 - }, - { - "episode": 4128, - "epoch": 0.07419923068627099, - "loss/policy_avg": 0.5487632751464844, - "lr": 9.835761758691207e-06, - "objective/entropy": 64.21192932128906, - "objective/kl": 25.55659294128418, - "objective/non_score_reward": -2.555659294128418, - "objective/rlhf_reward": -8.79880495806512, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 74.83338928222656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7529090046882629, - "step": 257, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991776943206787 - }, - { - "episode": 4144, - "epoch": 0.07448682460365963, - "loss/policy_avg": 0.8301103115081787, - "lr": 9.835122699386504e-06, - "objective/entropy": 152.20065307617188, - "objective/kl": 26.725215911865234, - "objective/non_score_reward": -2.6725215911865234, - "objective/rlhf_reward": -10.690086603164673, - "objective/scores": 0.0, - "policy/approxkl_avg": 86.60305786132812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3636325001716614, - "step": 258, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996506929397583 - }, - { - "episode": 4160, - "epoch": 0.07477441852104828, - "loss/policy_avg": 0.6052212119102478, - "lr": 9.8344836400818e-06, - "objective/entropy": 92.0700454711914, - "objective/kl": 20.43947982788086, - "objective/non_score_reward": -2.043948173522949, - "objective/rlhf_reward": -3.775792723894119, - "objective/scores": 1.1, - "policy/approxkl_avg": 6.338429927825928, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4831230342388153, - "step": 259, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987390041351318 - }, - { - "episode": 4176, - "epoch": 0.07506201243843692, - "loss/policy_avg": 0.33531126379966736, - "lr": 9.833844580777096e-06, - "objective/entropy": 103.8875732421875, - "objective/kl": 41.16206741333008, - "objective/non_score_reward": -4.116207122802734, - "objective/rlhf_reward": -14.860707316462118, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 157.35191345214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5286747217178345, - "step": 260, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989887475967407 - }, - { - "episode": 4192, - "epoch": 0.07534960635582558, - "loss/policy_avg": 0.8983044624328613, - "lr": 9.833205521472393e-06, - "objective/entropy": -19.21771812438965, - "objective/kl": 27.187969207763672, - "objective/non_score_reward": -2.718796968460083, - "objective/rlhf_reward": -8.927777002530036, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 116.0262451171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5258731842041016, - "step": 261, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985415935516357 - }, - { - "episode": 4208, - "epoch": 0.07563720027321422, - "loss/policy_avg": 0.3744966983795166, - "lr": 9.83256646216769e-06, - "objective/entropy": 108.31391906738281, - "objective/kl": 27.059907913208008, - "objective/non_score_reward": -2.705990791320801, - "objective/rlhf_reward": -9.090629831949869, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 132.42181396484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7807722687721252, - "step": 262, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999628067016602 - }, - { - "episode": 4224, - "epoch": 0.07592479419060287, - "loss/policy_avg": -0.06834838539361954, - "lr": 9.831927402862987e-06, - "objective/entropy": -89.212890625, - "objective/kl": 21.477336883544922, - "objective/non_score_reward": -2.147733688354492, - "objective/rlhf_reward": -4.190934514999389, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.770085573196411, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6721138954162598, - "step": 263, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005064010620117 - }, - { - "episode": 4240, - "epoch": 0.07621238810799151, - "loss/policy_avg": 0.20960178971290588, - "lr": 9.831288343558284e-06, - "objective/entropy": 7.579254150390625, - "objective/kl": 31.429780960083008, - "objective/non_score_reward": -3.1429781913757324, - "objective/rlhf_reward": -11.212662303183954, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 39.11629104614258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7809767723083496, - "step": 264, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.996565580368042 - }, - { - "episode": 4256, - "epoch": 0.07649998202538016, - "loss/policy_avg": 0.37524640560150146, - "lr": 9.830649284253579e-06, - "objective/entropy": 211.3717498779297, - "objective/kl": 22.981361389160156, - "objective/non_score_reward": -2.2981362342834473, - "objective/rlhf_reward": -6.792545056343078, - "objective/scores": 0.6, - "policy/approxkl_avg": 6.7515716552734375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7717372179031372, - "step": 265, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991950988769531 - }, - { - "episode": 4272, - "epoch": 0.0767875759427688, - "loss/policy_avg": 1.0095475912094116, - "lr": 9.830010224948876e-06, - "objective/entropy": -20.248001098632812, - "objective/kl": 24.134700775146484, - "objective/non_score_reward": -2.4134700298309326, - "objective/rlhf_reward": -7.920546785990396, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 72.81602478027344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48144859075546265, - "step": 266, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9972069263458252 - }, - { - "episode": 4288, - "epoch": 0.07707516986015746, - "loss/policy_avg": 0.14088629186153412, - "lr": 9.829371165644173e-06, - "objective/entropy": 199.36297607421875, - "objective/kl": 21.469898223876953, - "objective/non_score_reward": -2.1469898223876953, - "objective/rlhf_reward": -7.031699865069941, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.675331115722656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.8196889162063599, - "step": 267, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000971555709839 - }, - { - "episode": 4304, - "epoch": 0.0773627637775461, - "loss/policy_avg": 0.7135397791862488, - "lr": 9.82873210633947e-06, - "objective/entropy": 132.78390502929688, - "objective/kl": 29.841154098510742, - "objective/non_score_reward": -2.9841156005859375, - "objective/rlhf_reward": -10.485864262194976, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 51.49626159667969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6590239405632019, - "step": 268, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994990825653076 - }, - { - "episode": 4320, - "epoch": 0.07765035769493475, - "loss/policy_avg": 0.6342403888702393, - "lr": 9.828093047034766e-06, - "objective/entropy": 68.02133178710938, - "objective/kl": 25.947755813598633, - "objective/non_score_reward": -2.594775676727295, - "objective/rlhf_reward": -9.000500419226986, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.322699546813965, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7533285617828369, - "step": 269, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998431205749512 - }, - { - "episode": 4336, - "epoch": 0.0779379516123234, - "loss/policy_avg": 1.3432139158248901, - "lr": 9.827453987730061e-06, - "objective/entropy": -63.51703643798828, - "objective/kl": 25.882217407226562, - "objective/non_score_reward": -2.588221788406372, - "objective/rlhf_reward": -8.230180921331915, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 106.42034912109375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3954962491989136, - "step": 270, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984210729599 - }, - { - "episode": 4352, - "epoch": 0.07822554552971205, - "loss/policy_avg": 0.9003316760063171, - "lr": 9.826814928425358e-06, - "objective/entropy": 303.42669677734375, - "objective/kl": 33.25891876220703, - "objective/non_score_reward": -3.325892448425293, - "objective/rlhf_reward": -11.822616937573315, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 65.77352905273438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7454761266708374, - "step": 271, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9990100860595703 - }, - { - "episode": 4368, - "epoch": 0.0785131394471007, - "loss/policy_avg": 1.1572515964508057, - "lr": 9.826175869120655e-06, - "objective/entropy": -59.230491638183594, - "objective/kl": 25.21849250793457, - "objective/non_score_reward": -2.5218493938446045, - "objective/rlhf_reward": -10.087397575378418, - "objective/scores": 0.0, - "policy/approxkl_avg": 142.75778198242188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5425001382827759, - "step": 272, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998133897781372 - }, - { - "episode": 4384, - "epoch": 0.07880073336448934, - "loss/policy_avg": 0.17176832258701324, - "lr": 9.825536809815952e-06, - "objective/entropy": 213.77191162109375, - "objective/kl": 31.61981773376465, - "objective/non_score_reward": -3.1619815826416016, - "objective/rlhf_reward": -10.7005155784654, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 128.8477783203125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5492858290672302, - "step": 273, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998712539672852 - }, - { - "episode": 4400, - "epoch": 0.079088327281878, - "loss/policy_avg": -0.22184377908706665, - "lr": 9.824897750511249e-06, - "objective/entropy": 161.00198364257812, - "objective/kl": 34.806671142578125, - "objective/non_score_reward": -3.4806675910949707, - "objective/rlhf_reward": -11.799963655249151, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 48.8912239074707, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4313841462135315, - "step": 274, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0014257431030273 - }, - { - "episode": 4416, - "epoch": 0.07937592119926663, - "loss/policy_avg": 0.590415358543396, - "lr": 9.824258691206546e-06, - "objective/entropy": -94.14356231689453, - "objective/kl": 28.92959976196289, - "objective/non_score_reward": -2.8929600715637207, - "objective/rlhf_reward": -9.90998030227481, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.575645923614502, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.449258029460907, - "step": 275, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994723796844482 - }, - { - "episode": 4432, - "epoch": 0.07966351511665529, - "loss/policy_avg": 0.2740442454814911, - "lr": 9.823619631901841e-06, - "objective/entropy": 56.66014099121094, - "objective/kl": 24.139942169189453, - "objective/non_score_reward": -2.413994073867798, - "objective/rlhf_reward": -7.533270301596199, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 41.256080627441406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6261377334594727, - "step": 276, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990897178649902 - }, - { - "episode": 4448, - "epoch": 0.07995110903404393, - "loss/policy_avg": 0.026854295283555984, - "lr": 9.822980572597138e-06, - "objective/entropy": 135.07037353515625, - "objective/kl": 30.443017959594727, - "objective/non_score_reward": -3.044301748275757, - "objective/rlhf_reward": -12.177206993103027, - "objective/scores": 0.0, - "policy/approxkl_avg": 14.024923324584961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5266727209091187, - "step": 277, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980167150497437 - }, - { - "episode": 4464, - "epoch": 0.08023870295143258, - "loss/policy_avg": 0.0908375084400177, - "lr": 9.822341513292433e-06, - "objective/entropy": 98.10940551757812, - "objective/kl": 26.351314544677734, - "objective/non_score_reward": -2.635131359100342, - "objective/rlhf_reward": -9.059573057110667, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 61.92028045654297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5201822519302368, - "step": 278, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9965415000915527 - }, - { - "episode": 4480, - "epoch": 0.08052629686882122, - "loss/policy_avg": 0.3492497205734253, - "lr": 9.82170245398773e-06, - "objective/entropy": 79.57078552246094, - "objective/kl": 28.74835205078125, - "objective/non_score_reward": -2.8748350143432617, - "objective/rlhf_reward": -7.099340653419494, - "objective/scores": 1.1, - "policy/approxkl_avg": 45.850738525390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7365690469741821, - "step": 279, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999916553497314 - }, - { - "episode": 4496, - "epoch": 0.08081389078620987, - "loss/policy_avg": 0.6324511170387268, - "lr": 9.821063394683027e-06, - "objective/entropy": 116.90592956542969, - "objective/kl": 33.273155212402344, - "objective/non_score_reward": -3.3273158073425293, - "objective/rlhf_reward": -8.90926299095154, - "objective/scores": 1.1, - "policy/approxkl_avg": 50.5905647277832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5080363750457764, - "step": 280, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990668296813965 - }, - { - "episode": 4512, - "epoch": 0.08110148470359851, - "loss/policy_avg": -0.1385992020368576, - "lr": 9.820424335378324e-06, - "objective/entropy": 72.11842346191406, - "objective/kl": 33.207122802734375, - "objective/non_score_reward": -3.320712089538574, - "objective/rlhf_reward": -13.282849073410034, - "objective/scores": 0.0, - "policy/approxkl_avg": 60.59511184692383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8196091651916504, - "step": 281, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998537302017212 - }, - { - "episode": 4528, - "epoch": 0.08138907862098717, - "loss/policy_avg": -0.2620585262775421, - "lr": 9.81978527607362e-06, - "objective/entropy": -5.884607315063477, - "objective/kl": 39.53453063964844, - "objective/non_score_reward": -3.9534530639648438, - "objective/rlhf_reward": -13.691106977240118, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 83.97123718261719, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4607764780521393, - "step": 282, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000446319580078 - }, - { - "episode": 4544, - "epoch": 0.08167667253837581, - "loss/policy_avg": 0.8184198141098022, - "lr": 9.819146216768916e-06, - "objective/entropy": -124.17362976074219, - "objective/kl": 30.42546844482422, - "objective/non_score_reward": -3.0425467491149902, - "objective/rlhf_reward": -10.566067729059775, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 20.279199600219727, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.602076530456543, - "step": 283, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996403455734253 - }, - { - "episode": 4560, - "epoch": 0.08196426645576446, - "loss/policy_avg": 0.1789843738079071, - "lr": 9.818507157464213e-06, - "objective/entropy": 173.48333740234375, - "objective/kl": 23.40087890625, - "objective/non_score_reward": -2.340087890625, - "objective/rlhf_reward": -7.981749632445675, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 17.03640365600586, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6565302014350891, - "step": 284, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.004666566848755 - }, - { - "episode": 4576, - "epoch": 0.0822518603731531, - "loss/policy_avg": 1.0035152435302734, - "lr": 9.81786809815951e-06, - "objective/entropy": 18.757537841796875, - "objective/kl": 24.085613250732422, - "objective/non_score_reward": -2.4085617065429688, - "objective/rlhf_reward": -5.2342465877532955, - "objective/scores": 1.1, - "policy/approxkl_avg": 54.95973587036133, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5847882032394409, - "step": 285, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993207454681396 - }, - { - "episode": 4592, - "epoch": 0.08253945429054176, - "loss/policy_avg": 5.199029922485352, - "lr": 9.817229038854806e-06, - "objective/entropy": -160.87271118164062, - "objective/kl": 20.840656280517578, - "objective/non_score_reward": -2.0840654373168945, - "objective/rlhf_reward": -5.936261987686157, - "objective/scores": 0.6, - "policy/approxkl_avg": 9.209554672241211, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6878505945205688, - "step": 286, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999942779541016 - }, - { - "episode": 4608, - "epoch": 0.0828270482079304, - "loss/policy_avg": 1.134081244468689, - "lr": 9.816589979550103e-06, - "objective/entropy": 120.20220947265625, - "objective/kl": 32.1230583190918, - "objective/non_score_reward": -3.212306499481201, - "objective/rlhf_reward": -11.449225521087648, - "objective/scores": 0.35, - "policy/approxkl_avg": 37.81696319580078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8285540342330933, - "step": 287, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991226196289062 - }, - { - "episode": 4624, - "epoch": 0.08311464212531905, - "loss/policy_avg": 0.17092914879322052, - "lr": 9.8159509202454e-06, - "objective/entropy": 6.329719543457031, - "objective/kl": 29.584348678588867, - "objective/non_score_reward": -2.95843505859375, - "objective/rlhf_reward": -10.171880011976349, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 29.629112243652344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5302486419677734, - "step": 288, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0020604133605957 - }, - { - "episode": 4640, - "epoch": 0.0834022360427077, - "loss/policy_avg": 0.17788568139076233, - "lr": 9.815311860940695e-06, - "objective/entropy": 21.96484375, - "objective/kl": 28.446231842041016, - "objective/non_score_reward": -2.84462308883667, - "objective/rlhf_reward": -9.431081603245671, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 137.49514770507812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6393083333969116, - "step": 289, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981507062911987 - }, - { - "episode": 4656, - "epoch": 0.08368982996009634, - "loss/policy_avg": 0.4766189754009247, - "lr": 9.814672801635992e-06, - "objective/entropy": 87.13041687011719, - "objective/kl": 26.18436050415039, - "objective/non_score_reward": -2.618436098098755, - "objective/rlhf_reward": -9.095142700759274, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 64.16291809082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4916858971118927, - "step": 290, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992941617965698 - }, - { - "episode": 4672, - "epoch": 0.083977423877485, - "loss/policy_avg": 7.575510025024414, - "lr": 9.81403374233129e-06, - "objective/entropy": -187.93580627441406, - "objective/kl": 21.01421356201172, - "objective/non_score_reward": -2.101421356201172, - "objective/rlhf_reward": -6.9550868078187555, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 58.152530670166016, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.793678343296051, - "step": 291, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997904300689697 - }, - { - "episode": 4688, - "epoch": 0.08426501779487364, - "loss/policy_avg": 0.3069241940975189, - "lr": 9.813394683026586e-06, - "objective/entropy": 95.74089050292969, - "objective/kl": 22.938138961791992, - "objective/non_score_reward": -2.293813943862915, - "objective/rlhf_reward": -7.052549543158088, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 9.722650527954102, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5521177053451538, - "step": 292, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989383220672607 - }, - { - "episode": 4704, - "epoch": 0.08455261171226229, - "loss/policy_avg": 0.8028863072395325, - "lr": 9.812755623721883e-06, - "objective/entropy": 225.46250915527344, - "objective/kl": 32.304569244384766, - "objective/non_score_reward": -3.230457305908203, - "objective/rlhf_reward": -11.18849541346232, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 103.39628601074219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5377808809280396, - "step": 293, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996423721313477 - }, - { - "episode": 4720, - "epoch": 0.08484020562965093, - "loss/policy_avg": 0.5835884809494019, - "lr": 9.81211656441718e-06, - "objective/entropy": 75.27652740478516, - "objective/kl": 30.011789321899414, - "objective/non_score_reward": -3.0011792182922363, - "objective/rlhf_reward": -9.882009925619636, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 10.76335334777832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7725957632064819, - "step": 294, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982374906539917 - }, - { - "episode": 4736, - "epoch": 0.08512779954703958, - "loss/policy_avg": 0.17510247230529785, - "lr": 9.811477505112475e-06, - "objective/entropy": 153.28558349609375, - "objective/kl": 35.96855926513672, - "objective/non_score_reward": -3.596856117248535, - "objective/rlhf_reward": -12.906472566540598, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 19.366321563720703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6706559658050537, - "step": 295, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993175268173218 - }, - { - "episode": 4752, - "epoch": 0.08541539346442822, - "loss/policy_avg": 0.4794872999191284, - "lr": 9.810838445807772e-06, - "objective/entropy": 254.9187469482422, - "objective/kl": 34.023677825927734, - "objective/non_score_reward": -3.4023680686950684, - "objective/rlhf_reward": -12.158873777003631, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 62.12803268432617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6503519415855408, - "step": 296, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984500408172607 - }, - { - "episode": 4768, - "epoch": 0.08570298738181688, - "loss/policy_avg": 1.1904816627502441, - "lr": 9.810199386503069e-06, - "objective/entropy": 146.021484375, - "objective/kl": 35.92856216430664, - "objective/non_score_reward": -3.5928561687469482, - "objective/rlhf_reward": -12.424013684468207, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 37.72700500488281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6158914566040039, - "step": 297, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981852769851685 - }, - { - "episode": 4784, - "epoch": 0.08599058129920552, - "loss/policy_avg": 0.000278279185295105, - "lr": 9.809560327198366e-06, - "objective/entropy": 178.57492065429688, - "objective/kl": 34.800636291503906, - "objective/non_score_reward": -3.4800639152526855, - "objective/rlhf_reward": -10.99653712356207, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 41.639854431152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7849889993667603, - "step": 298, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984562397003174 - }, - { - "episode": 4800, - "epoch": 0.08627817521659417, - "loss/policy_avg": 0.7629772424697876, - "lr": 9.808921267893663e-06, - "objective/entropy": -145.59861755371094, - "objective/kl": 28.413082122802734, - "objective/non_score_reward": -2.841308116912842, - "objective/rlhf_reward": -6.965232110023498, - "objective/scores": 1.1, - "policy/approxkl_avg": 13.004857063293457, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.684173047542572, - "step": 299, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994359016418457 - }, - { - "episode": 4816, - "epoch": 0.08656576913398281, - "loss/policy_avg": 1.7354516983032227, - "lr": 9.808282208588958e-06, - "objective/entropy": 272.84912109375, - "objective/kl": 26.817108154296875, - "objective/non_score_reward": -2.681710720062256, - "objective/rlhf_reward": -9.211070620807346, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 35.25104904174805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8183693885803223, - "step": 300, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998483657836914 - }, - { - "episode": 4832, - "epoch": 0.08685336305137147, - "loss/policy_avg": 0.06534934043884277, - "lr": 9.807643149284255e-06, - "objective/entropy": 152.22633361816406, - "objective/kl": 30.80361557006836, - "objective/non_score_reward": -3.0803616046905518, - "objective/rlhf_reward": -12.321446180343628, - "objective/scores": 0.0, - "policy/approxkl_avg": 187.40298461914062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619062066078186, - "step": 301, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9991655349731445 - }, - { - "episode": 4848, - "epoch": 0.0871409569687601, - "loss/policy_avg": 1.8463071584701538, - "lr": 9.80700408997955e-06, - "objective/entropy": -59.8196907043457, - "objective/kl": 31.326427459716797, - "objective/non_score_reward": -3.132642984390259, - "objective/rlhf_reward": -11.014799916537937, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 56.62882995605469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6038594245910645, - "step": 302, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993600845336914 - }, - { - "episode": 4864, - "epoch": 0.08742855088614876, - "loss/policy_avg": 0.08039037883281708, - "lr": 9.806365030674847e-06, - "objective/entropy": 40.064144134521484, - "objective/kl": 22.286996841430664, - "objective/non_score_reward": -2.2286999225616455, - "objective/rlhf_reward": -7.514799362421035, - "objective/scores": 0.35, - "policy/approxkl_avg": 38.59841537475586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49097996950149536, - "step": 303, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9966613054275513 - }, - { - "episode": 4880, - "epoch": 0.0877161448035374, - "loss/policy_avg": 0.01872839219868183, - "lr": 9.805725971370144e-06, - "objective/entropy": 58.7380256652832, - "objective/kl": 28.672008514404297, - "objective/non_score_reward": -2.8672008514404297, - "objective/rlhf_reward": -9.735470251242319, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 149.07861328125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6022211313247681, - "step": 304, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004193782806396 - }, - { - "episode": 4896, - "epoch": 0.08800373872092605, - "loss/policy_avg": 0.5821743011474609, - "lr": 9.80508691206544e-06, - "objective/entropy": -12.124443054199219, - "objective/kl": 24.10376739501953, - "objective/non_score_reward": -2.410377025604248, - "objective/rlhf_reward": -5.241507506370544, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.3420569896698, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6286916732788086, - "step": 305, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002763509750366 - }, - { - "episode": 4912, - "epoch": 0.08829133263831469, - "loss/policy_avg": 0.32468903064727783, - "lr": 9.804447852760737e-06, - "objective/entropy": -245.09518432617188, - "objective/kl": 25.548696517944336, - "objective/non_score_reward": -2.5548696517944336, - "objective/rlhf_reward": -10.219478368759155, - "objective/scores": 0.0, - "policy/approxkl_avg": 18.726303100585938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.633787989616394, - "step": 306, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003480911254883 - }, - { - "episode": 4928, - "epoch": 0.08857892655570335, - "loss/policy_avg": 0.5798380970954895, - "lr": 9.803808793456034e-06, - "objective/entropy": 91.35831451416016, - "objective/kl": 35.70774459838867, - "objective/non_score_reward": -3.570774555206299, - "objective/rlhf_reward": -14.283098220825195, - "objective/scores": 0.0, - "policy/approxkl_avg": 44.0499267578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4979282021522522, - "step": 307, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003609657287598 - }, - { - "episode": 4944, - "epoch": 0.088866520473092, - "loss/policy_avg": 0.36592239141464233, - "lr": 9.80316973415133e-06, - "objective/entropy": 39.27040100097656, - "objective/kl": 30.252880096435547, - "objective/non_score_reward": -3.025287628173828, - "objective/rlhf_reward": -10.775638136893434, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.1499075889587402, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6890300512313843, - "step": 308, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988046884536743 - }, - { - "episode": 4960, - "epoch": 0.08915411439048064, - "loss/policy_avg": 0.08172816783189774, - "lr": 9.802530674846626e-06, - "objective/entropy": -196.7550811767578, - "objective/kl": 30.32009506225586, - "objective/non_score_reward": -3.0320096015930176, - "objective/rlhf_reward": -9.204319153667662, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 8.101791381835938, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5759010910987854, - "step": 309, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000977516174316 - }, - { - "episode": 4976, - "epoch": 0.0894417083078693, - "loss/policy_avg": 0.5907818078994751, - "lr": 9.801891615541923e-06, - "objective/entropy": -3.5698318481445312, - "objective/kl": 28.213176727294922, - "objective/non_score_reward": -2.8213181495666504, - "objective/rlhf_reward": -9.72901317378576, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 56.35433578491211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6199610233306885, - "step": 310, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979722499847412 - }, - { - "episode": 4992, - "epoch": 0.08972930222525793, - "loss/policy_avg": 0.39707911014556885, - "lr": 9.80125255623722e-06, - "objective/entropy": -11.338485717773438, - "objective/kl": 24.322521209716797, - "objective/non_score_reward": -2.4322521686553955, - "objective/rlhf_reward": -8.350406386939389, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.5820951461792, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6451054811477661, - "step": 311, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0019586086273193 - }, - { - "episode": 5008, - "epoch": 0.09001689614264659, - "loss/policy_avg": -0.07866669446229935, - "lr": 9.800613496932517e-06, - "objective/entropy": 170.05404663085938, - "objective/kl": 28.295799255371094, - "objective/non_score_reward": -2.8295798301696777, - "objective/rlhf_reward": -9.894486983020869, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 40.782066345214844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6188048124313354, - "step": 312, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0017778873443604 - }, - { - "episode": 5024, - "epoch": 0.09030449006003523, - "loss/policy_avg": -0.23688295483589172, - "lr": 9.799974437627812e-06, - "objective/entropy": 156.63333129882812, - "objective/kl": 27.922500610351562, - "objective/non_score_reward": -2.792250156402588, - "objective/rlhf_reward": -9.718402723880157, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 22.294483184814453, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4276520609855652, - "step": 313, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0026590824127197 - }, - { - "episode": 5040, - "epoch": 0.09059208397742388, - "loss/policy_avg": 0.09796786308288574, - "lr": 9.799335378323109e-06, - "objective/entropy": -10.673637390136719, - "objective/kl": 20.40918731689453, - "objective/non_score_reward": -2.0409185886383057, - "objective/rlhf_reward": -8.163674473762512, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.275084495544434, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5387430787086487, - "step": 314, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982633590698242 - }, - { - "episode": 5056, - "epoch": 0.09087967789481252, - "loss/policy_avg": 0.17557716369628906, - "lr": 9.798696319018406e-06, - "objective/entropy": 20.533397674560547, - "objective/kl": 33.14729309082031, - "objective/non_score_reward": -3.3147292137145996, - "objective/rlhf_reward": -11.702658264842583, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 58.23655700683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6181402206420898, - "step": 315, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998816728591919 - }, - { - "episode": 5072, - "epoch": 0.09116727181220118, - "loss/policy_avg": 0.28663304448127747, - "lr": 9.798057259713703e-06, - "objective/entropy": 110.77783203125, - "objective/kl": 24.706939697265625, - "objective/non_score_reward": -2.470694065093994, - "objective/rlhf_reward": -8.278656039301472, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 21.429655075073242, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6165672540664673, - "step": 316, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001864433288574 - }, - { - "episode": 5088, - "epoch": 0.09145486572958982, - "loss/policy_avg": 0.0841158926486969, - "lr": 9.797418200409e-06, - "objective/entropy": 64.50070190429688, - "objective/kl": 32.75787353515625, - "objective/non_score_reward": -3.275787353515625, - "objective/rlhf_reward": -11.499028954569418, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 102.43559265136719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6068868637084961, - "step": 317, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997694492340088 - }, - { - "episode": 5104, - "epoch": 0.09174245964697847, - "loss/policy_avg": 0.35147473216056824, - "lr": 9.796779141104296e-06, - "objective/entropy": 208.5213623046875, - "objective/kl": 31.126712799072266, - "objective/non_score_reward": -3.112671375274658, - "objective/rlhf_reward": -11.10904937079492, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 146.6444091796875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7985448837280273, - "step": 318, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988431930541992 - }, - { - "episode": 5120, - "epoch": 0.09203005356436711, - "loss/policy_avg": 0.19098839163780212, - "lr": 9.796140081799592e-06, - "objective/entropy": -30.1602783203125, - "objective/kl": 31.919559478759766, - "objective/non_score_reward": -3.191955804824829, - "objective/rlhf_reward": -11.36782262325287, - "objective/scores": 0.35, - "policy/approxkl_avg": 31.554279327392578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7731765508651733, - "step": 319, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0018060207366943 - }, - { - "episode": 5136, - "epoch": 0.09231764748175576, - "loss/policy_avg": 0.05387501046061516, - "lr": 9.795501022494888e-06, - "objective/entropy": 109.4754638671875, - "objective/kl": 32.21202850341797, - "objective/non_score_reward": -3.2212026119232178, - "objective/rlhf_reward": -11.434212188334808, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 7.5359039306640625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5108368396759033, - "step": 320, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981523752212524 - }, - { - "episode": 5152, - "epoch": 0.0926052413991444, - "loss/policy_avg": 0.5724260210990906, - "lr": 9.794861963190185e-06, - "objective/entropy": 62.85846710205078, - "objective/kl": 30.164125442504883, - "objective/non_score_reward": -3.0164127349853516, - "objective/rlhf_reward": -10.332317308584848, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 44.99430465698242, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7216867208480835, - "step": 321, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000819683074951 - }, - { - "episode": 5168, - "epoch": 0.09289283531653306, - "loss/policy_avg": 0.23510941863059998, - "lr": 9.794222903885482e-06, - "objective/entropy": 88.79434204101562, - "objective/kl": 33.60057830810547, - "objective/non_score_reward": -3.360057830810547, - "objective/rlhf_reward": -11.04023096561432, - "objective/scores": 0.6, - "policy/approxkl_avg": 60.40937805175781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6445315480232239, - "step": 322, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001824378967285 - }, - { - "episode": 5184, - "epoch": 0.0931804292339217, - "loss/policy_avg": 0.10644792020320892, - "lr": 9.793583844580777e-06, - "objective/entropy": 73.26347351074219, - "objective/kl": 32.69441223144531, - "objective/non_score_reward": -3.2694411277770996, - "objective/rlhf_reward": -11.521504967418268, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 11.021139144897461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5057616829872131, - "step": 323, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9983140230178833 - }, - { - "episode": 5200, - "epoch": 0.09346802315131035, - "loss/policy_avg": 0.7994442582130432, - "lr": 9.792944785276074e-06, - "objective/entropy": 88.5349349975586, - "objective/kl": 25.706418991088867, - "objective/non_score_reward": -2.5706419944763184, - "objective/rlhf_reward": -8.62070823234378, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 55.555015563964844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.554456889629364, - "step": 324, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997270107269287 - }, - { - "episode": 5216, - "epoch": 0.09375561706869899, - "loss/policy_avg": 0.5393191576004028, - "lr": 9.792305725971371e-06, - "objective/entropy": 74.77957153320312, - "objective/kl": 36.75124740600586, - "objective/non_score_reward": -3.6751246452331543, - "objective/rlhf_reward": -12.967165247599283, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 22.211036682128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47837570309638977, - "step": 325, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0005834102630615 - }, - { - "episode": 5232, - "epoch": 0.09404321098608764, - "loss/policy_avg": 0.5926495790481567, - "lr": 9.791666666666666e-06, - "objective/entropy": 94.69478607177734, - "objective/kl": 32.18170166015625, - "objective/non_score_reward": -3.218170166015625, - "objective/rlhf_reward": -11.531045725851683, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 12.2184419631958, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9364026188850403, - "step": 326, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986941814422607 - }, - { - "episode": 5248, - "epoch": 0.0943308049034763, - "loss/policy_avg": 8.741055488586426, - "lr": 9.791027607361963e-06, - "objective/entropy": 13.209190368652344, - "objective/kl": 46.40322494506836, - "objective/non_score_reward": -4.640322208404541, - "objective/rlhf_reward": -17.21965341857019, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 196.84405517578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6481121182441711, - "step": 327, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997447729110718 - }, - { - "episode": 5264, - "epoch": 0.09461839882086494, - "loss/policy_avg": -0.0158542487770319, - "lr": 9.79038854805726e-06, - "objective/entropy": -67.68810272216797, - "objective/kl": 25.325042724609375, - "objective/non_score_reward": -2.5325045585632324, - "objective/rlhf_reward": -8.573758213725641, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.798250198364258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7733464241027832, - "step": 328, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998700499534607 - }, - { - "episode": 5280, - "epoch": 0.09490599273825359, - "loss/policy_avg": 0.06980250030755997, - "lr": 9.789749488752557e-06, - "objective/entropy": 66.16055297851562, - "objective/kl": 28.001384735107422, - "objective/non_score_reward": -2.8001387119293213, - "objective/rlhf_reward": -9.841305100654049, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.662395477294922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4329299330711365, - "step": 329, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9985547065734863 - }, - { - "episode": 5296, - "epoch": 0.09519358665564223, - "loss/policy_avg": 1.1175042390823364, - "lr": 9.789110429447854e-06, - "objective/entropy": 198.39385986328125, - "objective/kl": 35.409645080566406, - "objective/non_score_reward": -3.5409646034240723, - "objective/rlhf_reward": -12.041152181402715, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 80.42436218261719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4869406819343567, - "step": 330, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000159502029419 - }, - { - "episode": 5312, - "epoch": 0.09548118057303089, - "loss/policy_avg": 0.2751445472240448, - "lr": 9.78847137014315e-06, - "objective/entropy": 171.96897888183594, - "objective/kl": 39.34714889526367, - "objective/non_score_reward": -3.9347147941589355, - "objective/rlhf_reward": -13.914030189785073, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 72.23497009277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49671417474746704, - "step": 331, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995276927947998 - }, - { - "episode": 5328, - "epoch": 0.09576877449041953, - "loss/policy_avg": 0.7539587616920471, - "lr": 9.787832310838446e-06, - "objective/entropy": 8.914024353027344, - "objective/kl": 21.132511138916016, - "objective/non_score_reward": -2.113251209259033, - "objective/rlhf_reward": -6.848884735170918, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 55.41283416748047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5207578539848328, - "step": 332, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999730110168457 - }, - { - "episode": 5344, - "epoch": 0.09605636840780818, - "loss/policy_avg": 0.08111564069986343, - "lr": 9.787193251533743e-06, - "objective/entropy": -32.56279754638672, - "objective/kl": 26.932476043701172, - "objective/non_score_reward": -2.6932475566864014, - "objective/rlhf_reward": -9.447477314501924, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 31.9769344329834, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5189494490623474, - "step": 333, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998490810394287 - }, - { - "episode": 5360, - "epoch": 0.09634396232519682, - "loss/policy_avg": 0.12806567549705505, - "lr": 9.78655419222904e-06, - "objective/entropy": -60.638038635253906, - "objective/kl": 33.80628204345703, - "objective/non_score_reward": -3.3806281089782715, - "objective/rlhf_reward": -12.006740414889988, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 97.76350402832031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5655949115753174, - "step": 334, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991028308868408 - }, - { - "episode": 5376, - "epoch": 0.09663155624258547, - "loss/policy_avg": 0.4162527918815613, - "lr": 9.785915132924337e-06, - "objective/entropy": 73.74658203125, - "objective/kl": 28.956912994384766, - "objective/non_score_reward": -2.895691394805908, - "objective/rlhf_reward": -9.757937069210122, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 12.659797668457031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6926892995834351, - "step": 335, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990322589874268 - }, - { - "episode": 5392, - "epoch": 0.09691915015997411, - "loss/policy_avg": 0.6766362190246582, - "lr": 9.785276073619633e-06, - "objective/entropy": -167.6099090576172, - "objective/kl": 33.4842414855957, - "objective/non_score_reward": -3.3484244346618652, - "objective/rlhf_reward": -10.46997794950125, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 45.80317687988281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6405798196792603, - "step": 336, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999110221862793 - }, - { - "episode": 5408, - "epoch": 0.09720674407736277, - "loss/policy_avg": 0.7705954909324646, - "lr": 9.784637014314929e-06, - "objective/entropy": 189.44476318359375, - "objective/kl": 40.57612991333008, - "objective/non_score_reward": -4.057613372802734, - "objective/rlhf_reward": -14.283041070179877, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 47.49778747558594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7007959485054016, - "step": 337, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9955544471740723 - }, - { - "episode": 5424, - "epoch": 0.0974943379947514, - "loss/policy_avg": 0.8678327798843384, - "lr": 9.783997955010226e-06, - "objective/entropy": 138.7545166015625, - "objective/kl": 43.06449890136719, - "objective/non_score_reward": -4.306450366973877, - "objective/rlhf_reward": -15.775203089328155, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 187.52108764648438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6742819547653198, - "step": 338, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000286102294922 - }, - { - "episode": 5440, - "epoch": 0.09778193191214006, - "loss/policy_avg": 0.13020552694797516, - "lr": 9.783358895705522e-06, - "objective/entropy": -34.55393981933594, - "objective/kl": 27.52876091003418, - "objective/non_score_reward": -2.7528762817382812, - "objective/rlhf_reward": -9.530552032406687, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.538684844970703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.45005565881729126, - "step": 339, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000762462615967 - }, - { - "episode": 5456, - "epoch": 0.0980695258295287, - "loss/policy_avg": 0.8651669025421143, - "lr": 9.78271983640082e-06, - "objective/entropy": 184.3627471923828, - "objective/kl": 31.240346908569336, - "objective/non_score_reward": -3.124034881591797, - "objective/rlhf_reward": -12.49613881111145, - "objective/scores": 0.0, - "policy/approxkl_avg": 16.510074615478516, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4378349781036377, - "step": 340, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998918056488037 - }, - { - "episode": 5472, - "epoch": 0.09835711974691735, - "loss/policy_avg": 0.13001634180545807, - "lr": 9.782080777096116e-06, - "objective/entropy": 123.772705078125, - "objective/kl": 33.01024627685547, - "objective/non_score_reward": -3.3010246753692627, - "objective/rlhf_reward": -8.804098105430603, - "objective/scores": 1.1, - "policy/approxkl_avg": 55.6832275390625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7997548580169678, - "step": 341, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001884937286377 - }, - { - "episode": 5488, - "epoch": 0.098644713664306, - "loss/policy_avg": 0.538088321685791, - "lr": 9.781441717791413e-06, - "objective/entropy": 208.26202392578125, - "objective/kl": 28.19437026977539, - "objective/non_score_reward": -2.819437026977539, - "objective/rlhf_reward": -9.33033687897199, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 56.31122589111328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.775277316570282, - "step": 342, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9953358173370361 - }, - { - "episode": 5504, - "epoch": 0.09893230758169465, - "loss/policy_avg": -0.014354228973388672, - "lr": 9.780802658486708e-06, - "objective/entropy": -60.35287857055664, - "objective/kl": 25.630271911621094, - "objective/non_score_reward": -2.5630269050598145, - "objective/rlhf_reward": -8.926595482855959, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.3227713108062744, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4413827657699585, - "step": 343, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0038928985595703 - }, - { - "episode": 5520, - "epoch": 0.09921990149908329, - "loss/policy_avg": 0.27923208475112915, - "lr": 9.780163599182005e-06, - "objective/entropy": -24.742401123046875, - "objective/kl": 31.480648040771484, - "objective/non_score_reward": -3.1480648517608643, - "objective/rlhf_reward": -11.233009540770931, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.539884567260742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5237823724746704, - "step": 344, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998574256896973 - }, - { - "episode": 5536, - "epoch": 0.09950749541647194, - "loss/policy_avg": 0.1885061115026474, - "lr": 9.7795245398773e-06, - "objective/entropy": 182.22181701660156, - "objective/kl": 29.661117553710938, - "objective/non_score_reward": -2.966111660003662, - "objective/rlhf_reward": -10.522810748129515, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.2202861309051514, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6761025786399841, - "step": 345, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999419927597046 - }, - { - "episode": 5552, - "epoch": 0.0997950893338606, - "loss/policy_avg": 0.7343586087226868, - "lr": 9.778885480572597e-06, - "objective/entropy": 145.13526916503906, - "objective/kl": 45.35038375854492, - "objective/non_score_reward": -4.535038471221924, - "objective/rlhf_reward": -16.315324659618447, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 33.988563537597656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7404603958129883, - "step": 346, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995590329170227 - }, - { - "episode": 5568, - "epoch": 0.10008268325124924, - "loss/policy_avg": 0.6405590772628784, - "lr": 9.778246421267894e-06, - "objective/entropy": 162.7369842529297, - "objective/kl": 37.150367736816406, - "objective/non_score_reward": -3.7150371074676514, - "objective/rlhf_reward": -13.379196050579905, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 36.95792770385742, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8137757778167725, - "step": 347, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9953217506408691 - }, - { - "episode": 5584, - "epoch": 0.10037027716863789, - "loss/policy_avg": 0.13212129473686218, - "lr": 9.777607361963191e-06, - "objective/entropy": 206.94252014160156, - "objective/kl": 34.0411262512207, - "objective/non_score_reward": -3.4041128158569336, - "objective/rlhf_reward": -11.216450786590576, - "objective/scores": 0.6, - "policy/approxkl_avg": 133.2515869140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7521044015884399, - "step": 348, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959970712661743 - }, - { - "episode": 5600, - "epoch": 0.10065787108602653, - "loss/policy_avg": 0.9090590476989746, - "lr": 9.776968302658488e-06, - "objective/entropy": 55.456298828125, - "objective/kl": 24.91229248046875, - "objective/non_score_reward": -2.4912290573120117, - "objective/rlhf_reward": -8.140087719234536, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 29.07049560546875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8177493810653687, - "step": 349, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979220628738403 - }, - { - "episode": 5616, - "epoch": 0.10094546500341518, - "loss/policy_avg": 0.46943986415863037, - "lr": 9.776329243353783e-06, - "objective/entropy": 153.11770629882812, - "objective/kl": 31.714759826660156, - "objective/non_score_reward": -3.171476125717163, - "objective/rlhf_reward": -9.762185250164244, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 49.198020935058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3995407819747925, - "step": 350, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9990980625152588 - }, - { - "episode": 5632, - "epoch": 0.10123305892080382, - "loss/policy_avg": 0.12656962871551514, - "lr": 9.77569018404908e-06, - "objective/entropy": 109.22264862060547, - "objective/kl": 28.461389541625977, - "objective/non_score_reward": -2.8461389541625977, - "objective/rlhf_reward": -8.46083704078314, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.512847900390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.588497519493103, - "step": 351, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977093935012817 - }, - { - "episode": 5648, - "epoch": 0.10152065283819248, - "loss/policy_avg": 0.7170840501785278, - "lr": 9.775051124744377e-06, - "objective/entropy": 14.107101440429688, - "objective/kl": 41.7979736328125, - "objective/non_score_reward": -4.179797172546387, - "objective/rlhf_reward": -15.377552798300414, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 11.696022987365723, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7339128255844116, - "step": 352, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979870319366455 - }, - { - "episode": 5664, - "epoch": 0.10180824675558112, - "loss/policy_avg": 0.8306883573532104, - "lr": 9.774412065439674e-06, - "objective/entropy": -67.41658782958984, - "objective/kl": 26.34395408630371, - "objective/non_score_reward": -2.6343955993652344, - "objective/rlhf_reward": -8.137582039833068, - "objective/scores": 0.6, - "policy/approxkl_avg": 115.25839233398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6014617681503296, - "step": 353, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970800876617432 - }, - { - "episode": 5680, - "epoch": 0.10209584067296977, - "loss/policy_avg": 2.176168918609619, - "lr": 9.77377300613497e-06, - "objective/entropy": 134.90728759765625, - "objective/kl": 31.819995880126953, - "objective/non_score_reward": -3.181999683380127, - "objective/rlhf_reward": -12.727998733520508, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.059894561767578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5675879716873169, - "step": 354, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992128610610962 - }, - { - "episode": 5696, - "epoch": 0.10238343459035841, - "loss/policy_avg": 0.9548969268798828, - "lr": 9.773133946830267e-06, - "objective/entropy": -45.11736297607422, - "objective/kl": 30.003692626953125, - "objective/non_score_reward": -3.000369071960449, - "objective/rlhf_reward": -10.659840753584533, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 80.65755462646484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6548440456390381, - "step": 355, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9980876445770264 - }, - { - "episode": 5712, - "epoch": 0.10267102850774706, - "loss/policy_avg": -0.09791913628578186, - "lr": 9.772494887525563e-06, - "objective/entropy": 59.10938262939453, - "objective/kl": 24.62106704711914, - "objective/non_score_reward": -2.462106943130493, - "objective/rlhf_reward": -8.489177667830868, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 34.42068099975586, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7636164426803589, - "step": 356, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001443862915039 - }, - { - "episode": 5728, - "epoch": 0.1029586224251357, - "loss/policy_avg": 0.6120666265487671, - "lr": 9.77185582822086e-06, - "objective/entropy": 222.30874633789062, - "objective/kl": 32.64442825317383, - "objective/non_score_reward": -3.2644426822662354, - "objective/rlhf_reward": -11.576818349774243, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.193035125732422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5914427042007446, - "step": 357, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002417802810669 - }, - { - "episode": 5744, - "epoch": 0.10324621634252436, - "loss/policy_avg": 0.25659894943237305, - "lr": 9.771216768916156e-06, - "objective/entropy": 173.52723693847656, - "objective/kl": 29.877527236938477, - "objective/non_score_reward": -2.987752914428711, - "objective/rlhf_reward": -7.551011657714843, - "objective/scores": 1.1, - "policy/approxkl_avg": 18.964191436767578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7272264361381531, - "step": 358, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968700408935547 - }, - { - "episode": 5760, - "epoch": 0.103533810259913, - "loss/policy_avg": 0.4551319479942322, - "lr": 9.770577709611453e-06, - "objective/entropy": 66.63546752929688, - "objective/kl": 29.777273178100586, - "objective/non_score_reward": -2.9777274131774902, - "objective/rlhf_reward": -9.510909175872802, - "objective/scores": 0.6, - "policy/approxkl_avg": 110.96263885498047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.554497241973877, - "step": 359, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965577125549316 - }, - { - "episode": 5776, - "epoch": 0.10382140417730165, - "loss/policy_avg": 1.5252394676208496, - "lr": 9.76993865030675e-06, - "objective/entropy": -97.26277923583984, - "objective/kl": 33.4285888671875, - "objective/non_score_reward": -3.3428590297698975, - "objective/rlhf_reward": -11.890483262951733, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 8.201589584350586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4571065902709961, - "step": 360, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999207258224487 - }, - { - "episode": 5792, - "epoch": 0.10410899809469029, - "loss/policy_avg": 1.1003179550170898, - "lr": 9.769299591002045e-06, - "objective/entropy": 51.82417297363281, - "objective/kl": 34.724029541015625, - "objective/non_score_reward": -3.472402811050415, - "objective/rlhf_reward": -12.33335193893011, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.241430282592773, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7186964750289917, - "step": 361, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989242553710938 - }, - { - "episode": 5808, - "epoch": 0.10439659201207895, - "loss/policy_avg": 0.40074190497398376, - "lr": 9.768660531697342e-06, - "objective/entropy": 219.26010131835938, - "objective/kl": 36.2478141784668, - "objective/non_score_reward": -3.6247811317443848, - "objective/rlhf_reward": -12.895005021158774, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.6230387687683105, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5367094278335571, - "step": 362, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987454414367676 - }, - { - "episode": 5824, - "epoch": 0.10468418592946759, - "loss/policy_avg": 0.9861453771591187, - "lr": 9.768021472392639e-06, - "objective/entropy": -9.609394073486328, - "objective/kl": 39.06307601928711, - "objective/non_score_reward": -3.9063076972961426, - "objective/rlhf_reward": -14.109459364207918, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 78.4552993774414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6734092235565186, - "step": 363, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968814849853516 - }, - { - "episode": 5840, - "epoch": 0.10497177984685624, - "loss/policy_avg": 0.18136531114578247, - "lr": 9.767382413087936e-06, - "objective/entropy": 78.3685073852539, - "objective/kl": 38.321044921875, - "objective/non_score_reward": -3.8321046829223633, - "objective/rlhf_reward": -13.503589506420205, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 127.91275787353516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46624571084976196, - "step": 364, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0008468627929688 - }, - { - "episode": 5856, - "epoch": 0.10525937376424489, - "loss/policy_avg": -0.3799706697463989, - "lr": 9.766743353783233e-06, - "objective/entropy": 138.2041473388672, - "objective/kl": 46.876441955566406, - "objective/non_score_reward": -4.6876444816589355, - "objective/rlhf_reward": -14.350577926635744, - "objective/scores": 1.1, - "policy/approxkl_avg": 66.94557189941406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5722706317901611, - "step": 365, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999732971191406 - }, - { - "episode": 5872, - "epoch": 0.10554696768163353, - "loss/policy_avg": 0.034319084137678146, - "lr": 9.76610429447853e-06, - "objective/entropy": 45.21516418457031, - "objective/kl": 30.351581573486328, - "objective/non_score_reward": -3.035158157348633, - "objective/rlhf_reward": -9.21691409194586, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.7516632080078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4482240676879883, - "step": 366, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992876052856445 - }, - { - "episode": 5888, - "epoch": 0.10583456159902219, - "loss/policy_avg": 0.8865995407104492, - "lr": 9.765465235173825e-06, - "objective/entropy": -21.33509063720703, - "objective/kl": 35.2110595703125, - "objective/non_score_reward": -3.5211057662963867, - "objective/rlhf_reward": -12.35109020868937, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 39.482017517089844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6004269123077393, - "step": 367, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9980597496032715 - }, - { - "episode": 5904, - "epoch": 0.10612215551641083, - "loss/policy_avg": 0.14120006561279297, - "lr": 9.764826175869122e-06, - "objective/entropy": 314.3269348144531, - "objective/kl": 33.36817932128906, - "objective/non_score_reward": -3.336818218231201, - "objective/rlhf_reward": -11.68541360420047, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 111.91177368164062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7748069763183594, - "step": 368, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0023789405822754 - }, - { - "episode": 5920, - "epoch": 0.10640974943379948, - "loss/policy_avg": 1.3205476999282837, - "lr": 9.764187116564417e-06, - "objective/entropy": -41.12682342529297, - "objective/kl": 31.178136825561523, - "objective/non_score_reward": -3.1178135871887207, - "objective/rlhf_reward": -10.990302207882763, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 50.676719665527344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9075043797492981, - "step": 369, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977303743362427 - }, - { - "episode": 5936, - "epoch": 0.10669734335118812, - "loss/policy_avg": 0.4172307848930359, - "lr": 9.763548057259714e-06, - "objective/entropy": 151.11341857910156, - "objective/kl": 29.471710205078125, - "objective/non_score_reward": -2.947171211242676, - "objective/rlhf_reward": -10.410082795707089, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 31.20602035522461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6370272636413574, - "step": 370, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985219240188599 - }, - { - "episode": 5952, - "epoch": 0.10698493726857677, - "loss/policy_avg": -0.09500053524971008, - "lr": 9.76290899795501e-06, - "objective/entropy": -34.93052673339844, - "objective/kl": 32.19451904296875, - "objective/non_score_reward": -3.219452142715454, - "objective/rlhf_reward": -11.273688945833761, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 64.82252502441406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7023112773895264, - "step": 371, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001070499420166 - }, - { - "episode": 5968, - "epoch": 0.10727253118596541, - "loss/policy_avg": 0.6650490760803223, - "lr": 9.762269938650308e-06, - "objective/entropy": -44.10865783691406, - "objective/kl": 27.115589141845703, - "objective/non_score_reward": -2.7115590572357178, - "objective/rlhf_reward": -9.021407242092202, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 34.185760498046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7515213489532471, - "step": 372, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9973523616790771 - }, - { - "episode": 5984, - "epoch": 0.10756012510335407, - "loss/policy_avg": 0.7072340846061707, - "lr": 9.761630879345604e-06, - "objective/entropy": 4.434268951416016, - "objective/kl": 43.21569061279297, - "objective/non_score_reward": -4.321569442749023, - "objective/rlhf_reward": -15.770505034717257, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.586810350418091, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6180188655853271, - "step": 373, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001654624938965 - }, - { - "episode": 6000, - "epoch": 0.10784771902074271, - "loss/policy_avg": 1.28859281539917, - "lr": 9.7609918200409e-06, - "objective/entropy": -139.96766662597656, - "objective/kl": 30.635095596313477, - "objective/non_score_reward": -3.063509464263916, - "objective/rlhf_reward": -7.854037737846375, - "objective/scores": 1.1, - "policy/approxkl_avg": 153.5921630859375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7665011882781982, - "step": 374, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9986308813095093 - }, - { - "episode": 6016, - "epoch": 0.10813531293813136, - "loss/policy_avg": 1.1559712886810303, - "lr": 9.760352760736196e-06, - "objective/entropy": 112.28376007080078, - "objective/kl": 48.56169891357422, - "objective/non_score_reward": -4.856169700622559, - "objective/rlhf_reward": -18.00084741850671, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 29.986862182617188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.86993408203125, - "step": 375, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9950189590454102 - }, - { - "episode": 6032, - "epoch": 0.10842290685552, - "loss/policy_avg": 0.43735095858573914, - "lr": 9.759713701431493e-06, - "objective/entropy": 161.14744567871094, - "objective/kl": 20.346540451049805, - "objective/non_score_reward": -2.034654140472412, - "objective/rlhf_reward": -6.657663944180369, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 8.951998710632324, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7470377683639526, - "step": 376, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000926971435547 - }, - { - "episode": 6048, - "epoch": 0.10871050077290866, - "loss/policy_avg": 0.25953274965286255, - "lr": 9.75907464212679e-06, - "objective/entropy": -127.31167602539062, - "objective/kl": 32.83821105957031, - "objective/non_score_reward": -3.283820867538452, - "objective/rlhf_reward": -13.135283589363098, - "objective/scores": 0.0, - "policy/approxkl_avg": 43.502960205078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7563947439193726, - "step": 377, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980974197387695 - }, - { - "episode": 6064, - "epoch": 0.1089980946902973, - "loss/policy_avg": 1.1847639083862305, - "lr": 9.758435582822087e-06, - "objective/entropy": 53.43251037597656, - "objective/kl": 30.13711929321289, - "objective/non_score_reward": -3.013711929321289, - "objective/rlhf_reward": -7.654847121238708, - "objective/scores": 1.1, - "policy/approxkl_avg": 24.648468017578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5256083607673645, - "step": 378, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0001864433288574 - }, - { - "episode": 6080, - "epoch": 0.10928568860768595, - "loss/policy_avg": 0.10543081164360046, - "lr": 9.757796523517384e-06, - "objective/entropy": 216.22293090820312, - "objective/kl": 33.44567108154297, - "objective/non_score_reward": -3.344566822052002, - "objective/rlhf_reward": -11.99966583499084, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 30.81055450439453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5716228485107422, - "step": 379, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968583583831787 - }, - { - "episode": 6096, - "epoch": 0.10957328252507459, - "loss/policy_avg": 0.3527596592903137, - "lr": 9.75715746421268e-06, - "objective/entropy": -127.59818267822266, - "objective/kl": 31.49237632751465, - "objective/non_score_reward": -3.149237632751465, - "objective/rlhf_reward": -12.596950769424438, - "objective/scores": 0.0, - "policy/approxkl_avg": 19.017166137695312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4384676218032837, - "step": 380, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995179176330566 - }, - { - "episode": 6112, - "epoch": 0.10986087644246324, - "loss/policy_avg": 0.9311287999153137, - "lr": 9.756518404907976e-06, - "objective/entropy": 117.0103530883789, - "objective/kl": 30.302433013916016, - "objective/non_score_reward": -3.030243158340454, - "objective/rlhf_reward": -10.459113364637481, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 65.951171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.41777661442756653, - "step": 381, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992172718048096 - }, - { - "episode": 6128, - "epoch": 0.11014847035985188, - "loss/policy_avg": 0.027314603328704834, - "lr": 9.755879345603273e-06, - "objective/entropy": 82.98536682128906, - "objective/kl": 41.457672119140625, - "objective/non_score_reward": -4.1457672119140625, - "objective/rlhf_reward": -15.241434386282592, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.37526273727417, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7102963924407959, - "step": 382, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000887632369995 - }, - { - "episode": 6144, - "epoch": 0.11043606427724054, - "loss/policy_avg": -0.5239760279655457, - "lr": 9.75524028629857e-06, - "objective/entropy": -19.319984436035156, - "objective/kl": 31.706575393676758, - "objective/non_score_reward": -3.1706576347351074, - "objective/rlhf_reward": -10.282630062103273, - "objective/scores": 0.6, - "policy/approxkl_avg": 33.74637222290039, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.521426796913147, - "step": 383, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0061190128326416 - }, - { - "episode": 6160, - "epoch": 0.11072365819462919, - "loss/policy_avg": 0.19491565227508545, - "lr": 9.754601226993867e-06, - "objective/entropy": 153.27801513671875, - "objective/kl": 30.898479461669922, - "objective/non_score_reward": -3.0898478031158447, - "objective/rlhf_reward": -10.84361919144028, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 24.972707748413086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6943016648292542, - "step": 384, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995365142822266 - }, - { - "episode": 6176, - "epoch": 0.11101125211201783, - "loss/policy_avg": 0.9045780897140503, - "lr": 9.753962167689162e-06, - "objective/entropy": 229.45260620117188, - "objective/kl": 45.034461975097656, - "objective/non_score_reward": -4.503446578979492, - "objective/rlhf_reward": -16.563187460513458, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.020683288574219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5668317079544067, - "step": 385, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000062942504883 - }, - { - "episode": 6192, - "epoch": 0.11129884602940648, - "loss/policy_avg": 0.33030185103416443, - "lr": 9.753323108384459e-06, - "objective/entropy": 153.65707397460938, - "objective/kl": 42.31884002685547, - "objective/non_score_reward": -4.231884002685547, - "objective/rlhf_reward": -15.371276705470635, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 176.17214965820312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6527254581451416, - "step": 386, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980921745300293 - }, - { - "episode": 6208, - "epoch": 0.11158643994679512, - "loss/policy_avg": 1.2582824230194092, - "lr": 9.752684049079756e-06, - "objective/entropy": 212.47308349609375, - "objective/kl": 41.99869918823242, - "objective/non_score_reward": -4.1998701095581055, - "objective/rlhf_reward": -14.852069209294257, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 34.943233489990234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.551336407661438, - "step": 387, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997947335243225 - }, - { - "episode": 6224, - "epoch": 0.11187403386418378, - "loss/policy_avg": 1.361016035079956, - "lr": 9.752044989775053e-06, - "objective/entropy": -335.09619140625, - "objective/kl": 30.397010803222656, - "objective/non_score_reward": -3.039701223373413, - "objective/rlhf_reward": -10.333976383480142, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 14.473678588867188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9271190166473389, - "step": 388, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9977775812149048 - }, - { - "episode": 6240, - "epoch": 0.11216162778157242, - "loss/policy_avg": 0.34025201201438904, - "lr": 9.751405930470348e-06, - "objective/entropy": 50.92825698852539, - "objective/kl": 39.54961013793945, - "objective/non_score_reward": -3.954960823059082, - "objective/rlhf_reward": -13.995015259059976, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.61404037475586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47977396845817566, - "step": 389, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9968550205230713 - }, - { - "episode": 6256, - "epoch": 0.11244922169896107, - "loss/policy_avg": 0.012692228890955448, - "lr": 9.750766871165645e-06, - "objective/entropy": -33.92766571044922, - "objective/kl": 31.518718719482422, - "objective/non_score_reward": -3.151872158050537, - "objective/rlhf_reward": -8.207488393783569, - "objective/scores": 1.1, - "policy/approxkl_avg": 84.33369445800781, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5067895650863647, - "step": 390, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9976071119308472 - }, - { - "episode": 6272, - "epoch": 0.11273681561634971, - "loss/policy_avg": 0.5984074473381042, - "lr": 9.750127811860941e-06, - "objective/entropy": -239.443359375, - "objective/kl": 31.10334014892578, - "objective/non_score_reward": -3.1103343963623047, - "objective/rlhf_reward": -10.041337525844575, - "objective/scores": 0.6, - "policy/approxkl_avg": 30.063674926757812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7201836705207825, - "step": 391, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000786304473877 - }, - { - "episode": 6288, - "epoch": 0.11302440953373837, - "loss/policy_avg": 0.7581092715263367, - "lr": 9.749488752556238e-06, - "objective/entropy": 85.20730590820312, - "objective/kl": 40.380855560302734, - "objective/non_score_reward": -4.038085460662842, - "objective/rlhf_reward": -14.701744298549041, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.875045776367188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5445913672447205, - "step": 392, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985957145690918 - }, - { - "episode": 6304, - "epoch": 0.113312003451127, - "loss/policy_avg": 1.7639085054397583, - "lr": 9.748849693251534e-06, - "objective/entropy": 124.08705139160156, - "objective/kl": 37.808753967285156, - "objective/non_score_reward": -3.7808759212493896, - "objective/rlhf_reward": -13.298674936565469, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.500898361206055, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4794216752052307, - "step": 393, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0014162063598633 - }, - { - "episode": 6320, - "epoch": 0.11359959736851566, - "loss/policy_avg": 0.012201100587844849, - "lr": 9.74821063394683e-06, - "objective/entropy": 200.1130828857422, - "objective/kl": 30.82569122314453, - "objective/non_score_reward": -3.082569122314453, - "objective/rlhf_reward": -7.930276489257812, - "objective/scores": 1.1, - "policy/approxkl_avg": 7.863556861877441, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5342352390289307, - "step": 394, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999612808227539 - }, - { - "episode": 6336, - "epoch": 0.1138871912859043, - "loss/policy_avg": 2.2059273719787598, - "lr": 9.747571574642127e-06, - "objective/entropy": -69.09872436523438, - "objective/kl": 40.18467330932617, - "objective/non_score_reward": -4.018467426300049, - "objective/rlhf_reward": -14.650037605960932, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 23.521875381469727, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5724920034408569, - "step": 395, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997222423553467 - }, - { - "episode": 6352, - "epoch": 0.11417478520329295, - "loss/policy_avg": 0.4041597843170166, - "lr": 9.746932515337424e-06, - "objective/entropy": -215.51731872558594, - "objective/kl": 27.624664306640625, - "objective/non_score_reward": -2.7624664306640625, - "objective/rlhf_reward": -11.049865961074829, - "objective/scores": 0.0, - "policy/approxkl_avg": 39.29521560668945, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6042770743370056, - "step": 396, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989702701568604 - }, - { - "episode": 6368, - "epoch": 0.11446237912068159, - "loss/policy_avg": 0.4775196313858032, - "lr": 9.746293456032721e-06, - "objective/entropy": 41.82182693481445, - "objective/kl": 36.31709289550781, - "objective/non_score_reward": -3.631709575653076, - "objective/rlhf_reward": -12.702010269435952, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 44.893619537353516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.37915006279945374, - "step": 397, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001455783843994 - }, - { - "episode": 6384, - "epoch": 0.11474997303807025, - "loss/policy_avg": 0.056639641523361206, - "lr": 9.745654396728016e-06, - "objective/entropy": -153.1647186279297, - "objective/kl": 32.43135452270508, - "objective/non_score_reward": -3.243135452270508, - "objective/rlhf_reward": -11.310682540357696, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 2.9430336952209473, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7443736791610718, - "step": 398, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000112533569336 - }, - { - "episode": 6400, - "epoch": 0.11503756695545889, - "loss/policy_avg": 0.045253098011016846, - "lr": 9.745015337423313e-06, - "objective/entropy": -105.165283203125, - "objective/kl": 39.292572021484375, - "objective/non_score_reward": -3.929257392883301, - "objective/rlhf_reward": -14.236076953823925, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 31.803394317626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41279107332229614, - "step": 399, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001852512359619 - }, - { - "episode": 6416, - "epoch": 0.11532516087284754, - "loss/policy_avg": 1.3353252410888672, - "lr": 9.74437627811861e-06, - "objective/entropy": 56.36566925048828, - "objective/kl": 36.79115676879883, - "objective/non_score_reward": -3.6791152954101562, - "objective/rlhf_reward": -12.983128563563028, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 52.49983215332031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5429282188415527, - "step": 400, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969511032104492 - }, - { - "episode": 6432, - "epoch": 0.11561275479023618, - "loss/policy_avg": 0.19346949458122253, - "lr": 9.743737218813907e-06, - "objective/entropy": 94.13348388671875, - "objective/kl": 33.9053840637207, - "objective/non_score_reward": -3.3905386924743652, - "objective/rlhf_reward": -12.111556748957977, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 10.45969009399414, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5399774312973022, - "step": 401, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9987437725067139 - }, - { - "episode": 6448, - "epoch": 0.11590034870762483, - "loss/policy_avg": 0.14212624728679657, - "lr": 9.743098159509204e-06, - "objective/entropy": -67.64189147949219, - "objective/kl": 23.04766273498535, - "objective/non_score_reward": -2.3047664165496826, - "objective/rlhf_reward": -7.662806778159693, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 17.699844360351562, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5857589244842529, - "step": 402, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000443696975708 - }, - { - "episode": 6464, - "epoch": 0.11618794262501349, - "loss/policy_avg": 2.842088222503662, - "lr": 9.7424591002045e-06, - "objective/entropy": 104.11701965332031, - "objective/kl": 37.51358413696289, - "objective/non_score_reward": -3.7513585090637207, - "objective/rlhf_reward": -12.081715498806211, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 18.802593231201172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8919892311096191, - "step": 403, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999840497970581 - }, - { - "episode": 6480, - "epoch": 0.11647553654240213, - "loss/policy_avg": 3.926600456237793, - "lr": 9.741820040899796e-06, - "objective/entropy": -60.85142517089844, - "objective/kl": 39.3304557800293, - "objective/non_score_reward": -3.9330458641052246, - "objective/rlhf_reward": -15.732182502746582, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.211052894592285, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6057410836219788, - "step": 404, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988353252410889 - }, - { - "episode": 6496, - "epoch": 0.11676313045979078, - "loss/policy_avg": 0.7047057747840881, - "lr": 9.741180981595093e-06, - "objective/entropy": 86.78068542480469, - "objective/kl": 32.590457916259766, - "objective/non_score_reward": -3.2590458393096924, - "objective/rlhf_reward": -10.913477124945196, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 73.14445495605469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5404595136642456, - "step": 405, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992141723632812 - }, - { - "episode": 6512, - "epoch": 0.11705072437717942, - "loss/policy_avg": 0.7668646574020386, - "lr": 9.74054192229039e-06, - "objective/entropy": 9.115959167480469, - "objective/kl": 35.6148796081543, - "objective/non_score_reward": -3.561488389968872, - "objective/rlhf_reward": -12.123247566000494, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.980825424194336, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.684908390045166, - "step": 406, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991612434387207 - }, - { - "episode": 6528, - "epoch": 0.11733831829456808, - "loss/policy_avg": 0.901952862739563, - "lr": 9.739902862985686e-06, - "objective/entropy": 47.42900848388672, - "objective/kl": 36.136173248291016, - "objective/non_score_reward": -3.613617420196533, - "objective/rlhf_reward": -12.72113610903422, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 29.850797653198242, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5216494202613831, - "step": 407, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996336698532104 - }, - { - "episode": 6544, - "epoch": 0.11762591221195672, - "loss/policy_avg": 0.4201366901397705, - "lr": 9.739263803680983e-06, - "objective/entropy": -11.0733642578125, - "objective/kl": 35.00093078613281, - "objective/non_score_reward": -3.5000932216644287, - "objective/rlhf_reward": -9.600372886657714, - "objective/scores": 1.1, - "policy/approxkl_avg": 32.18763732910156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5420930981636047, - "step": 408, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997605323791504 - }, - { - "episode": 6560, - "epoch": 0.11791350612934537, - "loss/policy_avg": 1.302764892578125, - "lr": 9.73862474437628e-06, - "objective/entropy": 168.5387420654297, - "objective/kl": 26.525001525878906, - "objective/non_score_reward": -2.6525001525878906, - "objective/rlhf_reward": -9.094229185374912, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 59.64923858642578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529288649559021, - "step": 409, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9986768960952759 - }, - { - "episode": 6576, - "epoch": 0.11820110004673401, - "loss/policy_avg": 1.0619229078292847, - "lr": 9.737985685071575e-06, - "objective/entropy": -54.82817459106445, - "objective/kl": 37.211219787597656, - "objective/non_score_reward": -3.7211220264434814, - "objective/rlhf_reward": -13.328228085246636, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 47.928985595703125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5946022272109985, - "step": 410, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001032590866089 - }, - { - "episode": 6592, - "epoch": 0.11848869396412266, - "loss/policy_avg": 0.4641076922416687, - "lr": 9.737346625766872e-06, - "objective/entropy": 80.71646881103516, - "objective/kl": 35.40373992919922, - "objective/non_score_reward": -3.5403738021850586, - "objective/rlhf_reward": -12.680542829449536, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 4.019253730773926, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5870873928070068, - "step": 411, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998197555541992 - }, - { - "episode": 6608, - "epoch": 0.1187762878815113, - "loss/policy_avg": 0.3565133213996887, - "lr": 9.736707566462167e-06, - "objective/entropy": 122.7892074584961, - "objective/kl": 39.498130798339844, - "objective/non_score_reward": -3.9498136043548584, - "objective/rlhf_reward": -13.67654818512586, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 63.53807830810547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.581372857093811, - "step": 412, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015203952789307 - }, - { - "episode": 6624, - "epoch": 0.11906388179889996, - "loss/policy_avg": 0.14506877958774567, - "lr": 9.736068507157464e-06, - "objective/entropy": 193.5592041015625, - "objective/kl": 30.521562576293945, - "objective/non_score_reward": -3.052156448364258, - "objective/rlhf_reward": -10.475291983286539, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 22.271638870239258, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7713235020637512, - "step": 413, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975152015686035 - }, - { - "episode": 6640, - "epoch": 0.1193514757162886, - "loss/policy_avg": 0.9468994736671448, - "lr": 9.735429447852761e-06, - "objective/entropy": 148.8424835205078, - "objective/kl": 37.5145378112793, - "objective/non_score_reward": -3.7514538764953613, - "objective/rlhf_reward": -15.005815267562866, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.498788833618164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.38916516304016113, - "step": 414, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000758171081543 - }, - { - "episode": 6656, - "epoch": 0.11963906963367725, - "loss/policy_avg": 0.7254658937454224, - "lr": 9.734790388548058e-06, - "objective/entropy": 56.421714782714844, - "objective/kl": 33.228389739990234, - "objective/non_score_reward": -3.32283878326416, - "objective/rlhf_reward": -11.46652638462455, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 17.776447296142578, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6019710302352905, - "step": 415, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001137256622314 - }, - { - "episode": 6672, - "epoch": 0.11992666355106589, - "loss/policy_avg": 0.12397602200508118, - "lr": 9.734151329243355e-06, - "objective/entropy": -148.2471466064453, - "objective/kl": 25.882095336914062, - "objective/non_score_reward": -2.588209629058838, - "objective/rlhf_reward": -8.974236705390316, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.8484023809432983, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4417728781700134, - "step": 416, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002113103866577 - }, - { - "episode": 6688, - "epoch": 0.12021425746845454, - "loss/policy_avg": -0.03540700674057007, - "lr": 9.73351226993865e-06, - "objective/entropy": -65.22505187988281, - "objective/kl": 25.781585693359375, - "objective/non_score_reward": -2.5781586170196533, - "objective/rlhf_reward": -10.312634468078613, - "objective/scores": 0.0, - "policy/approxkl_avg": 0.9484915733337402, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.44973552227020264, - "step": 417, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.003369092941284 - }, - { - "episode": 6704, - "epoch": 0.12050185138584318, - "loss/policy_avg": 2.237513303756714, - "lr": 9.732873210633947e-06, - "objective/entropy": 72.41790008544922, - "objective/kl": 41.708648681640625, - "objective/non_score_reward": -4.170865058898926, - "objective/rlhf_reward": -13.759741698147032, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 6.452242851257324, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.46763697266578674, - "step": 418, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999596357345581 - }, - { - "episode": 6720, - "epoch": 0.12078944530323184, - "loss/policy_avg": -0.033215656876564026, - "lr": 9.732234151329244e-06, - "objective/entropy": 116.18624877929688, - "objective/kl": 41.70143508911133, - "objective/non_score_reward": -4.170144081115723, - "objective/rlhf_reward": -15.018715386808502, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 0.950503408908844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5579153299331665, - "step": 419, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0016934871673584 - }, - { - "episode": 6736, - "epoch": 0.12107703922062048, - "loss/policy_avg": 0.5230793952941895, - "lr": 9.73159509202454e-06, - "objective/entropy": 87.67442321777344, - "objective/kl": 42.121944427490234, - "objective/non_score_reward": -4.212194442749023, - "objective/rlhf_reward": -15.523264799147768, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 44.811546325683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5887485146522522, - "step": 420, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986464977264404 - }, - { - "episode": 6752, - "epoch": 0.12136463313800913, - "loss/policy_avg": 0.09617140889167786, - "lr": 9.730956032719838e-06, - "objective/entropy": 197.31307983398438, - "objective/kl": 41.32299041748047, - "objective/non_score_reward": -4.132298469543457, - "objective/rlhf_reward": -15.129195547103883, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.07308292388916, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5032777786254883, - "step": 421, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003557205200195 - }, - { - "episode": 6768, - "epoch": 0.12165222705539779, - "loss/policy_avg": 0.0820683017373085, - "lr": 9.730316973415135e-06, - "objective/entropy": 90.92608642578125, - "objective/kl": 33.22870635986328, - "objective/non_score_reward": -3.3228707313537598, - "objective/rlhf_reward": -11.735222904887749, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.888459205627441, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6309401392936707, - "step": 422, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998487949371338 - }, - { - "episode": 6784, - "epoch": 0.12193982097278643, - "loss/policy_avg": 0.13335853815078735, - "lr": 9.72967791411043e-06, - "objective/entropy": 58.8111686706543, - "objective/kl": 17.325424194335938, - "objective/non_score_reward": -1.7325425148010254, - "objective/rlhf_reward": -5.196836725870767, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.40964412689209, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4101444184780121, - "step": 423, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999779462814331 - }, - { - "episode": 6800, - "epoch": 0.12222741489017508, - "loss/policy_avg": 1.1839892864227295, - "lr": 9.729038854805727e-06, - "objective/entropy": 278.55230712890625, - "objective/kl": 36.13326644897461, - "objective/non_score_reward": -3.6133267879486084, - "objective/rlhf_reward": -12.7199738184611, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 37.6474609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6956678628921509, - "step": 424, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999556303024292 - }, - { - "episode": 6816, - "epoch": 0.12251500880756372, - "loss/policy_avg": 0.5160382390022278, - "lr": 9.728399795501023e-06, - "objective/entropy": -4.561044692993164, - "objective/kl": 48.20618438720703, - "objective/non_score_reward": -4.820618152618408, - "objective/rlhf_reward": -17.801520469601513, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 29.267677307128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6663553714752197, - "step": 425, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9961378574371338 - }, - { - "episode": 6832, - "epoch": 0.12280260272495237, - "loss/policy_avg": -0.027832061052322388, - "lr": 9.72776073619632e-06, - "objective/entropy": -14.169868469238281, - "objective/kl": 28.816591262817383, - "objective/non_score_reward": -2.8816590309143066, - "objective/rlhf_reward": -10.126636123657228, - "objective/scores": 0.35, - "policy/approxkl_avg": 10.623421669006348, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42182457447052, - "step": 426, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0000364780426025 - }, - { - "episode": 6848, - "epoch": 0.12309019664234101, - "loss/policy_avg": 0.9478355050086975, - "lr": 9.727121676891617e-06, - "objective/entropy": -48.67333221435547, - "objective/kl": 22.937318801879883, - "objective/non_score_reward": -2.293731927871704, - "objective/rlhf_reward": -7.227516422944005, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.106391191482544, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6140183210372925, - "step": 427, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0030503273010254 - }, - { - "episode": 6864, - "epoch": 0.12337779055972967, - "loss/policy_avg": 0.6610305309295654, - "lr": 9.726482617586912e-06, - "objective/entropy": 80.99835968017578, - "objective/kl": 39.61425018310547, - "objective/non_score_reward": -3.961425304412842, - "objective/rlhf_reward": -14.520187649756593, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.697940349578857, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5119843482971191, - "step": 428, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000978946685791 - }, - { - "episode": 6880, - "epoch": 0.1236653844771183, - "loss/policy_avg": 0.11895343661308289, - "lr": 9.72584355828221e-06, - "objective/entropy": 162.822021484375, - "objective/kl": 44.34868621826172, - "objective/non_score_reward": -4.434868812561035, - "objective/rlhf_reward": -16.339474773406984, - "objective/scores": 0.35, - "policy/approxkl_avg": 4.151267051696777, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5969315767288208, - "step": 429, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984312057495117 - }, - { - "episode": 6896, - "epoch": 0.12395297839450696, - "loss/policy_avg": 0.5579686164855957, - "lr": 9.725204498977506e-06, - "objective/entropy": -17.16387367248535, - "objective/kl": 37.852745056152344, - "objective/non_score_reward": -3.7852747440338135, - "objective/rlhf_reward": -10.741099214553834, - "objective/scores": 1.1, - "policy/approxkl_avg": 41.654693603515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4921834468841553, - "step": 430, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981064796447754 - }, - { - "episode": 6912, - "epoch": 0.1242405723118956, - "loss/policy_avg": 0.15593896806240082, - "lr": 9.724565439672803e-06, - "objective/entropy": 149.734130859375, - "objective/kl": 25.60231590270996, - "objective/non_score_reward": -2.5602316856384277, - "objective/rlhf_reward": -8.416097994121621, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 30.770790100097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4205164313316345, - "step": 431, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001136541366577 - }, - { - "episode": 6928, - "epoch": 0.12452816622928425, - "loss/policy_avg": 0.7011826038360596, - "lr": 9.7239263803681e-06, - "objective/entropy": 55.692283630371094, - "objective/kl": 43.931175231933594, - "objective/non_score_reward": -4.393117904663086, - "objective/rlhf_reward": -15.83913828531901, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 20.95511245727539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3865165710449219, - "step": 432, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988900423049927 - }, - { - "episode": 6944, - "epoch": 0.1248157601466729, - "loss/policy_avg": 1.022209882736206, - "lr": 9.723287321063397e-06, - "objective/entropy": 83.84861755371094, - "objective/kl": 42.09056854248047, - "objective/non_score_reward": -4.209057331085205, - "objective/rlhf_reward": -15.011400456699441, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 58.270423889160156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6480612754821777, - "step": 433, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980558156967163 - }, - { - "episode": 6960, - "epoch": 0.12510335406406153, - "loss/policy_avg": 0.5657510757446289, - "lr": 9.722648261758692e-06, - "objective/entropy": 115.53985595703125, - "objective/kl": 33.222572326660156, - "objective/non_score_reward": -3.3222572803497314, - "objective/rlhf_reward": -11.684908661905844, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 22.127004623413086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8776997923851013, - "step": 434, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9980456829071045 - }, - { - "episode": 6976, - "epoch": 0.1253909479814502, - "loss/policy_avg": 0.861635684967041, - "lr": 9.722009202453989e-06, - "objective/entropy": 191.2237548828125, - "objective/kl": 33.726585388183594, - "objective/non_score_reward": -3.3726587295532227, - "objective/rlhf_reward": -11.757301584879556, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 66.25660705566406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5842768549919128, - "step": 435, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004849433898926 - }, - { - "episode": 6992, - "epoch": 0.12567854189883884, - "loss/policy_avg": 0.30258873105049133, - "lr": 9.721370143149284e-06, - "objective/entropy": 179.46835327148438, - "objective/kl": 39.91570281982422, - "objective/non_score_reward": -3.9915707111358643, - "objective/rlhf_reward": -14.362163100306113, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.522405624389648, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45521217584609985, - "step": 436, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9980738162994385 - }, - { - "episode": 7008, - "epoch": 0.12596613581622748, - "loss/policy_avg": 0.9346391558647156, - "lr": 9.720731083844581e-06, - "objective/entropy": 37.353126525878906, - "objective/kl": 43.99368667602539, - "objective/non_score_reward": -4.3993682861328125, - "objective/rlhf_reward": -16.11652124207771, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.334951877593994, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7622473835945129, - "step": 437, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0002927780151367 - }, - { - "episode": 7024, - "epoch": 0.12625372973361612, - "loss/policy_avg": 1.3644543886184692, - "lr": 9.720092024539878e-06, - "objective/entropy": -80.11536407470703, - "objective/kl": 26.775297164916992, - "objective/non_score_reward": -2.677529811859131, - "objective/rlhf_reward": -9.33151695975433, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 41.969295501708984, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.37151363492012024, - "step": 438, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997823238372803 - }, - { - "episode": 7040, - "epoch": 0.1265413236510048, - "loss/policy_avg": -0.03804589435458183, - "lr": 9.719452965235175e-06, - "objective/entropy": -68.57923889160156, - "objective/kl": 41.42705535888672, - "objective/non_score_reward": -4.14270544052124, - "objective/rlhf_reward": -15.170821285247804, - "objective/scores": 0.35, - "policy/approxkl_avg": 0.610215425491333, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.3907659649848938, - "step": 439, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002786636352539 - }, - { - "episode": 7056, - "epoch": 0.12682891756839343, - "loss/policy_avg": 0.9513897895812988, - "lr": 9.718813905930472e-06, - "objective/entropy": 176.7696533203125, - "objective/kl": 32.5645751953125, - "objective/non_score_reward": -3.2564573287963867, - "objective/rlhf_reward": -11.666579568122309, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 84.38311004638672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.43554389476776123, - "step": 440, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9961769580841064 - }, - { - "episode": 7072, - "epoch": 0.12711651148578207, - "loss/policy_avg": 1.6144888401031494, - "lr": 9.718174846625767e-06, - "objective/entropy": -14.703704833984375, - "objective/kl": 21.40297508239746, - "objective/non_score_reward": -2.1402974128723145, - "objective/rlhf_reward": -6.1611901283264165, - "objective/scores": 0.6, - "policy/approxkl_avg": 90.28463745117188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6536741852760315, - "step": 441, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973065853118896 - }, - { - "episode": 7088, - "epoch": 0.12740410540317074, - "loss/policy_avg": -0.564086377620697, - "lr": 9.717535787321064e-06, - "objective/entropy": -92.54092407226562, - "objective/kl": 27.47213363647461, - "objective/non_score_reward": -2.747213363647461, - "objective/rlhf_reward": -9.432593672481135, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 24.76102066040039, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4654249846935272, - "step": 442, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999948263168335 - }, - { - "episode": 7104, - "epoch": 0.12769169932055938, - "loss/policy_avg": 0.7788177728652954, - "lr": 9.71689672801636e-06, - "objective/entropy": -28.373756408691406, - "objective/kl": 35.91747283935547, - "objective/non_score_reward": -3.591747283935547, - "objective/rlhf_reward": -13.025353243857055, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 27.097518920898438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.38184916973114014, - "step": 443, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997880220413208 - }, - { - "episode": 7120, - "epoch": 0.12797929323794802, - "loss/policy_avg": 0.3843851685523987, - "lr": 9.716257668711657e-06, - "objective/entropy": 140.32058715820312, - "objective/kl": 37.66426467895508, - "objective/non_score_reward": -3.7664265632629395, - "objective/rlhf_reward": -13.332372681299844, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 19.241600036621094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.349905788898468, - "step": 444, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0018460750579834 - }, - { - "episode": 7136, - "epoch": 0.12826688715533666, - "loss/policy_avg": 0.540947437286377, - "lr": 9.715618609406954e-06, - "objective/entropy": 148.06629943847656, - "objective/kl": 42.55817413330078, - "objective/non_score_reward": -4.25581693649292, - "objective/rlhf_reward": -15.66401835653631, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 75.46281433105469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.541397213935852, - "step": 445, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983032941818237 - }, - { - "episode": 7152, - "epoch": 0.12855448107272532, - "loss/policy_avg": 1.184004306793213, - "lr": 9.714979550102251e-06, - "objective/entropy": 84.38250732421875, - "objective/kl": 33.90479278564453, - "objective/non_score_reward": -3.390479564666748, - "objective/rlhf_reward": -12.220281770735411, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 51.5472412109375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5456966161727905, - "step": 446, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997706413269043 - }, - { - "episode": 7168, - "epoch": 0.12884207499011396, - "loss/policy_avg": 1.1816997528076172, - "lr": 9.714340490797546e-06, - "objective/entropy": -57.552371978759766, - "objective/kl": 30.747276306152344, - "objective/non_score_reward": -3.074728012084961, - "objective/rlhf_reward": -7.898911571502686, - "objective/scores": 1.1, - "policy/approxkl_avg": 35.125282287597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.30223560333251953, - "step": 447, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9982578754425049 - }, - { - "episode": 7184, - "epoch": 0.1291296689075026, - "loss/policy_avg": 0.3517414927482605, - "lr": 9.713701431492843e-06, - "objective/entropy": 172.75254821777344, - "objective/kl": 32.79669189453125, - "objective/non_score_reward": -3.2796695232391357, - "objective/rlhf_reward": -11.456818585813629, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 27.51565170288086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4375608265399933, - "step": 448, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987891912460327 - }, - { - "episode": 7200, - "epoch": 0.12941726282489124, - "loss/policy_avg": 0.3732157051563263, - "lr": 9.71306237218814e-06, - "objective/entropy": 71.94863891601562, - "objective/kl": 37.338172912597656, - "objective/non_score_reward": -3.7338175773620605, - "objective/rlhf_reward": -12.535269832611085, - "objective/scores": 0.6, - "policy/approxkl_avg": 85.17916870117188, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.43261417746543884, - "step": 449, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99893057346344 - }, - { - "episode": 7216, - "epoch": 0.1297048567422799, - "loss/policy_avg": 1.4543174505233765, - "lr": 9.712423312883437e-06, - "objective/entropy": 258.2192687988281, - "objective/kl": 49.02899169921875, - "objective/non_score_reward": -4.902898788452148, - "objective/rlhf_reward": -18.286083135634584, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 21.199018478393555, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6569823026657104, - "step": 450, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996167778968811 - }, - { - "episode": 7232, - "epoch": 0.12999245065966855, - "loss/policy_avg": 0.49340057373046875, - "lr": 9.711784253578734e-06, - "objective/entropy": -43.6832160949707, - "objective/kl": 33.31085968017578, - "objective/non_score_reward": -3.3310861587524414, - "objective/rlhf_reward": -11.87374721011673, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 28.588882446289062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6205878257751465, - "step": 451, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999979019165039 - }, - { - "episode": 7248, - "epoch": 0.1302800445770572, - "loss/policy_avg": 0.6000991463661194, - "lr": 9.711145194274029e-06, - "objective/entropy": 90.76286315917969, - "objective/kl": 45.31011962890625, - "objective/non_score_reward": -4.531011581420898, - "objective/rlhf_reward": -16.745444872466425, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 113.64555358886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5617672204971313, - "step": 452, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989123344421387 - }, - { - "episode": 7264, - "epoch": 0.13056763849444583, - "loss/policy_avg": 1.042801022529602, - "lr": 9.710506134969326e-06, - "objective/entropy": 189.15316772460938, - "objective/kl": 36.73876953125, - "objective/non_score_reward": -3.673877477645874, - "objective/rlhf_reward": -13.295509910583498, - "objective/scores": 0.35, - "policy/approxkl_avg": 21.024385452270508, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4428805708885193, - "step": 453, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9950382709503174 - }, - { - "episode": 7280, - "epoch": 0.1308552324118345, - "loss/policy_avg": 0.6862419843673706, - "lr": 9.709867075664623e-06, - "objective/entropy": -38.022178649902344, - "objective/kl": 48.085838317871094, - "objective/non_score_reward": -4.808583736419678, - "objective/rlhf_reward": -17.71856340149277, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 89.67850494384766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5650781393051147, - "step": 454, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975779056549072 - }, - { - "episode": 7296, - "epoch": 0.13114282632922314, - "loss/policy_avg": -0.21822161972522736, - "lr": 9.70922801635992e-06, - "objective/entropy": 24.07161521911621, - "objective/kl": 39.552284240722656, - "objective/non_score_reward": -3.955228328704834, - "objective/rlhf_reward": -14.339960935528637, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 4.527206897735596, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.32756006717681885, - "step": 455, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0025227069854736 - }, - { - "episode": 7312, - "epoch": 0.13143042024661178, - "loss/policy_avg": 0.050335630774497986, - "lr": 9.708588957055215e-06, - "objective/entropy": 47.709957122802734, - "objective/kl": 34.94654083251953, - "objective/non_score_reward": -3.4946541786193848, - "objective/rlhf_reward": -11.054897938610289, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 20.525684356689453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.40393322706222534, - "step": 456, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999319314956665 - }, - { - "episode": 7328, - "epoch": 0.13171801416400042, - "loss/policy_avg": -0.2107769250869751, - "lr": 9.707949897750512e-06, - "objective/entropy": 4.4464111328125, - "objective/kl": 30.348583221435547, - "objective/non_score_reward": -3.034858226776123, - "objective/rlhf_reward": -10.760830977050167, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.349994659423828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4865412712097168, - "step": 457, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.025125503540039 - }, - { - "episode": 7344, - "epoch": 0.1320056080813891, - "loss/policy_avg": 0.9970263242721558, - "lr": 9.707310838445809e-06, - "objective/entropy": 197.11566162109375, - "objective/kl": 38.80963897705078, - "objective/non_score_reward": -3.8809640407562256, - "objective/rlhf_reward": -14.008084261211092, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 64.41117858886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.47768014669418335, - "step": 458, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9979461431503296 - }, - { - "episode": 7360, - "epoch": 0.13229320199877773, - "loss/policy_avg": 0.19499164819717407, - "lr": 9.706671779141105e-06, - "objective/entropy": 109.55068969726562, - "objective/kl": 34.07399368286133, - "objective/non_score_reward": -3.4073991775512695, - "objective/rlhf_reward": -13.629597425460815, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.495365142822266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.565830647945404, - "step": 459, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999122977256775 - }, - { - "episode": 7376, - "epoch": 0.13258079591616637, - "loss/policy_avg": 0.5532440543174744, - "lr": 9.7060327198364e-06, - "objective/entropy": 83.35699462890625, - "objective/kl": 32.4083251953125, - "objective/non_score_reward": -3.240832567214966, - "objective/rlhf_reward": -11.359210524622519, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.45071268081665, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4963003993034363, - "step": 460, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.011878490447998 - }, - { - "episode": 7392, - "epoch": 0.13286838983355503, - "loss/policy_avg": 0.552447497844696, - "lr": 9.705393660531698e-06, - "objective/entropy": 160.72750854492188, - "objective/kl": 47.16038131713867, - "objective/non_score_reward": -4.716038227081299, - "objective/rlhf_reward": -17.440320809085932, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 79.7755126953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3849433958530426, - "step": 461, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9991737604141235 - }, - { - "episode": 7408, - "epoch": 0.13315598375094367, - "loss/policy_avg": 1.8305895328521729, - "lr": 9.704754601226994e-06, - "objective/entropy": 148.47381591796875, - "objective/kl": 32.803104400634766, - "objective/non_score_reward": -3.280310869216919, - "objective/rlhf_reward": -11.779607584982543, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.440328121185303, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.34545716643333435, - "step": 462, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0012433528900146 - }, - { - "episode": 7424, - "epoch": 0.1334435776683323, - "loss/policy_avg": -0.21606217324733734, - "lr": 9.704115541922291e-06, - "objective/entropy": -211.98297119140625, - "objective/kl": 22.90569305419922, - "objective/non_score_reward": -2.2905690670013428, - "objective/rlhf_reward": -7.337447400363992, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.513453483581543, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6393148899078369, - "step": 463, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0019214153289795 - }, - { - "episode": 7440, - "epoch": 0.13373117158572095, - "loss/policy_avg": 1.7707817554473877, - "lr": 9.703476482617588e-06, - "objective/entropy": -42.62212371826172, - "objective/kl": 38.86042022705078, - "objective/non_score_reward": -3.8860418796539307, - "objective/rlhf_reward": -13.987908928599907, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.585241317749023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5668050050735474, - "step": 464, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999806880950928 - }, - { - "episode": 7456, - "epoch": 0.13401876550310962, - "loss/policy_avg": 0.5606961250305176, - "lr": 9.702837423312883e-06, - "objective/entropy": 46.48912811279297, - "objective/kl": 41.47301483154297, - "objective/non_score_reward": -4.14730167388916, - "objective/rlhf_reward": -14.18920729160309, - "objective/scores": 0.6, - "policy/approxkl_avg": 15.323009490966797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4189653694629669, - "step": 465, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.002035617828369 - }, - { - "episode": 7472, - "epoch": 0.13430635942049826, - "loss/policy_avg": 0.3116866946220398, - "lr": 9.70219836400818e-06, - "objective/entropy": 202.82122802734375, - "objective/kl": 30.228025436401367, - "objective/non_score_reward": -3.0228028297424316, - "objective/rlhf_reward": -10.266382093700479, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.352916955947876, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5167281627655029, - "step": 466, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003514289855957 - }, - { - "episode": 7488, - "epoch": 0.1345939533378869, - "loss/policy_avg": 0.9980499148368835, - "lr": 9.701559304703477e-06, - "objective/entropy": 127.01738739013672, - "objective/kl": 39.83085632324219, - "objective/non_score_reward": -3.9830856323242188, - "objective/rlhf_reward": -14.45138919633186, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 25.166885375976562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.736939549446106, - "step": 467, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994404315948486 - }, - { - "episode": 7504, - "epoch": 0.13488154725527554, - "loss/policy_avg": 0.21544580161571503, - "lr": 9.700920245398774e-06, - "objective/entropy": 233.09375, - "objective/kl": 32.72058868408203, - "objective/non_score_reward": -3.272059202194214, - "objective/rlhf_reward": -11.72898670408575, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 78.07327270507812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62492835521698, - "step": 468, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9992430210113525 - }, - { - "episode": 7520, - "epoch": 0.1351691411726642, - "loss/policy_avg": 0.4316645860671997, - "lr": 9.700281186094071e-06, - "objective/entropy": -37.32112121582031, - "objective/kl": 29.643779754638672, - "objective/non_score_reward": -2.9643778800964355, - "objective/rlhf_reward": -10.406913261027679, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 80.32553100585938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8007365465164185, - "step": 469, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981448650360107 - }, - { - "episode": 7536, - "epoch": 0.13545673509005285, - "loss/policy_avg": -0.22065140306949615, - "lr": 9.699642126789368e-06, - "objective/entropy": -288.51220703125, - "objective/kl": 31.09638023376465, - "objective/non_score_reward": -3.109638214111328, - "objective/rlhf_reward": -10.882293074336602, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 15.306625366210938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6064466238021851, - "step": 470, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000288248062134 - }, - { - "episode": 7552, - "epoch": 0.1357443290074415, - "loss/policy_avg": 0.7062017917633057, - "lr": 9.699003067484663e-06, - "objective/entropy": -185.96678161621094, - "objective/kl": 38.07769012451172, - "objective/non_score_reward": -3.8077688217163086, - "objective/rlhf_reward": -12.307355795742247, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 21.195262908935547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6327893137931824, - "step": 471, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9975873231887817 - }, - { - "episode": 7568, - "epoch": 0.13603192292483013, - "loss/policy_avg": 0.21052365005016327, - "lr": 9.69836400817996e-06, - "objective/entropy": -114.1561050415039, - "objective/kl": 38.60865020751953, - "objective/non_score_reward": -3.8608651161193848, - "objective/rlhf_reward": -15.443459749221802, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.808056831359863, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587588906288147, - "step": 472, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999895453453064 - }, - { - "episode": 7584, - "epoch": 0.1363195168422188, - "loss/policy_avg": 0.9177588224411011, - "lr": 9.697724948875257e-06, - "objective/entropy": 91.9778823852539, - "objective/kl": 49.228004455566406, - "objective/non_score_reward": -4.9228010177612305, - "objective/rlhf_reward": -18.34956817915979, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 252.69491577148438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5273959636688232, - "step": 473, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998390555381775 - }, - { - "episode": 7600, - "epoch": 0.13660711075960744, - "loss/policy_avg": 0.2135259062051773, - "lr": 9.697085889570554e-06, - "objective/entropy": -91.76605224609375, - "objective/kl": 20.413612365722656, - "objective/non_score_reward": -2.0413613319396973, - "objective/rlhf_reward": -6.741612751682368, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.6130738258361816, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4268151521682739, - "step": 474, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9995267391204834 - }, - { - "episode": 7616, - "epoch": 0.13689470467699608, - "loss/policy_avg": 0.7761150598526001, - "lr": 9.69644683026585e-06, - "objective/entropy": 25.679851531982422, - "objective/kl": 40.76634979248047, - "objective/non_score_reward": -4.076634883880615, - "objective/rlhf_reward": -14.927937605468134, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.822543144226074, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7041196823120117, - "step": 475, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984660148620605 - }, - { - "episode": 7632, - "epoch": 0.13718229859438472, - "loss/policy_avg": 1.497192144393921, - "lr": 9.695807770961146e-06, - "objective/entropy": -73.21554565429688, - "objective/kl": 31.698223114013672, - "objective/non_score_reward": -3.1698226928710938, - "objective/rlhf_reward": -10.731878827290473, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.129430770874023, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6742293834686279, - "step": 476, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998528242111206 - }, - { - "episode": 7648, - "epoch": 0.13746989251177338, - "loss/policy_avg": 0.7623737454414368, - "lr": 9.695168711656443e-06, - "objective/entropy": -212.29415893554688, - "objective/kl": 26.89659881591797, - "objective/non_score_reward": -2.689659833908081, - "objective/rlhf_reward": -9.433126482993288, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 40.945072174072266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6739322543144226, - "step": 477, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999362587928772 - }, - { - "episode": 7664, - "epoch": 0.13775748642916202, - "loss/policy_avg": 0.8399478793144226, - "lr": 9.694529652351738e-06, - "objective/entropy": -28.784271240234375, - "objective/kl": 46.6888542175293, - "objective/non_score_reward": -4.668885707855225, - "objective/rlhf_reward": -17.31629320356695, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 33.967262268066406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7294750213623047, - "step": 478, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998927116394043 - }, - { - "episode": 7680, - "epoch": 0.13804508034655066, - "loss/policy_avg": 0.48977339267730713, - "lr": 9.693890593047035e-06, - "objective/entropy": -103.34806823730469, - "objective/kl": 35.017757415771484, - "objective/non_score_reward": -3.5017752647399902, - "objective/rlhf_reward": -12.583269674976435, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 104.5495376586914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6413424015045166, - "step": 479, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9937057495117188 - }, - { - "episode": 7696, - "epoch": 0.13833267426393933, - "loss/policy_avg": 0.20745977759361267, - "lr": 9.693251533742331e-06, - "objective/entropy": 57.130958557128906, - "objective/kl": 41.31460189819336, - "objective/non_score_reward": -4.131460189819336, - "objective/rlhf_reward": -15.010069215091402, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 30.427654266357422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5963334441184998, - "step": 480, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0048885345458984 - }, - { - "episode": 7712, - "epoch": 0.13862026818132797, - "loss/policy_avg": 0.15290355682373047, - "lr": 9.692612474437628e-06, - "objective/entropy": 106.57427978515625, - "objective/kl": 41.791648864746094, - "objective/non_score_reward": -4.179165363311768, - "objective/rlhf_reward": -13.792941962124083, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 30.64380645751953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521045446395874, - "step": 481, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998927354812622 - }, - { - "episode": 7728, - "epoch": 0.1389078620987166, - "loss/policy_avg": 0.7474868893623352, - "lr": 9.691973415132925e-06, - "objective/entropy": 228.7914581298828, - "objective/kl": 35.901405334472656, - "objective/non_score_reward": -3.5901405811309814, - "objective/rlhf_reward": -11.960562801361085, - "objective/scores": 0.6, - "policy/approxkl_avg": 88.05641174316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5807632207870483, - "step": 482, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998561143875122 - }, - { - "episode": 7744, - "epoch": 0.13919545601610525, - "loss/policy_avg": 1.6704270839691162, - "lr": 9.691334355828222e-06, - "objective/entropy": -124.35450744628906, - "objective/kl": 36.7768440246582, - "objective/non_score_reward": -3.6776845455169678, - "objective/rlhf_reward": -13.286906321247187, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 20.388381958007812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6395858526229858, - "step": 483, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9977163076400757 - }, - { - "episode": 7760, - "epoch": 0.13948304993349392, - "loss/policy_avg": 0.41683727502822876, - "lr": 9.690695296523517e-06, - "objective/entropy": 82.72738647460938, - "objective/kl": 38.18916702270508, - "objective/non_score_reward": -3.8189167976379395, - "objective/rlhf_reward": -13.851835568149653, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 7.188452243804932, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5991804599761963, - "step": 484, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0014636516571045 - }, - { - "episode": 7776, - "epoch": 0.13977064385088256, - "loss/policy_avg": 0.9677872657775879, - "lr": 9.690056237218814e-06, - "objective/entropy": 33.29289627075195, - "objective/kl": 28.069137573242188, - "objective/non_score_reward": -2.8069138526916504, - "objective/rlhf_reward": -9.868405782912655, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.909322738647461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3673190474510193, - "step": 485, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988304376602173 - }, - { - "episode": 7792, - "epoch": 0.1400582377682712, - "loss/policy_avg": 1.4412565231323242, - "lr": 9.689417177914111e-06, - "objective/entropy": 143.29071044921875, - "objective/kl": 37.918006896972656, - "objective/non_score_reward": -3.7918009757995605, - "objective/rlhf_reward": -13.71660516700302, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 25.657358169555664, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4064646065235138, - "step": 486, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968105554580688 - }, - { - "episode": 7808, - "epoch": 0.14034583168565984, - "loss/policy_avg": 0.5567857027053833, - "lr": 9.688778118609408e-06, - "objective/entropy": 108.03604125976562, - "objective/kl": 25.368505477905273, - "objective/non_score_reward": -2.5368504524230957, - "objective/rlhf_reward": -8.485543017805206, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 7.421592712402344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5672407746315002, - "step": 487, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002861499786377 - }, - { - "episode": 7824, - "epoch": 0.1406334256030485, - "loss/policy_avg": 0.1908845454454422, - "lr": 9.688139059304705e-06, - "objective/entropy": 155.91831970214844, - "objective/kl": 27.6815128326416, - "objective/non_score_reward": -2.7681517601013184, - "objective/rlhf_reward": -6.672606325149536, - "objective/scores": 1.1, - "policy/approxkl_avg": 10.237553596496582, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4108530282974243, - "step": 488, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9994752407073975 - }, - { - "episode": 7840, - "epoch": 0.14092101952043715, - "loss/policy_avg": 0.9416247606277466, - "lr": 9.6875e-06, - "objective/entropy": -103.95333862304688, - "objective/kl": 41.44330978393555, - "objective/non_score_reward": -4.144330978393555, - "objective/rlhf_reward": -14.177324867248537, - "objective/scores": 0.6, - "policy/approxkl_avg": 14.313966751098633, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.464949369430542, - "step": 489, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988256692886353 - }, - { - "episode": 7856, - "epoch": 0.1412086134378258, - "loss/policy_avg": 2.2338528633117676, - "lr": 9.686860940695297e-06, - "objective/entropy": 47.52754211425781, - "objective/kl": 42.061561584472656, - "objective/non_score_reward": -4.206155776977539, - "objective/rlhf_reward": -15.49911001685254, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 55.551963806152344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7129836082458496, - "step": 490, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9957895278930664 - }, - { - "episode": 7872, - "epoch": 0.14149620735521443, - "loss/policy_avg": 0.20792043209075928, - "lr": 9.686221881390594e-06, - "objective/entropy": -1.685638427734375, - "objective/kl": 38.568145751953125, - "objective/non_score_reward": -3.8568148612976074, - "objective/rlhf_reward": -14.10174730780713, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.467138290405273, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4478898048400879, - "step": 491, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003159046173096 - }, - { - "episode": 7888, - "epoch": 0.1417838012726031, - "loss/policy_avg": -0.18662777543067932, - "lr": 9.68558282208589e-06, - "objective/entropy": -26.272117614746094, - "objective/kl": 42.39691925048828, - "objective/non_score_reward": -4.239691734313965, - "objective/rlhf_reward": -15.633254084616823, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 55.918922424316406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5446948409080505, - "step": 492, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999772548675537 - }, - { - "episode": 7904, - "epoch": 0.14207139518999173, - "loss/policy_avg": 0.31151118874549866, - "lr": 9.684943762781188e-06, - "objective/entropy": 33.51483154296875, - "objective/kl": 41.72319030761719, - "objective/non_score_reward": -4.172318935394287, - "objective/rlhf_reward": -15.265443642337885, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 126.68960571289062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.847740650177002, - "step": 493, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983421564102173 - }, - { - "episode": 7920, - "epoch": 0.14235898910738037, - "loss/policy_avg": 0.7131699323654175, - "lr": 9.684304703476484e-06, - "objective/entropy": -26.66382598876953, - "objective/kl": 45.098487854003906, - "objective/non_score_reward": -4.5098490715026855, - "objective/rlhf_reward": -16.58879766902481, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 113.07894897460938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4858088493347168, - "step": 494, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995558261871338 - }, - { - "episode": 7936, - "epoch": 0.142646583024769, - "loss/policy_avg": 0.7710833549499512, - "lr": 9.68366564417178e-06, - "objective/entropy": 83.98237609863281, - "objective/kl": 33.111812591552734, - "objective/non_score_reward": -3.3111815452575684, - "objective/rlhf_reward": -11.763773563320994, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 51.01200866699219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5449756979942322, - "step": 495, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9964442253112793 - }, - { - "episode": 7952, - "epoch": 0.14293417694215768, - "loss/policy_avg": 0.6315375566482544, - "lr": 9.683026584867076e-06, - "objective/entropy": -325.8221435546875, - "objective/kl": 24.298229217529297, - "objective/non_score_reward": -2.4298229217529297, - "objective/rlhf_reward": -8.360041939948482, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.398147702217102, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5080133676528931, - "step": 496, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000394344329834 - }, - { - "episode": 7968, - "epoch": 0.14322177085954632, - "loss/policy_avg": 0.2566729485988617, - "lr": 9.682387525562373e-06, - "objective/entropy": -97.19512939453125, - "objective/kl": 27.388530731201172, - "objective/non_score_reward": -2.7388532161712646, - "objective/rlhf_reward": -9.596163117621822, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 83.0306167602539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6368144154548645, - "step": 497, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983546733856201 - }, - { - "episode": 7984, - "epoch": 0.14350936477693496, - "loss/policy_avg": 2.3038244247436523, - "lr": 9.68174846625767e-06, - "objective/entropy": -312.97418212890625, - "objective/kl": 39.9110221862793, - "objective/non_score_reward": -3.9911022186279297, - "objective/rlhf_reward": -15.96440851688385, - "objective/scores": 0.0, - "policy/approxkl_avg": 229.93643188476562, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5881428718566895, - "step": 498, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984025955200195 - }, - { - "episode": 8000, - "epoch": 0.14379695869432363, - "loss/policy_avg": 0.4790765643119812, - "lr": 9.681109406952967e-06, - "objective/entropy": 102.07373809814453, - "objective/kl": 39.629451751708984, - "objective/non_score_reward": -3.9629452228546143, - "objective/rlhf_reward": -14.189921622694122, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 38.121917724609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5411556959152222, - "step": 499, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9958930015563965 - } - ], - "logging_steps": 500, - "max_steps": 7824, - "num_input_tokens_seen": 0, - "num_train_epochs": 9.0, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": true, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0, - "train_batch_size": null, - "trial_name": null, - "trial_params": null -}