{ "best_metric": null, "best_model_checkpoint": null, "episode": 1000, "epoch": 0.18779342723004694, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 1, "epoch": 0.00018779342723004695, "eps": 0, "loss/policy_avg": -2.6540936232777312e-05, "loss/value_avg": 14.486083984375, "lr": 3e-06, "objective/entropy": 90.44258117675781, "objective/kl": 0.0024810805916786194, "objective/non_score_reward": -0.00012405402958393097, "objective/rlhf_reward": -11.0136137008667, "objective/scores": -11.013489723205566, "policy/approxkl_avg": 7.648613120636583e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.532650351524353, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999532103538513, "val/ratio_var": NaN }, { "episode": 2, "epoch": 0.0003755868544600939, "eps": 0, "loss/policy_avg": 2.158812822017353e-05, "loss/value_avg": 11.799023628234863, "lr": 2.9970000000000003e-06, "objective/entropy": 56.35115051269531, "objective/kl": -0.8070715665817261, "objective/non_score_reward": 0.040353573858737946, "objective/rlhf_reward": -9.768648147583008, "objective/scores": -9.809001922607422, "policy/approxkl_avg": 7.080922159730108e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2128353118896484, "step": 2, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000128746032715, "val/ratio_var": NaN }, { "episode": 3, "epoch": 0.0005633802816901409, "eps": 0, "loss/policy_avg": 9.010423127620015e-06, "loss/value_avg": 11.766682624816895, "lr": 2.994e-06, "objective/entropy": 108.8889389038086, "objective/kl": -0.5920640230178833, "objective/non_score_reward": 0.029603198170661926, "objective/rlhf_reward": -10.216741561889648, "objective/scores": -10.246344566345215, "policy/approxkl_avg": 1.3102368257023045e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7693861722946167, "step": 3, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998500347137451, "val/ratio_var": NaN }, { "episode": 4, "epoch": 0.0007511737089201878, "eps": 0, "loss/policy_avg": -5.56909799342975e-06, "loss/value_avg": 13.272283554077148, "lr": 2.9910000000000002e-06, "objective/entropy": 84.5605697631836, "objective/kl": 1.3825104236602783, "objective/non_score_reward": -0.06912551820278168, "objective/rlhf_reward": -11.116378784179688, "objective/scores": -11.047253608703613, "policy/approxkl_avg": 7.429797932445581e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8778421878814697, "step": 4, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999828934669495, "val/ratio_var": NaN }, { "episode": 5, "epoch": 0.0009389671361502347, "eps": 0, "loss/policy_avg": -3.0791983590461314e-05, "loss/value_avg": 12.206611633300781, "lr": 2.988e-06, "objective/entropy": 76.26399230957031, "objective/kl": -0.5283395648002625, "objective/non_score_reward": 0.026416979730129242, "objective/rlhf_reward": -11.44277572631836, "objective/scores": -11.469192504882812, "policy/approxkl_avg": 4.5821234806453504e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6988295316696167, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000548362731934, "val/ratio_var": NaN }, { "episode": 6, "epoch": 0.0011267605633802818, "eps": 0, "loss/policy_avg": -1.731908560032025e-05, "loss/value_avg": 12.190022468566895, "lr": 2.9850000000000002e-06, "objective/entropy": 86.91709899902344, "objective/kl": 1.706704020500183, "objective/non_score_reward": -0.08533520251512527, "objective/rlhf_reward": -11.549877166748047, "objective/scores": -11.464542388916016, "policy/approxkl_avg": 6.806941854620163e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6084100008010864, "step": 6, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000361204147339, "val/ratio_var": NaN }, { "episode": 7, "epoch": 0.0013145539906103286, "eps": 0, "loss/policy_avg": 4.083255407749675e-05, "loss/value_avg": 9.225897789001465, "lr": 2.982e-06, "objective/entropy": 56.397979736328125, "objective/kl": 1.5345274209976196, "objective/non_score_reward": -0.07672636210918427, "objective/rlhf_reward": -10.609889030456543, "objective/scores": -10.533163070678711, "policy/approxkl_avg": 6.17609288156018e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2468584775924683, "step": 7, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000213384628296, "val/ratio_var": NaN }, { "episode": 8, "epoch": 0.0015023474178403756, "eps": 0, "loss/policy_avg": -6.891646626172587e-05, "loss/value_avg": 8.974214553833008, "lr": 2.979e-06, "objective/entropy": 75.94960021972656, "objective/kl": 2.3212406635284424, "objective/non_score_reward": -0.11606204509735107, "objective/rlhf_reward": -10.558777809143066, "objective/scores": -10.442715644836426, "policy/approxkl_avg": 7.62259944053767e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6314882040023804, "step": 8, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999828338623047, "val/ratio_var": NaN }, { "episode": 9, "epoch": 0.0016901408450704226, "eps": 0, "loss/policy_avg": 1.510584115749225e-05, "loss/value_avg": 7.217881679534912, "lr": 2.976e-06, "objective/entropy": 90.28285217285156, "objective/kl": -0.28180134296417236, "objective/non_score_reward": 0.014090072363615036, "objective/rlhf_reward": -10.327535629272461, "objective/scores": -10.341626167297363, "policy/approxkl_avg": 6.991678702661375e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7381937503814697, "step": 9, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999794363975525, "val/ratio_var": NaN }, { "episode": 10, "epoch": 0.0018779342723004694, "eps": 0, "loss/policy_avg": -4.556943895295262e-05, "loss/value_avg": 10.557513236999512, "lr": 2.973e-06, "objective/entropy": 102.24281311035156, "objective/kl": 3.242715358734131, "objective/non_score_reward": -0.1621357798576355, "objective/rlhf_reward": -11.895389556884766, "objective/scores": -11.733253479003906, "policy/approxkl_avg": 9.427915159676559e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.982313632965088, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000002145767212, "val/ratio_var": NaN }, { "episode": 11, "epoch": 0.0020657276995305163, "eps": 0, "loss/policy_avg": 0.00013205240247771144, "loss/value_avg": 7.533830642700195, "lr": 2.97e-06, "objective/entropy": 129.00747680664062, "objective/kl": 0.750293493270874, "objective/non_score_reward": -0.03751467540860176, "objective/rlhf_reward": -11.248336791992188, "objective/scores": -11.210822105407715, "policy/approxkl_avg": 1.4738931497504382e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8483725786209106, "step": 11, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000438690185547, "val/ratio_var": NaN }, { "episode": 12, "epoch": 0.0022535211267605635, "eps": 0, "loss/policy_avg": -6.954624950594734e-06, "loss/value_avg": 4.137267589569092, "lr": 2.967e-06, "objective/entropy": 106.32261657714844, "objective/kl": 4.431699752807617, "objective/non_score_reward": -0.2215849906206131, "objective/rlhf_reward": -8.891958236694336, "objective/scores": -8.67037296295166, "policy/approxkl_avg": 1.1889778050999666e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.876338243484497, "step": 12, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000017762184143, "val/ratio_var": NaN }, { "episode": 13, "epoch": 0.0024413145539906103, "eps": 0, "loss/policy_avg": 1.4026210010342766e-05, "loss/value_avg": 9.449892044067383, "lr": 2.964e-06, "objective/entropy": 95.6624755859375, "objective/kl": 1.3649709224700928, "objective/non_score_reward": -0.06824854761362076, "objective/rlhf_reward": -10.962462425231934, "objective/scores": -10.894213676452637, "policy/approxkl_avg": 1.0764010482944286e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7576006650924683, "step": 13, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998936057090759, "val/ratio_var": NaN }, { "episode": 14, "epoch": 0.002629107981220657, "eps": 0, "loss/policy_avg": 3.1489247476201854e-07, "loss/value_avg": 8.275161743164062, "lr": 2.961e-06, "objective/entropy": 105.96736145019531, "objective/kl": 3.4637176990509033, "objective/non_score_reward": -0.17318589985370636, "objective/rlhf_reward": -11.769269943237305, "objective/scores": -11.596083641052246, "policy/approxkl_avg": 7.071152197113406e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.824642539024353, "step": 14, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000712871551514, "val/ratio_var": NaN }, { "episode": 15, "epoch": 0.0028169014084507044, "eps": 0, "loss/policy_avg": 0.00013110322470311075, "loss/value_avg": 6.648995399475098, "lr": 2.958e-06, "objective/entropy": 118.96111297607422, "objective/kl": 1.9561591148376465, "objective/non_score_reward": -0.09780795872211456, "objective/rlhf_reward": -11.107560157775879, "objective/scores": -11.00975227355957, "policy/approxkl_avg": 9.799950362321397e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.117682695388794, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000369548797607, "val/ratio_var": NaN }, { "episode": 16, "epoch": 0.003004694835680751, "eps": 0, "loss/policy_avg": -8.82778549566865e-05, "loss/value_avg": 6.694149494171143, "lr": 2.955e-06, "objective/entropy": 79.55110168457031, "objective/kl": 2.108759641647339, "objective/non_score_reward": -0.1054379865527153, "objective/rlhf_reward": -11.98199462890625, "objective/scores": -11.876556396484375, "policy/approxkl_avg": 8.424951403185332e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7588996887207031, "step": 16, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000343322753906, "val/ratio_var": NaN }, { "episode": 17, "epoch": 0.003192488262910798, "eps": 0, "loss/policy_avg": 0.0001728129864204675, "loss/value_avg": 4.943781852722168, "lr": 2.952e-06, "objective/entropy": 145.1512451171875, "objective/kl": 1.2299878597259521, "objective/non_score_reward": -0.06149939447641373, "objective/rlhf_reward": -10.52453899383545, "objective/scores": -10.46303939819336, "policy/approxkl_avg": 1.5223808702558017e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.809386134147644, "step": 17, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999275207519531, "val/ratio_var": NaN }, { "episode": 18, "epoch": 0.0033802816901408453, "eps": 0, "loss/policy_avg": -0.00011015838390449062, "loss/value_avg": 4.582302093505859, "lr": 2.949e-06, "objective/entropy": 101.7217788696289, "objective/kl": 2.889047861099243, "objective/non_score_reward": -0.14445239305496216, "objective/rlhf_reward": -10.81423568725586, "objective/scores": -10.669783592224121, "policy/approxkl_avg": 1.1545481726216167e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1047110557556152, "step": 18, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999497532844543, "val/ratio_var": NaN }, { "episode": 19, "epoch": 0.003568075117370892, "eps": 0, "loss/policy_avg": -2.103481710946653e-05, "loss/value_avg": 6.498551845550537, "lr": 2.946e-06, "objective/entropy": 77.13409423828125, "objective/kl": 3.430631637573242, "objective/non_score_reward": -0.1715315878391266, "objective/rlhf_reward": -11.659293174743652, "objective/scores": -11.487761497497559, "policy/approxkl_avg": 4.072197867799332e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5056037902832031, "step": 19, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000556707382202, "val/ratio_var": NaN }, { "episode": 20, "epoch": 0.003755868544600939, "eps": 0, "loss/policy_avg": 5.612958193523809e-05, "loss/value_avg": 3.702760696411133, "lr": 2.943e-06, "objective/entropy": 76.89835357666016, "objective/kl": 3.17977237701416, "objective/non_score_reward": -0.1589886099100113, "objective/rlhf_reward": -11.411824226379395, "objective/scores": -11.252835273742676, "policy/approxkl_avg": 7.774686849870704e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6909548044204712, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001057386398315, "val/ratio_var": NaN }, { "episode": 21, "epoch": 0.003943661971830986, "eps": 0, "loss/policy_avg": 4.0990002162288874e-05, "loss/value_avg": 5.600470066070557, "lr": 2.9400000000000002e-06, "objective/entropy": 122.47428131103516, "objective/kl": 0.7818828821182251, "objective/non_score_reward": -0.03909413143992424, "objective/rlhf_reward": -10.868765830993652, "objective/scores": -10.829671859741211, "policy/approxkl_avg": 1.0502575520376922e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.951524257659912, "step": 21, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999860525131226, "val/ratio_var": NaN }, { "episode": 22, "epoch": 0.0041314553990610325, "eps": 0, "loss/policy_avg": -2.8947613827767782e-05, "loss/value_avg": 4.149322032928467, "lr": 2.937e-06, "objective/entropy": 91.36865234375, "objective/kl": 0.633216381072998, "objective/non_score_reward": -0.03166081756353378, "objective/rlhf_reward": -9.124651908874512, "objective/scores": -9.09299087524414, "policy/approxkl_avg": 6.058338897219073e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5152864456176758, "step": 22, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999745488166809, "val/ratio_var": NaN }, { "episode": 23, "epoch": 0.00431924882629108, "eps": 0, "loss/policy_avg": -2.4003802536753938e-05, "loss/value_avg": 4.4201555252075195, "lr": 2.934e-06, "objective/entropy": 95.92005920410156, "objective/kl": 6.137463092803955, "objective/non_score_reward": -0.3068731725215912, "objective/rlhf_reward": -11.429823875427246, "objective/scores": -11.122950553894043, "policy/approxkl_avg": 6.228873417057912e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8420778512954712, "step": 23, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999921977519989, "val/ratio_var": NaN }, { "episode": 24, "epoch": 0.004507042253521127, "eps": 0, "loss/policy_avg": 0.00010305980686098337, "loss/value_avg": 3.895770311355591, "lr": 2.931e-06, "objective/entropy": 90.50055694580078, "objective/kl": -1.1275094747543335, "objective/non_score_reward": 0.05637548863887787, "objective/rlhf_reward": -11.460719108581543, "objective/scores": -11.517094612121582, "policy/approxkl_avg": 1.1377586872640677e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8623600006103516, "step": 24, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000786781311035, "val/ratio_var": NaN }, { "episode": 25, "epoch": 0.004694835680751174, "eps": 0, "loss/policy_avg": 0.00011273600102867931, "loss/value_avg": 4.3795905113220215, "lr": 2.928e-06, "objective/entropy": 115.70638275146484, "objective/kl": -0.676271915435791, "objective/non_score_reward": 0.03381359204649925, "objective/rlhf_reward": -9.378944396972656, "objective/scores": -9.412757873535156, "policy/approxkl_avg": 1.1129555588240692e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0625805854797363, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000838041305542, "val/ratio_var": NaN }, { "episode": 26, "epoch": 0.004882629107981221, "eps": 0, "loss/policy_avg": -0.00011668115621432662, "loss/value_avg": 3.1358349323272705, "lr": 2.925e-06, "objective/entropy": 83.19625854492188, "objective/kl": 2.288026809692383, "objective/non_score_reward": -0.11440135538578033, "objective/rlhf_reward": -11.104059219360352, "objective/scores": -10.989657402038574, "policy/approxkl_avg": 1.0224009372450382e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4735153913497925, "step": 26, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000040531158447, "val/ratio_var": NaN }, { "episode": 27, "epoch": 0.0050704225352112674, "eps": 0, "loss/policy_avg": -2.3378515834338032e-05, "loss/value_avg": 2.322936773300171, "lr": 2.922e-06, "objective/entropy": 115.6414566040039, "objective/kl": 1.1482548713684082, "objective/non_score_reward": -0.057412728667259216, "objective/rlhf_reward": -10.864422798156738, "objective/scores": -10.80700969696045, "policy/approxkl_avg": 8.128109385552307e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1444644927978516, "step": 27, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999807476997375, "val/ratio_var": NaN }, { "episode": 28, "epoch": 0.005258215962441314, "eps": 0, "loss/policy_avg": 0.00019169753068126738, "loss/value_avg": 4.061575412750244, "lr": 2.919e-06, "objective/entropy": 88.08735656738281, "objective/kl": 1.3133659362792969, "objective/non_score_reward": -0.06566829979419708, "objective/rlhf_reward": -10.043410301208496, "objective/scores": -9.977742195129395, "policy/approxkl_avg": 1.1498546825805533e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6513787508010864, "step": 28, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.99993497133255, "val/ratio_var": NaN }, { "episode": 29, "epoch": 0.005446009389671361, "eps": 0, "loss/policy_avg": -8.381313091376796e-05, "loss/value_avg": 4.145792484283447, "lr": 2.916e-06, "objective/entropy": 113.21372985839844, "objective/kl": -0.3414345383644104, "objective/non_score_reward": 0.017071709036827087, "objective/rlhf_reward": -10.725777626037598, "objective/scores": -10.742849349975586, "policy/approxkl_avg": 9.835668635105321e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7891845703125, "step": 29, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000144243240356, "val/ratio_var": NaN }, { "episode": 30, "epoch": 0.005633802816901409, "eps": 0, "loss/policy_avg": -2.555127366576926e-06, "loss/value_avg": 1.8354614973068237, "lr": 2.913e-06, "objective/entropy": 123.514892578125, "objective/kl": 7.1594648361206055, "objective/non_score_reward": -0.35797327756881714, "objective/rlhf_reward": -10.713311195373535, "objective/scores": -10.355338096618652, "policy/approxkl_avg": 1.393191695342466e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7196943759918213, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000720024108887, "val/ratio_var": NaN }, { "episode": 31, "epoch": 0.0058215962441314556, "eps": 0, "loss/policy_avg": 2.2114447347121313e-05, "loss/value_avg": 3.1718387603759766, "lr": 2.91e-06, "objective/entropy": 128.23077392578125, "objective/kl": 5.189370155334473, "objective/non_score_reward": -0.25946855545043945, "objective/rlhf_reward": -10.484077453613281, "objective/scores": -10.224609375, "policy/approxkl_avg": 1.4628041355990717e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.351806640625, "step": 31, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9997697472572327, "val/ratio_var": NaN }, { "episode": 32, "epoch": 0.006009389671361502, "eps": 0, "loss/policy_avg": 1.9334396711201407e-05, "loss/value_avg": 1.1630524396896362, "lr": 2.907e-06, "objective/entropy": 94.7955093383789, "objective/kl": 4.363494396209717, "objective/non_score_reward": -0.21817469596862793, "objective/rlhf_reward": -10.61274528503418, "objective/scores": -10.394570350646973, "policy/approxkl_avg": 6.663707097231963e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8709832429885864, "step": 32, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999009966850281, "val/ratio_var": NaN }, { "episode": 33, "epoch": 0.006197183098591549, "eps": 0, "loss/policy_avg": -3.66174936061725e-05, "loss/value_avg": 2.7939913272857666, "lr": 2.904e-06, "objective/entropy": 90.96968841552734, "objective/kl": 5.214733600616455, "objective/non_score_reward": -0.26073670387268066, "objective/rlhf_reward": -12.032078742980957, "objective/scores": -11.771342277526855, "policy/approxkl_avg": 1.6065015984167985e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3584352731704712, "step": 33, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000051498413086, "val/ratio_var": NaN }, { "episode": 34, "epoch": 0.006384976525821596, "eps": 0, "loss/policy_avg": -7.692373401368968e-06, "loss/value_avg": 1.0703469514846802, "lr": 2.901e-06, "objective/entropy": 110.87727355957031, "objective/kl": 5.749995708465576, "objective/non_score_reward": -0.2874998152256012, "objective/rlhf_reward": -8.95866870880127, "objective/scores": -8.67116928100586, "policy/approxkl_avg": 1.5204211933905754e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.044541835784912, "step": 34, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000173568725586, "val/ratio_var": NaN }, { "episode": 35, "epoch": 0.006572769953051643, "eps": 0, "loss/policy_avg": 1.851567685662303e-05, "loss/value_avg": 2.6688926219940186, "lr": 2.898e-06, "objective/entropy": 86.99762725830078, "objective/kl": 4.505297660827637, "objective/non_score_reward": -0.22526486217975616, "objective/rlhf_reward": -11.03963565826416, "objective/scores": -10.814371109008789, "policy/approxkl_avg": 5.150129211983767e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6791784763336182, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000016450881958, "val/ratio_var": NaN }, { "episode": 36, "epoch": 0.0067605633802816905, "eps": 0, "loss/policy_avg": -2.9851806175429374e-05, "loss/value_avg": 1.9055718183517456, "lr": 2.895e-06, "objective/entropy": 63.99182891845703, "objective/kl": 2.4275705814361572, "objective/non_score_reward": -0.12137851119041443, "objective/rlhf_reward": -10.836272239685059, "objective/scores": -10.714893341064453, "policy/approxkl_avg": 3.309247986749142e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.311838984489441, "step": 36, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999628067016602, "val/ratio_var": NaN }, { "episode": 37, "epoch": 0.006948356807511737, "eps": 0, "loss/policy_avg": -6.459793803514913e-05, "loss/value_avg": 3.957153797149658, "lr": 2.892e-06, "objective/entropy": 48.798622131347656, "objective/kl": 3.975572109222412, "objective/non_score_reward": -0.19877861440181732, "objective/rlhf_reward": -11.976491928100586, "objective/scores": -11.777713775634766, "policy/approxkl_avg": 1.9264781769834372e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1463946104049683, "step": 37, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000495910644531, "val/ratio_var": NaN }, { "episode": 38, "epoch": 0.007136150234741784, "eps": 0, "loss/policy_avg": -9.883574966806918e-05, "loss/value_avg": 1.991323471069336, "lr": 2.8889999999999998e-06, "objective/entropy": 146.75433349609375, "objective/kl": 0.8421511650085449, "objective/non_score_reward": -0.04210756719112396, "objective/rlhf_reward": -11.244985580444336, "objective/scores": -11.20287799835205, "policy/approxkl_avg": 1.5024387778339587e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3156001567840576, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999986469745636, "val/ratio_var": NaN }, { "episode": 39, "epoch": 0.007323943661971831, "eps": 0, "loss/policy_avg": 8.762557263253257e-05, "loss/value_avg": 1.920060634613037, "lr": 2.886e-06, "objective/entropy": 88.22615051269531, "objective/kl": 10.147639274597168, "objective/non_score_reward": -0.5073819756507874, "objective/rlhf_reward": -12.446175575256348, "objective/scores": -11.938793182373047, "policy/approxkl_avg": 4.917680129779001e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.325266718864441, "step": 39, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999947190284729, "val/ratio_var": NaN }, { "episode": 40, "epoch": 0.007511737089201878, "eps": 0, "loss/policy_avg": 6.104415206209524e-06, "loss/value_avg": 0.8917202353477478, "lr": 2.883e-06, "objective/entropy": 127.18856811523438, "objective/kl": 11.66610336303711, "objective/non_score_reward": -0.5833052396774292, "objective/rlhf_reward": -11.548681259155273, "objective/scores": -10.965375900268555, "policy/approxkl_avg": 8.404946072460007e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2139525413513184, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000052452087402, "val/ratio_var": NaN }, { "episode": 41, "epoch": 0.007699530516431925, "eps": 0, "loss/policy_avg": 3.3891425118781626e-05, "loss/value_avg": 2.1987857818603516, "lr": 2.88e-06, "objective/entropy": 106.74901580810547, "objective/kl": 10.004382133483887, "objective/non_score_reward": -0.5002191066741943, "objective/rlhf_reward": -12.167585372924805, "objective/scores": -11.667366027832031, "policy/approxkl_avg": 7.151591319143336e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.289180040359497, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000055193901062, "val/ratio_var": NaN }, { "episode": 42, "epoch": 0.007887323943661971, "eps": 0, "loss/policy_avg": -9.6474053862039e-05, "loss/value_avg": 2.085190534591675, "lr": 2.877e-06, "objective/entropy": 62.83269119262695, "objective/kl": 2.3958017826080322, "objective/non_score_reward": -0.11979008466005325, "objective/rlhf_reward": -8.464539527893066, "objective/scores": -8.344749450683594, "policy/approxkl_avg": 5.743742192976242e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4821524620056152, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000126361846924, "val/ratio_var": NaN }, { "episode": 43, "epoch": 0.008075117370892018, "eps": 0, "loss/policy_avg": -2.8772174118785188e-05, "loss/value_avg": 4.1844611167907715, "lr": 2.874e-06, "objective/entropy": 114.22052764892578, "objective/kl": 0.43981385231018066, "objective/non_score_reward": -0.02199070155620575, "objective/rlhf_reward": -12.45682430267334, "objective/scores": -12.434833526611328, "policy/approxkl_avg": 7.945141788923138e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.053934335708618, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000972747802734, "val/ratio_var": NaN }, { "episode": 44, "epoch": 0.008262910798122065, "eps": 0, "loss/policy_avg": -0.000106460640381556, "loss/value_avg": 2.9273459911346436, "lr": 2.871e-06, "objective/entropy": 133.04550170898438, "objective/kl": 4.506217956542969, "objective/non_score_reward": -0.22531089186668396, "objective/rlhf_reward": -10.46045207977295, "objective/scores": -10.235140800476074, "policy/approxkl_avg": 1.616369758039582e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1005630493164062, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000616312026978, "val/ratio_var": NaN }, { "episode": 45, "epoch": 0.008450704225352112, "eps": 0, "loss/policy_avg": -5.434144441096578e-06, "loss/value_avg": 1.1862280368804932, "lr": 2.868e-06, "objective/entropy": 101.84860229492188, "objective/kl": 7.871419906616211, "objective/non_score_reward": -0.39357098937034607, "objective/rlhf_reward": -11.007119178771973, "objective/scores": -10.613548278808594, "policy/approxkl_avg": 5.8926627133359943e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1164045333862305, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000081062316895, "val/ratio_var": NaN }, { "episode": 46, "epoch": 0.00863849765258216, "eps": 0, "loss/policy_avg": -2.249231874884572e-05, "loss/value_avg": 1.3462660312652588, "lr": 2.865e-06, "objective/entropy": 84.99420166015625, "objective/kl": 7.619663238525391, "objective/non_score_reward": -0.3809831142425537, "objective/rlhf_reward": -9.478157043457031, "objective/scores": -9.097173690795898, "policy/approxkl_avg": 7.49967767887938e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4456557035446167, "step": 46, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999799132347107, "val/ratio_var": NaN }, { "episode": 47, "epoch": 0.008826291079812207, "eps": 0, "loss/policy_avg": -6.67572021484375e-05, "loss/value_avg": 1.0165573358535767, "lr": 2.862e-06, "objective/entropy": 66.35957336425781, "objective/kl": 0.9106484055519104, "objective/non_score_reward": -0.04553245007991791, "objective/rlhf_reward": -11.847959518432617, "objective/scores": -11.802427291870117, "policy/approxkl_avg": 6.093904403314809e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5423583984375, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001013278961182, "val/ratio_var": NaN }, { "episode": 48, "epoch": 0.009014084507042254, "eps": 0, "loss/policy_avg": 4.4849683035863563e-05, "loss/value_avg": 1.9666733741760254, "lr": 2.859e-06, "objective/entropy": 62.499908447265625, "objective/kl": 1.6098904609680176, "objective/non_score_reward": -0.08049450814723969, "objective/rlhf_reward": -11.564969062805176, "objective/scores": -11.484474182128906, "policy/approxkl_avg": 4.901587757899506e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1136497259140015, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000277757644653, "val/ratio_var": NaN }, { "episode": 49, "epoch": 0.0092018779342723, "eps": 0, "loss/policy_avg": -1.861464261310175e-05, "loss/value_avg": 1.1666669845581055, "lr": 2.856e-06, "objective/entropy": 67.25932312011719, "objective/kl": 7.191189289093018, "objective/non_score_reward": -0.35955947637557983, "objective/rlhf_reward": -11.880326271057129, "objective/scores": -11.520767211914062, "policy/approxkl_avg": 8.41771878867803e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6355948448181152, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000905990600586, "val/ratio_var": NaN }, { "episode": 50, "epoch": 0.009389671361502348, "eps": 0, "loss/policy_avg": 5.038279596192297e-06, "loss/value_avg": 1.3691303730010986, "lr": 2.853e-06, "objective/entropy": 87.73133850097656, "objective/kl": 7.58608341217041, "objective/non_score_reward": -0.3793042004108429, "objective/rlhf_reward": -11.530131340026855, "objective/scores": -11.150827407836914, "policy/approxkl_avg": 8.476425250592001e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8863179683685303, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000296831130981, "val/ratio_var": NaN }, { "episode": 51, "epoch": 0.009577464788732394, "eps": 0, "loss/policy_avg": -3.130480763502419e-05, "loss/value_avg": 2.132746934890747, "lr": 2.85e-06, "objective/entropy": 115.10606384277344, "objective/kl": 4.390524387359619, "objective/non_score_reward": -0.2195262312889099, "objective/rlhf_reward": -12.25935173034668, "objective/scores": -12.039825439453125, "policy/approxkl_avg": 9.490728558603223e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.002812147140503, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000275373458862, "val/ratio_var": NaN }, { "episode": 52, "epoch": 0.009765258215962441, "eps": 0, "loss/policy_avg": 0.00010477371688466519, "loss/value_avg": 1.9598627090454102, "lr": 2.847e-06, "objective/entropy": 127.09746551513672, "objective/kl": 6.9939656257629395, "objective/non_score_reward": -0.3496982455253601, "objective/rlhf_reward": -11.009757995605469, "objective/scores": -10.660059928894043, "policy/approxkl_avg": 1.1446445569163188e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3958187103271484, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998849034309387, "val/ratio_var": NaN }, { "episode": 53, "epoch": 0.009953051643192488, "eps": 0, "loss/policy_avg": -2.4507629859726876e-05, "loss/value_avg": 0.8414835929870605, "lr": 2.844e-06, "objective/entropy": 96.98820495605469, "objective/kl": 6.676660060882568, "objective/non_score_reward": -0.3338330388069153, "objective/rlhf_reward": -11.65561580657959, "objective/scores": -11.321783065795898, "policy/approxkl_avg": 1.4336886522414716e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6719555854797363, "step": 53, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001496076583862, "val/ratio_var": NaN }, { "episode": 54, "epoch": 0.010140845070422535, "eps": 0, "loss/policy_avg": -0.00013297908299136907, "loss/value_avg": 0.8910483717918396, "lr": 2.841e-06, "objective/entropy": 113.40684509277344, "objective/kl": 2.6107702255249023, "objective/non_score_reward": -0.13053849339485168, "objective/rlhf_reward": -10.657209396362305, "objective/scores": -10.526670455932617, "policy/approxkl_avg": 1.5372177131212084e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8619270324707031, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000078797340393, "val/ratio_var": NaN }, { "episode": 55, "epoch": 0.010328638497652582, "eps": 0, "loss/policy_avg": 2.384185791015625e-05, "loss/value_avg": 1.2867923974990845, "lr": 2.8379999999999998e-06, "objective/entropy": 83.54084014892578, "objective/kl": -2.5146942138671875, "objective/non_score_reward": 0.12573471665382385, "objective/rlhf_reward": -11.015146255493164, "objective/scores": -11.140880584716797, "policy/approxkl_avg": 7.817519787067795e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7360817193984985, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998679161071777, "val/ratio_var": NaN }, { "episode": 56, "epoch": 0.010516431924882629, "eps": 0, "loss/policy_avg": -2.077390490740072e-05, "loss/value_avg": 1.340038537979126, "lr": 2.835e-06, "objective/entropy": 70.49563598632812, "objective/kl": 3.038593053817749, "objective/non_score_reward": -0.15192964673042297, "objective/rlhf_reward": -8.427001953125, "objective/scores": -8.27507209777832, "policy/approxkl_avg": 7.749152075575694e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5707592964172363, "step": 56, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000008344650269, "val/ratio_var": NaN }, { "episode": 57, "epoch": 0.010704225352112675, "eps": 0, "loss/policy_avg": -1.8853061192203313e-05, "loss/value_avg": 0.9871846437454224, "lr": 2.8319999999999997e-06, "objective/entropy": 103.52996826171875, "objective/kl": 9.19045352935791, "objective/non_score_reward": -0.45952269434928894, "objective/rlhf_reward": -10.30367374420166, "objective/scores": -9.844151496887207, "policy/approxkl_avg": 1.0477547363052508e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7314821481704712, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000463724136353, "val/ratio_var": NaN }, { "episode": 58, "epoch": 0.010892018779342722, "eps": 0, "loss/policy_avg": 6.043236135155894e-05, "loss/value_avg": 1.3239272832870483, "lr": 2.829e-06, "objective/entropy": 116.81423950195312, "objective/kl": 5.677644729614258, "objective/non_score_reward": -0.2838822305202484, "objective/rlhf_reward": -11.41634750366211, "objective/scores": -11.132465362548828, "policy/approxkl_avg": 8.638802739824314e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2698214054107666, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000132322311401, "val/ratio_var": NaN }, { "episode": 59, "epoch": 0.01107981220657277, "eps": 0, "loss/policy_avg": 6.777385715395212e-05, "loss/value_avg": 0.6727828979492188, "lr": 2.8259999999999997e-06, "objective/entropy": 106.17517852783203, "objective/kl": 10.683148384094238, "objective/non_score_reward": -0.5341574549674988, "objective/rlhf_reward": -11.980522155761719, "objective/scores": -11.446364402770996, "policy/approxkl_avg": 8.359666736623694e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7164422273635864, "step": 59, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000511407852173, "val/ratio_var": NaN }, { "episode": 60, "epoch": 0.011267605633802818, "eps": 0, "loss/policy_avg": -6.670322181889787e-05, "loss/value_avg": 0.9024935960769653, "lr": 2.823e-06, "objective/entropy": 86.5704345703125, "objective/kl": 15.158479690551758, "objective/non_score_reward": -0.7579240202903748, "objective/rlhf_reward": -11.120813369750977, "objective/scores": -10.362889289855957, "policy/approxkl_avg": 8.601389822615602e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9122798442840576, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000865459442139, "val/ratio_var": NaN }, { "episode": 61, "epoch": 0.011455399061032864, "eps": 0, "loss/policy_avg": 7.8885059338063e-05, "loss/value_avg": 2.075629234313965, "lr": 2.82e-06, "objective/entropy": 119.55440521240234, "objective/kl": 8.682973861694336, "objective/non_score_reward": -0.4341486692428589, "objective/rlhf_reward": -11.503044128417969, "objective/scores": -11.06889533996582, "policy/approxkl_avg": 1.3398937426245539e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.159679412841797, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000330209732056, "val/ratio_var": NaN }, { "episode": 62, "epoch": 0.011643192488262911, "eps": 0, "loss/policy_avg": -2.1007825125707313e-05, "loss/value_avg": 1.0716050863265991, "lr": 2.817e-06, "objective/entropy": 92.87950134277344, "objective/kl": 4.641087532043457, "objective/non_score_reward": -0.23205438256263733, "objective/rlhf_reward": -11.421432495117188, "objective/scores": -11.189377784729004, "policy/approxkl_avg": 8.487394609346666e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8937066793441772, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999229907989502, "val/ratio_var": NaN }, { "episode": 63, "epoch": 0.011830985915492958, "eps": 0, "loss/policy_avg": -8.605561561125796e-06, "loss/value_avg": 0.3560955822467804, "lr": 2.814e-06, "objective/entropy": 51.12089157104492, "objective/kl": 3.4639410972595215, "objective/non_score_reward": -0.17319706082344055, "objective/rlhf_reward": -10.589929580688477, "objective/scores": -10.416732788085938, "policy/approxkl_avg": 8.638239989977592e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9904670119285583, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000261068344116, "val/ratio_var": NaN }, { "episode": 64, "epoch": 0.012018779342723005, "eps": 0, "loss/policy_avg": 0.00016400049207732081, "loss/value_avg": 1.7807611227035522, "lr": 2.8110000000000003e-06, "objective/entropy": 132.6045379638672, "objective/kl": 5.921619892120361, "objective/non_score_reward": -0.296081006526947, "objective/rlhf_reward": -10.555618286132812, "objective/scores": -10.259537696838379, "policy/approxkl_avg": 1.296887432999938e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3249120712280273, "step": 64, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999074339866638, "val/ratio_var": NaN }, { "episode": 65, "epoch": 0.012206572769953052, "eps": 0, "loss/policy_avg": 3.868678686558269e-05, "loss/value_avg": 0.9352507591247559, "lr": 2.808e-06, "objective/entropy": 108.50924682617188, "objective/kl": 6.801136493682861, "objective/non_score_reward": -0.34005680680274963, "objective/rlhf_reward": -9.479233741760254, "objective/scores": -9.139177322387695, "policy/approxkl_avg": 1.7262111384752643e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8271254301071167, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001357793807983, "val/ratio_var": NaN }, { "episode": 66, "epoch": 0.012394366197183098, "eps": 0, "loss/policy_avg": -0.00011637526040431112, "loss/value_avg": 0.7669923901557922, "lr": 2.8050000000000002e-06, "objective/entropy": 91.91571044921875, "objective/kl": 5.970813274383545, "objective/non_score_reward": -0.2985406816005707, "objective/rlhf_reward": -11.293431282043457, "objective/scores": -10.994890213012695, "policy/approxkl_avg": 7.934018242394814e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.735215663909912, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000560283660889, "val/ratio_var": NaN }, { "episode": 67, "epoch": 0.012582159624413145, "eps": 0, "loss/policy_avg": 2.1907519567321287e-06, "loss/value_avg": 0.9098306894302368, "lr": 2.802e-06, "objective/entropy": 98.06114196777344, "objective/kl": 0.9793438911437988, "objective/non_score_reward": -0.04896720126271248, "objective/rlhf_reward": -10.813344955444336, "objective/scores": -10.76437759399414, "policy/approxkl_avg": 8.746538782133939e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9435113668441772, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999716281890869, "val/ratio_var": NaN }, { "episode": 68, "epoch": 0.012769953051643192, "eps": 0, "loss/policy_avg": 1.9163455817761132e-06, "loss/value_avg": 1.59093177318573, "lr": 2.7990000000000002e-06, "objective/entropy": 99.10956573486328, "objective/kl": 5.278895378112793, "objective/non_score_reward": -0.2639448046684265, "objective/rlhf_reward": -11.562628746032715, "objective/scores": -11.298684120178223, "policy/approxkl_avg": 1.448861866037987e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.743101954460144, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000709295272827, "val/ratio_var": NaN }, { "episode": 69, "epoch": 0.012957746478873239, "eps": 0, "loss/policy_avg": -4.698195698438212e-05, "loss/value_avg": 0.40410807728767395, "lr": 2.7960000000000004e-06, "objective/entropy": 71.88630676269531, "objective/kl": 4.847630500793457, "objective/non_score_reward": -0.2423815280199051, "objective/rlhf_reward": -10.779484748840332, "objective/scores": -10.537103652954102, "policy/approxkl_avg": 1.1875818017870188e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5886484384536743, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000667572021484, "val/ratio_var": NaN }, { "episode": 70, "epoch": 0.013145539906103286, "eps": 0, "loss/policy_avg": -3.258687138441019e-05, "loss/value_avg": 0.719239354133606, "lr": 2.793e-06, "objective/entropy": 98.8798828125, "objective/kl": 7.76128625869751, "objective/non_score_reward": -0.38806432485580444, "objective/rlhf_reward": -11.612489700317383, "objective/scores": -11.224425315856934, "policy/approxkl_avg": 9.905823361577859e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9469962120056152, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000438690185547, "val/ratio_var": NaN }, { "episode": 71, "epoch": 0.013333333333333334, "eps": 0, "loss/policy_avg": 6.536267756018788e-05, "loss/value_avg": 0.9536187052726746, "lr": 2.7900000000000004e-06, "objective/entropy": 84.67236328125, "objective/kl": 10.136289596557617, "objective/non_score_reward": -0.5068144798278809, "objective/rlhf_reward": -11.827306747436523, "objective/scores": -11.3204927444458, "policy/approxkl_avg": 7.58969065373094e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5808796882629395, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999906420707703, "val/ratio_var": NaN }, { "episode": 72, "epoch": 0.013521126760563381, "eps": 0, "loss/policy_avg": -0.0001073513412848115, "loss/value_avg": 0.9634543657302856, "lr": 2.787e-06, "objective/entropy": 83.6346206665039, "objective/kl": 11.093542098999023, "objective/non_score_reward": -0.5546771287918091, "objective/rlhf_reward": -11.005587577819824, "objective/scores": -10.450910568237305, "policy/approxkl_avg": 9.139143486436296e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5940885543823242, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999434351921082, "val/ratio_var": NaN }, { "episode": 73, "epoch": 0.013708920187793428, "eps": 0, "loss/policy_avg": 4.274665116099641e-05, "loss/value_avg": 0.4753601551055908, "lr": 2.7840000000000004e-06, "objective/entropy": 96.01142883300781, "objective/kl": 20.253341674804688, "objective/non_score_reward": -1.012667179107666, "objective/rlhf_reward": -10.810651779174805, "objective/scores": -9.797985076904297, "policy/approxkl_avg": 2.533678298277664e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8561781644821167, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000131130218506, "val/ratio_var": NaN }, { "episode": 74, "epoch": 0.013896713615023475, "eps": 0, "loss/policy_avg": 8.325307135237381e-05, "loss/value_avg": 1.147961974143982, "lr": 2.781e-06, "objective/entropy": 106.52192687988281, "objective/kl": 10.864701271057129, "objective/non_score_reward": -0.5432350635528564, "objective/rlhf_reward": -12.618977546691895, "objective/scores": -12.075742721557617, "policy/approxkl_avg": 8.56951203331846e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.953304648399353, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999954104423523, "val/ratio_var": NaN }, { "episode": 75, "epoch": 0.014084507042253521, "eps": 0, "loss/policy_avg": -4.8313500883523375e-05, "loss/value_avg": 0.6109148859977722, "lr": 2.7780000000000003e-06, "objective/entropy": 70.34735870361328, "objective/kl": 2.294069290161133, "objective/non_score_reward": -0.11470344662666321, "objective/rlhf_reward": -10.730679512023926, "objective/scores": -10.615976333618164, "policy/approxkl_avg": 7.145714420175864e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4720113277435303, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999332427978516, "val/ratio_var": NaN }, { "episode": 76, "epoch": 0.014272300469483568, "eps": 0, "loss/policy_avg": -2.9689861094084335e-06, "loss/value_avg": 0.8361274003982544, "lr": 2.775e-06, "objective/entropy": 52.2647705078125, "objective/kl": 10.118078231811523, "objective/non_score_reward": -0.505903959274292, "objective/rlhf_reward": -11.049149513244629, "objective/scores": -10.543245315551758, "policy/approxkl_avg": 1.8652384525807975e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.081895351409912, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000255107879639, "val/ratio_var": NaN }, { "episode": 77, "epoch": 0.014460093896713615, "eps": 0, "loss/policy_avg": -3.781858322327025e-05, "loss/value_avg": 1.1105821132659912, "lr": 2.7720000000000003e-06, "objective/entropy": 117.66423034667969, "objective/kl": 13.094846725463867, "objective/non_score_reward": -0.6547423601150513, "objective/rlhf_reward": -12.94738483428955, "objective/scores": -12.292642593383789, "policy/approxkl_avg": 1.5348501847256557e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9910451173782349, "step": 77, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000033378601074, "val/ratio_var": NaN }, { "episode": 78, "epoch": 0.014647887323943662, "eps": 0, "loss/policy_avg": -3.2182007998926565e-05, "loss/value_avg": 0.7051318287849426, "lr": 2.769e-06, "objective/entropy": 110.6497573852539, "objective/kl": 4.367968559265137, "objective/non_score_reward": -0.21839839220046997, "objective/rlhf_reward": -11.061654090881348, "objective/scores": -10.843255996704102, "policy/approxkl_avg": 1.1454759629714317e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0120067596435547, "step": 78, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999765753746033, "val/ratio_var": NaN }, { "episode": 79, "epoch": 0.014835680751173709, "eps": 0, "loss/policy_avg": 6.719805242028087e-05, "loss/value_avg": 0.6816800832748413, "lr": 2.7660000000000003e-06, "objective/entropy": 57.519039154052734, "objective/kl": 10.642614364624023, "objective/non_score_reward": -0.5321307182312012, "objective/rlhf_reward": -11.610565185546875, "objective/scores": -11.078433990478516, "policy/approxkl_avg": 5.2539522954475615e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4319653511047363, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000165700912476, "val/ratio_var": NaN }, { "episode": 80, "epoch": 0.015023474178403756, "eps": 0, "loss/policy_avg": 1.5533194527961314e-05, "loss/value_avg": 0.5895752310752869, "lr": 2.763e-06, "objective/entropy": 118.89598846435547, "objective/kl": 7.38232421875, "objective/non_score_reward": -0.3691161870956421, "objective/rlhf_reward": -11.019342422485352, "objective/scores": -10.650226593017578, "policy/approxkl_avg": 1.1037506197908442e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.157585859298706, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000044345855713, "val/ratio_var": NaN }, { "episode": 81, "epoch": 0.015211267605633802, "eps": 0, "loss/policy_avg": 1.992369652725756e-05, "loss/value_avg": 1.1628302335739136, "lr": 2.7600000000000003e-06, "objective/entropy": 79.7040786743164, "objective/kl": 5.036663055419922, "objective/non_score_reward": -0.25183314085006714, "objective/rlhf_reward": -10.631261825561523, "objective/scores": -10.37942886352539, "policy/approxkl_avg": 4.4152354661264326e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7419594526290894, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999467730522156, "val/ratio_var": NaN }, { "episode": 82, "epoch": 0.01539906103286385, "eps": 0, "loss/policy_avg": -8.902009722078219e-05, "loss/value_avg": 0.7622122168540955, "lr": 2.757e-06, "objective/entropy": 90.30813598632812, "objective/kl": 11.041743278503418, "objective/non_score_reward": -0.5520871877670288, "objective/rlhf_reward": -12.498211860656738, "objective/scores": -11.946125030517578, "policy/approxkl_avg": 6.761251825082581e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8083127737045288, "step": 82, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999771118164062, "val/ratio_var": NaN }, { "episode": 83, "epoch": 0.015586854460093896, "eps": 0, "loss/policy_avg": 7.239827391458675e-05, "loss/value_avg": 1.1908960342407227, "lr": 2.7540000000000002e-06, "objective/entropy": 99.2619857788086, "objective/kl": 6.396444797515869, "objective/non_score_reward": -0.3198222517967224, "objective/rlhf_reward": -10.028944969177246, "objective/scores": -9.709122657775879, "policy/approxkl_avg": 4.0597594619384836e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.014031171798706, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000303983688354, "val/ratio_var": NaN }, { "episode": 84, "epoch": 0.015774647887323943, "eps": 0, "loss/policy_avg": 3.1039398891152814e-05, "loss/value_avg": 0.9935503602027893, "lr": 2.751e-06, "objective/entropy": 106.16294860839844, "objective/kl": 6.526541709899902, "objective/non_score_reward": -0.3263270854949951, "objective/rlhf_reward": -10.440317153930664, "objective/scores": -10.11398983001709, "policy/approxkl_avg": 9.30452230818446e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0596556663513184, "step": 84, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000300407409668, "val/ratio_var": NaN }, { "episode": 85, "epoch": 0.01596244131455399, "eps": 0, "loss/policy_avg": -5.0787657528417185e-05, "loss/value_avg": 0.7484345436096191, "lr": 2.748e-06, "objective/entropy": 120.02645874023438, "objective/kl": 3.578875780105591, "objective/non_score_reward": -0.17894375324249268, "objective/rlhf_reward": -12.096677780151367, "objective/scores": -11.917734146118164, "policy/approxkl_avg": 1.3715953173232265e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0890214443206787, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000050067901611, "val/ratio_var": NaN }, { "episode": 86, "epoch": 0.016150234741784036, "eps": 0, "loss/policy_avg": -2.7278683774056844e-05, "loss/value_avg": 0.7041056156158447, "lr": 2.745e-06, "objective/entropy": 112.40882873535156, "objective/kl": 7.833919048309326, "objective/non_score_reward": -0.3916959762573242, "objective/rlhf_reward": -11.294679641723633, "objective/scores": -10.902983665466309, "policy/approxkl_avg": 7.522852030206195e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.906924843788147, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001083612442017, "val/ratio_var": NaN }, { "episode": 87, "epoch": 0.016338028169014085, "eps": 0, "loss/policy_avg": -2.3216571207740344e-05, "loss/value_avg": 0.709381103515625, "lr": 2.742e-06, "objective/entropy": 125.18426513671875, "objective/kl": 5.164757251739502, "objective/non_score_reward": -0.2582378685474396, "objective/rlhf_reward": -10.478553771972656, "objective/scores": -10.220315933227539, "policy/approxkl_avg": 1.0354914792287673e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.956172227859497, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000797510147095, "val/ratio_var": NaN }, { "episode": 88, "epoch": 0.01652582159624413, "eps": 0, "loss/policy_avg": 5.954166772426106e-05, "loss/value_avg": 0.6163420081138611, "lr": 2.7390000000000004e-06, "objective/entropy": 91.77549743652344, "objective/kl": 8.149456024169922, "objective/non_score_reward": -0.40747278928756714, "objective/rlhf_reward": -12.208357810974121, "objective/scores": -11.800885200500488, "policy/approxkl_avg": 1.0428956187524818e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9492602348327637, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999653100967407, "val/ratio_var": NaN }, { "episode": 89, "epoch": 0.01671361502347418, "eps": 0, "loss/policy_avg": 2.7031268473365344e-05, "loss/value_avg": 0.6552960276603699, "lr": 2.736e-06, "objective/entropy": 26.965755462646484, "objective/kl": 6.5906662940979, "objective/non_score_reward": -0.32953333854675293, "objective/rlhf_reward": -11.021146774291992, "objective/scores": -10.69161319732666, "policy/approxkl_avg": 3.1922546384066663e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5508365631103516, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999833106994629, "val/ratio_var": NaN }, { "episode": 90, "epoch": 0.016901408450704224, "eps": 0, "loss/policy_avg": 5.227214933256619e-05, "loss/value_avg": 0.6207807064056396, "lr": 2.7330000000000003e-06, "objective/entropy": 108.39534759521484, "objective/kl": 6.505982398986816, "objective/non_score_reward": -0.32529911398887634, "objective/rlhf_reward": -11.484232902526855, "objective/scores": -11.158933639526367, "policy/approxkl_avg": 8.246072980000463e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9117661714553833, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000008463859558, "val/ratio_var": NaN }, { "episode": 91, "epoch": 0.017089201877934272, "eps": 0, "loss/policy_avg": 8.340151907759719e-06, "loss/value_avg": 0.8453884720802307, "lr": 2.73e-06, "objective/entropy": 106.80381774902344, "objective/kl": 9.37204360961914, "objective/non_score_reward": -0.46860218048095703, "objective/rlhf_reward": -8.036922454833984, "objective/scores": -7.568319797515869, "policy/approxkl_avg": 9.578982229641042e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8644559383392334, "step": 91, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001009702682495, "val/ratio_var": NaN }, { "episode": 92, "epoch": 0.01727699530516432, "eps": 0, "loss/policy_avg": -1.5987539882189594e-05, "loss/value_avg": 2.0757484436035156, "lr": 2.7270000000000003e-06, "objective/entropy": 118.8765869140625, "objective/kl": 13.844137191772461, "objective/non_score_reward": -0.692206859588623, "objective/rlhf_reward": -11.983621597290039, "objective/scores": -11.291415214538574, "policy/approxkl_avg": 9.22142362469458e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1409244537353516, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000555515289307, "val/ratio_var": NaN }, { "episode": 93, "epoch": 0.017464788732394366, "eps": 0, "loss/policy_avg": -3.9316571928793564e-05, "loss/value_avg": 0.43395325541496277, "lr": 2.724e-06, "objective/entropy": 68.4170150756836, "objective/kl": 14.738064765930176, "objective/non_score_reward": -0.736903190612793, "objective/rlhf_reward": -11.76559829711914, "objective/scores": -11.028695106506348, "policy/approxkl_avg": 5.231740374256333e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7697983980178833, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000802278518677, "val/ratio_var": NaN }, { "episode": 94, "epoch": 0.017652582159624414, "eps": 0, "loss/policy_avg": -5.8637473557610065e-05, "loss/value_avg": 0.3008868098258972, "lr": 2.7210000000000003e-06, "objective/entropy": 78.66854858398438, "objective/kl": 5.160335540771484, "objective/non_score_reward": -0.25801679491996765, "objective/rlhf_reward": -11.200693130493164, "objective/scores": -10.942676544189453, "policy/approxkl_avg": 5.759839538654887e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6833381652832031, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999896228313446, "val/ratio_var": NaN }, { "episode": 95, "epoch": 0.01784037558685446, "eps": 0, "loss/policy_avg": 1.2460744756026543e-06, "loss/value_avg": 0.6554837226867676, "lr": 2.718e-06, "objective/entropy": 54.729148864746094, "objective/kl": 10.022623062133789, "objective/non_score_reward": -0.5011311769485474, "objective/rlhf_reward": -12.714678764343262, "objective/scores": -12.213547706604004, "policy/approxkl_avg": 5.0584063160385995e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1739479303359985, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00005042552948, "val/ratio_var": NaN }, { "episode": 96, "epoch": 0.018028169014084508, "eps": 0, "loss/policy_avg": -9.92541026789695e-05, "loss/value_avg": 0.8343378305435181, "lr": 2.7150000000000003e-06, "objective/entropy": 94.93035888671875, "objective/kl": 11.69411849975586, "objective/non_score_reward": -0.5847059488296509, "objective/rlhf_reward": -9.52371883392334, "objective/scores": -8.93901252746582, "policy/approxkl_avg": 7.912997546100087e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0619587898254395, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999560117721558, "val/ratio_var": NaN }, { "episode": 97, "epoch": 0.018215962441314553, "eps": 0, "loss/policy_avg": 0.0001273829984711483, "loss/value_avg": 0.6525049209594727, "lr": 2.712e-06, "objective/entropy": 102.16277313232422, "objective/kl": 5.069347858428955, "objective/non_score_reward": -0.2534674108028412, "objective/rlhf_reward": -10.724151611328125, "objective/scores": -10.470684051513672, "policy/approxkl_avg": 1.2048556641275354e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.173123359680176, "step": 97, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999117255210876, "val/ratio_var": NaN }, { "episode": 98, "epoch": 0.0184037558685446, "eps": 0, "loss/policy_avg": -7.230830669868737e-05, "loss/value_avg": 0.5554779767990112, "lr": 2.7090000000000002e-06, "objective/entropy": 102.30531311035156, "objective/kl": 14.285614013671875, "objective/non_score_reward": -0.7142806649208069, "objective/rlhf_reward": -12.2979736328125, "objective/scores": -11.58369255065918, "policy/approxkl_avg": 1.2609248756234592e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8178918361663818, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000258684158325, "val/ratio_var": NaN }, { "episode": 99, "epoch": 0.018591549295774647, "eps": 0, "loss/policy_avg": -1.1318134966131765e-05, "loss/value_avg": 0.543129026889801, "lr": 2.706e-06, "objective/entropy": 84.1186294555664, "objective/kl": 3.5293402671813965, "objective/non_score_reward": -0.17646701633930206, "objective/rlhf_reward": -11.226691246032715, "objective/scores": -11.050224304199219, "policy/approxkl_avg": 6.699781351926504e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8128961324691772, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998987913131714, "val/ratio_var": NaN }, { "episode": 100, "epoch": 0.018779342723004695, "eps": 0, "loss/policy_avg": 2.8749682314810343e-05, "loss/value_avg": 0.38489094376564026, "lr": 2.703e-06, "objective/entropy": 97.39865112304688, "objective/kl": 10.079329490661621, "objective/non_score_reward": -0.5039664506912231, "objective/rlhf_reward": -11.652255058288574, "objective/scores": -11.14828872680664, "policy/approxkl_avg": 7.667184576121144e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6694313287734985, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999038577079773, "val/ratio_var": NaN }, { "episode": 101, "epoch": 0.01896713615023474, "eps": 0, "loss/policy_avg": 2.4858511096681468e-05, "loss/value_avg": 0.47059834003448486, "lr": 2.7e-06, "objective/entropy": 123.10418701171875, "objective/kl": 10.407770156860352, "objective/non_score_reward": -0.5203885436058044, "objective/rlhf_reward": -11.422197341918945, "objective/scores": -10.901808738708496, "policy/approxkl_avg": 1.3321128733423393e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9853515625, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001482963562012, "val/ratio_var": NaN }, { "episode": 102, "epoch": 0.01915492957746479, "eps": 0, "loss/policy_avg": -4.687399268732406e-06, "loss/value_avg": 0.5947672724723816, "lr": 2.697e-06, "objective/entropy": 96.03418731689453, "objective/kl": 18.515512466430664, "objective/non_score_reward": -0.9257756471633911, "objective/rlhf_reward": -12.102278709411621, "objective/scores": -11.17650318145752, "policy/approxkl_avg": 7.690211845101658e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9811551570892334, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000019907951355, "val/ratio_var": NaN }, { "episode": 103, "epoch": 0.019342723004694834, "eps": 0, "loss/policy_avg": -7.88130855653435e-05, "loss/value_avg": 0.7429995536804199, "lr": 2.694e-06, "objective/entropy": 81.79742431640625, "objective/kl": 14.243643760681152, "objective/non_score_reward": -0.7121821641921997, "objective/rlhf_reward": -10.138145446777344, "objective/scores": -9.425963401794434, "policy/approxkl_avg": 1.0606901668097635e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.927923321723938, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000396966934204, "val/ratio_var": NaN }, { "episode": 104, "epoch": 0.019530516431924883, "eps": 0, "loss/policy_avg": -6.510626553790644e-05, "loss/value_avg": 0.3004145920276642, "lr": 2.691e-06, "objective/entropy": 82.23486328125, "objective/kl": 8.613237380981445, "objective/non_score_reward": -0.4306618571281433, "objective/rlhf_reward": -11.703987121582031, "objective/scores": -11.273324966430664, "policy/approxkl_avg": 7.863710038691352e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8487457036972046, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001091957092285, "val/ratio_var": NaN }, { "episode": 105, "epoch": 0.01971830985915493, "eps": 0, "loss/policy_avg": 4.3356194510124624e-05, "loss/value_avg": 0.5908864140510559, "lr": 2.688e-06, "objective/entropy": 83.3988265991211, "objective/kl": 5.6112871170043945, "objective/non_score_reward": -0.2805643677711487, "objective/rlhf_reward": -8.788839340209961, "objective/scores": -8.508275032043457, "policy/approxkl_avg": 6.67086865746569e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7055203914642334, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999873638153076, "val/ratio_var": NaN }, { "episode": 106, "epoch": 0.019906103286384976, "eps": 0, "loss/policy_avg": 0.0001763037871569395, "loss/value_avg": 0.9166803956031799, "lr": 2.685e-06, "objective/entropy": 82.26583862304688, "objective/kl": 4.8363189697265625, "objective/non_score_reward": -0.2418159395456314, "objective/rlhf_reward": -11.287281036376953, "objective/scores": -11.045465469360352, "policy/approxkl_avg": 1.1010519784804274e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9094630479812622, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999284744262695, "val/ratio_var": NaN }, { "episode": 107, "epoch": 0.020093896713615025, "eps": 0, "loss/policy_avg": -5.021185279474594e-05, "loss/value_avg": 0.4058072865009308, "lr": 2.6820000000000003e-06, "objective/entropy": 82.42610931396484, "objective/kl": 12.05010986328125, "objective/non_score_reward": -0.6025055646896362, "objective/rlhf_reward": -12.374974250793457, "objective/scores": -11.772468566894531, "policy/approxkl_avg": 1.3740147153384896e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.355708360671997, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998775720596313, "val/ratio_var": NaN }, { "episode": 108, "epoch": 0.02028169014084507, "eps": 0, "loss/policy_avg": -1.0443183782626875e-05, "loss/value_avg": 1.0055078268051147, "lr": 2.679e-06, "objective/entropy": 105.13883972167969, "objective/kl": 8.569499015808105, "objective/non_score_reward": -0.42847493290901184, "objective/rlhf_reward": -10.921231269836426, "objective/scores": -10.492755889892578, "policy/approxkl_avg": 8.639455018055742e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0113847255706787, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000057220458984, "val/ratio_var": NaN }, { "episode": 109, "epoch": 0.02046948356807512, "eps": 0, "loss/policy_avg": 2.3724898710497655e-05, "loss/value_avg": 0.5457293391227722, "lr": 2.6760000000000003e-06, "objective/entropy": 105.79176330566406, "objective/kl": 10.836904525756836, "objective/non_score_reward": -0.5418452620506287, "objective/rlhf_reward": -10.336092948913574, "objective/scores": -9.7942476272583, "policy/approxkl_avg": 9.423367686167694e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8689080476760864, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999444484710693, "val/ratio_var": NaN }, { "episode": 110, "epoch": 0.020657276995305163, "eps": 0, "loss/policy_avg": 2.420173586870078e-05, "loss/value_avg": 1.2975260019302368, "lr": 2.673e-06, "objective/entropy": 98.12688446044922, "objective/kl": 6.357450485229492, "objective/non_score_reward": -0.317872554063797, "objective/rlhf_reward": -12.927559852600098, "objective/scores": -12.609686851501465, "policy/approxkl_avg": 1.1509537500842271e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0442285537719727, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999987483024597, "val/ratio_var": NaN }, { "episode": 111, "epoch": 0.020845070422535212, "eps": 0, "loss/policy_avg": 1.669604898779653e-05, "loss/value_avg": 0.5313883423805237, "lr": 2.6700000000000003e-06, "objective/entropy": 90.2462158203125, "objective/kl": 12.252594947814941, "objective/non_score_reward": -0.6126296520233154, "objective/rlhf_reward": -11.065726280212402, "objective/scores": -10.453096389770508, "policy/approxkl_avg": 1.2447243591395818e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7626769542694092, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000011920928955, "val/ratio_var": NaN }, { "episode": 112, "epoch": 0.021032863849765257, "eps": 0, "loss/policy_avg": 2.393182694504503e-05, "loss/value_avg": 0.6011212468147278, "lr": 2.667e-06, "objective/entropy": 73.97472381591797, "objective/kl": 17.270288467407227, "objective/non_score_reward": -0.8635144829750061, "objective/rlhf_reward": -12.186919212341309, "objective/scores": -11.323404312133789, "policy/approxkl_avg": 8.743162993596343e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7176836729049683, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999647736549377, "val/ratio_var": NaN }, { "episode": 113, "epoch": 0.021220657276995306, "eps": 0, "loss/policy_avg": -4.0625625842949376e-05, "loss/value_avg": 0.39422744512557983, "lr": 2.6640000000000002e-06, "objective/entropy": 78.90961456298828, "objective/kl": 10.700329780578613, "objective/non_score_reward": -0.5350164771080017, "objective/rlhf_reward": -11.677353858947754, "objective/scores": -11.142337799072266, "policy/approxkl_avg": 8.962484088215206e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.297789454460144, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000133514404297, "val/ratio_var": NaN }, { "episode": 114, "epoch": 0.02140845070422535, "eps": 0, "loss/policy_avg": -5.372740270104259e-05, "loss/value_avg": 0.5474290251731873, "lr": 2.661e-06, "objective/entropy": 98.49168395996094, "objective/kl": 14.771495819091797, "objective/non_score_reward": -0.738574743270874, "objective/rlhf_reward": -12.898970603942871, "objective/scores": -12.160395622253418, "policy/approxkl_avg": 1.3615044736070558e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8004426956176758, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000749826431274, "val/ratio_var": NaN }, { "episode": 115, "epoch": 0.0215962441314554, "eps": 0, "loss/policy_avg": -1.9253425307397265e-06, "loss/value_avg": 1.0356911420822144, "lr": 2.6580000000000002e-06, "objective/entropy": 89.24701690673828, "objective/kl": 12.415081024169922, "objective/non_score_reward": -0.620754063129425, "objective/rlhf_reward": -12.041976928710938, "objective/scores": -11.421222686767578, "policy/approxkl_avg": 1.9764078729167522e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.928036093711853, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999377131462097, "val/ratio_var": NaN }, { "episode": 116, "epoch": 0.021784037558685444, "eps": 0, "loss/policy_avg": -3.375647065695375e-05, "loss/value_avg": 0.6831283569335938, "lr": 2.655e-06, "objective/entropy": 115.3565444946289, "objective/kl": 7.171603679656982, "objective/non_score_reward": -0.35858017206192017, "objective/rlhf_reward": -12.042454719543457, "objective/scores": -11.683874130249023, "policy/approxkl_avg": 1.6114373124764825e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8451550006866455, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000449419021606, "val/ratio_var": NaN }, { "episode": 117, "epoch": 0.021971830985915493, "eps": 0, "loss/policy_avg": -5.384661108109867e-06, "loss/value_avg": 0.6162310838699341, "lr": 2.652e-06, "objective/entropy": 106.42625427246094, "objective/kl": 12.072477340698242, "objective/non_score_reward": -0.6036238670349121, "objective/rlhf_reward": -10.507926940917969, "objective/scores": -9.904302597045898, "policy/approxkl_avg": 6.798443052957737e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8217105865478516, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000131130218506, "val/ratio_var": NaN }, { "episode": 118, "epoch": 0.02215962441314554, "eps": 0, "loss/policy_avg": 3.222249506507069e-05, "loss/value_avg": 0.792917788028717, "lr": 2.649e-06, "objective/entropy": 112.77588653564453, "objective/kl": 9.012856483459473, "objective/non_score_reward": -0.450642853975296, "objective/rlhf_reward": -12.826851844787598, "objective/scores": -12.376209259033203, "policy/approxkl_avg": 1.644503697662003e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2062482833862305, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998651146888733, "val/ratio_var": NaN }, { "episode": 119, "epoch": 0.022347417840375586, "eps": 0, "loss/policy_avg": -1.5519699445576407e-05, "loss/value_avg": 0.5791659951210022, "lr": 2.646e-06, "objective/entropy": 37.611507415771484, "objective/kl": 2.236257553100586, "objective/non_score_reward": -0.11181288957595825, "objective/rlhf_reward": -10.67795467376709, "objective/scores": -10.566142082214355, "policy/approxkl_avg": 4.574174639060402e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9348812699317932, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999694228172302, "val/ratio_var": NaN }, { "episode": 120, "epoch": 0.022535211267605635, "eps": 0, "loss/policy_avg": -1.0175524948863313e-05, "loss/value_avg": 0.3691723346710205, "lr": 2.643e-06, "objective/entropy": 118.53192901611328, "objective/kl": 23.477018356323242, "objective/non_score_reward": -1.1738508939743042, "objective/rlhf_reward": -10.412705421447754, "objective/scores": -9.23885440826416, "policy/approxkl_avg": 1.0892347290791804e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0391340255737305, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000553131103516, "val/ratio_var": NaN }, { "episode": 121, "epoch": 0.02272300469483568, "eps": 0, "loss/policy_avg": -2.625303386594169e-05, "loss/value_avg": 0.45505890250205994, "lr": 2.64e-06, "objective/entropy": 92.2886734008789, "objective/kl": 19.32079315185547, "objective/non_score_reward": -0.9660395979881287, "objective/rlhf_reward": -12.136062622070312, "objective/scores": -11.170022964477539, "policy/approxkl_avg": 7.847287974982464e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7370997667312622, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999802708625793, "val/ratio_var": NaN }, { "episode": 122, "epoch": 0.02291079812206573, "eps": 0, "loss/policy_avg": -3.8421378121711314e-05, "loss/value_avg": 0.7214630246162415, "lr": 2.637e-06, "objective/entropy": 102.9349365234375, "objective/kl": 13.256671905517578, "objective/non_score_reward": -0.662833571434021, "objective/rlhf_reward": -11.51992130279541, "objective/scores": -10.857088088989258, "policy/approxkl_avg": 1.333757921884171e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8363797664642334, "step": 122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000237226486206, "val/ratio_var": NaN }, { "episode": 123, "epoch": 0.023098591549295774, "eps": 0, "loss/policy_avg": -6.77198768244125e-05, "loss/value_avg": 0.6664794087409973, "lr": 2.634e-06, "objective/entropy": 103.09527587890625, "objective/kl": 10.523482322692871, "objective/non_score_reward": -0.5261741280555725, "objective/rlhf_reward": -11.192225456237793, "objective/scores": -10.666050910949707, "policy/approxkl_avg": 1.1033840507934656e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8879441022872925, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000050067901611, "val/ratio_var": NaN }, { "episode": 124, "epoch": 0.023286384976525822, "eps": 0, "loss/policy_avg": -5.91278076171875e-05, "loss/value_avg": 0.4947410523891449, "lr": 2.631e-06, "objective/entropy": 86.10026550292969, "objective/kl": 25.742351531982422, "objective/non_score_reward": -1.2871177196502686, "objective/rlhf_reward": -12.745865821838379, "objective/scores": -11.458747863769531, "policy/approxkl_avg": 9.267009914992741e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8585182428359985, "step": 124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999694228172302, "val/ratio_var": NaN }, { "episode": 125, "epoch": 0.023474178403755867, "eps": 0, "loss/policy_avg": -0.0001048029589583166, "loss/value_avg": 0.8118029832839966, "lr": 2.628e-06, "objective/entropy": 94.12370300292969, "objective/kl": 6.691989898681641, "objective/non_score_reward": -0.33459949493408203, "objective/rlhf_reward": -10.666067123413086, "objective/scores": -10.331467628479004, "policy/approxkl_avg": 1.3673147236659133e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5309160947799683, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0002028942108154, "val/ratio_var": NaN }, { "episode": 126, "epoch": 0.023661971830985916, "eps": 0, "loss/policy_avg": 2.0760409825015813e-05, "loss/value_avg": 0.2653543949127197, "lr": 2.6250000000000003e-06, "objective/entropy": 83.44908905029297, "objective/kl": 9.000375747680664, "objective/non_score_reward": -0.45001882314682007, "objective/rlhf_reward": -11.949639320373535, "objective/scores": -11.49962043762207, "policy/approxkl_avg": 6.289425158456652e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5466538667678833, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999808669090271, "val/ratio_var": NaN }, { "episode": 127, "epoch": 0.02384976525821596, "eps": 0, "loss/policy_avg": 0.0001553094625705853, "loss/value_avg": 0.32733437418937683, "lr": 2.622e-06, "objective/entropy": 130.70034790039062, "objective/kl": 7.322316646575928, "objective/non_score_reward": -0.3661157786846161, "objective/rlhf_reward": -11.933454513549805, "objective/scores": -11.567338943481445, "policy/approxkl_avg": 1.3855687086561375e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2280826568603516, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00004243850708, "val/ratio_var": NaN }, { "episode": 128, "epoch": 0.02403755868544601, "eps": 0, "loss/policy_avg": 2.689631401153747e-05, "loss/value_avg": 1.3124293088912964, "lr": 2.6190000000000003e-06, "objective/entropy": 109.83758544921875, "objective/kl": 9.707868576049805, "objective/non_score_reward": -0.4853934943675995, "objective/rlhf_reward": -11.708505630493164, "objective/scores": -11.223112106323242, "policy/approxkl_avg": 1.4776473733491002e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8152776956558228, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999714493751526, "val/ratio_var": NaN }, { "episode": 129, "epoch": 0.024225352112676058, "eps": 0, "loss/policy_avg": -2.3675414922763593e-05, "loss/value_avg": 0.8364843726158142, "lr": 2.616e-06, "objective/entropy": 86.4832534790039, "objective/kl": 7.820969581604004, "objective/non_score_reward": -0.39104849100112915, "objective/rlhf_reward": -11.034602165222168, "objective/scores": -10.643553733825684, "policy/approxkl_avg": 1.0893255364408105e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.620794415473938, "step": 129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999911904335022, "val/ratio_var": NaN }, { "episode": 130, "epoch": 0.024413145539906103, "eps": 0, "loss/policy_avg": -3.1507239327766e-05, "loss/value_avg": 0.2683379054069519, "lr": 2.6130000000000002e-06, "objective/entropy": 89.66504669189453, "objective/kl": 18.529006958007812, "objective/non_score_reward": -0.9264503717422485, "objective/rlhf_reward": -11.325263977050781, "objective/scores": -10.398813247680664, "policy/approxkl_avg": 1.2142552918703586e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7323228120803833, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000076293945312, "val/ratio_var": NaN }, { "episode": 131, "epoch": 0.02460093896713615, "eps": 0, "loss/policy_avg": -3.0130710001685657e-05, "loss/value_avg": 0.7166529297828674, "lr": 2.61e-06, "objective/entropy": 98.91737365722656, "objective/kl": 3.675016403198242, "objective/non_score_reward": -0.18375082314014435, "objective/rlhf_reward": -10.694395065307617, "objective/scores": -10.51064395904541, "policy/approxkl_avg": 7.978449900747364e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6859891414642334, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999847412109375, "val/ratio_var": NaN }, { "episode": 132, "epoch": 0.024788732394366197, "eps": 0, "loss/policy_avg": 1.0274491614836734e-05, "loss/value_avg": 0.4408113956451416, "lr": 2.607e-06, "objective/entropy": 83.91133117675781, "objective/kl": 7.95505428314209, "objective/non_score_reward": -0.39775270223617554, "objective/rlhf_reward": -10.378518104553223, "objective/scores": -9.980765342712402, "policy/approxkl_avg": 7.143622582361786e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5218943357467651, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999787211418152, "val/ratio_var": NaN }, { "episode": 133, "epoch": 0.024976525821596245, "eps": 0, "loss/policy_avg": 1.6500365745741874e-05, "loss/value_avg": 0.5923090577125549, "lr": 2.604e-06, "objective/entropy": 105.81744384765625, "objective/kl": 15.214542388916016, "objective/non_score_reward": -0.7607271075248718, "objective/rlhf_reward": -10.99831485748291, "objective/scores": -10.237587928771973, "policy/approxkl_avg": 1.2953722716702032e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1242997646331787, "step": 133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999325275421143, "val/ratio_var": NaN }, { "episode": 134, "epoch": 0.02516431924882629, "eps": 0, "loss/policy_avg": 2.1646606910508126e-05, "loss/value_avg": 0.3157866597175598, "lr": 2.601e-06, "objective/entropy": 39.07932662963867, "objective/kl": 9.939384460449219, "objective/non_score_reward": -0.49696922302246094, "objective/rlhf_reward": -10.282241821289062, "objective/scores": -9.785272598266602, "policy/approxkl_avg": 2.6619078496992188e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9390961527824402, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000022053718567, "val/ratio_var": NaN }, { "episode": 135, "epoch": 0.02535211267605634, "eps": 0, "loss/policy_avg": -0.00011641574383247644, "loss/value_avg": 0.5259698033332825, "lr": 2.598e-06, "objective/entropy": 78.10018157958984, "objective/kl": 20.79111099243164, "objective/non_score_reward": -1.039555549621582, "objective/rlhf_reward": -12.081794738769531, "objective/scores": -11.04223918914795, "policy/approxkl_avg": 1.0485104695590053e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.852308750152588, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999399781227112, "val/ratio_var": NaN }, { "episode": 136, "epoch": 0.025539906103286384, "eps": 0, "loss/policy_avg": -1.0733334420365281e-05, "loss/value_avg": 0.7988017201423645, "lr": 2.595e-06, "objective/entropy": 62.50822448730469, "objective/kl": -1.8606102466583252, "objective/non_score_reward": 0.09303051978349686, "objective/rlhf_reward": -9.57384967803955, "objective/scores": -9.66688060760498, "policy/approxkl_avg": 6.619652026529366e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9958841800689697, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000110864639282, "val/ratio_var": NaN }, { "episode": 137, "epoch": 0.025727699530516433, "eps": 0, "loss/policy_avg": 2.4080276489257812e-05, "loss/value_avg": 0.4405337870121002, "lr": 2.592e-06, "objective/entropy": 120.31510925292969, "objective/kl": 13.188962936401367, "objective/non_score_reward": -0.6594482064247131, "objective/rlhf_reward": -11.147398948669434, "objective/scores": -10.487950325012207, "policy/approxkl_avg": 1.1582484660266346e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0921125411987305, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001236200332642, "val/ratio_var": NaN }, { "episode": 138, "epoch": 0.025915492957746478, "eps": 0, "loss/policy_avg": 7.395699503831565e-05, "loss/value_avg": 0.5236732959747314, "lr": 2.589e-06, "objective/entropy": 90.833740234375, "objective/kl": 15.413938522338867, "objective/non_score_reward": -0.7706968784332275, "objective/rlhf_reward": -12.133798599243164, "objective/scores": -11.363101959228516, "policy/approxkl_avg": 9.873337347698907e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8711421489715576, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000197887420654, "val/ratio_var": NaN }, { "episode": 139, "epoch": 0.026103286384976526, "eps": 0, "loss/policy_avg": -1.7296593796345405e-05, "loss/value_avg": 0.5577424764633179, "lr": 2.586e-06, "objective/entropy": 66.2403564453125, "objective/kl": 19.181020736694336, "objective/non_score_reward": -0.9590510725975037, "objective/rlhf_reward": -12.260686874389648, "objective/scores": -11.3016357421875, "policy/approxkl_avg": 9.090403096934097e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.533009648323059, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999181628227234, "val/ratio_var": NaN }, { "episode": 140, "epoch": 0.02629107981220657, "eps": 0, "loss/policy_avg": 1.7975860828300938e-05, "loss/value_avg": 0.29716527462005615, "lr": 2.583e-06, "objective/entropy": 89.15927124023438, "objective/kl": 13.560918807983398, "objective/non_score_reward": -0.6780458688735962, "objective/rlhf_reward": -11.751083374023438, "objective/scores": -11.073037147521973, "policy/approxkl_avg": 8.195436862479255e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8485982418060303, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999496340751648, "val/ratio_var": NaN }, { "episode": 141, "epoch": 0.02647887323943662, "eps": 0, "loss/policy_avg": 1.2406762834871188e-05, "loss/value_avg": 0.44688308238983154, "lr": 2.58e-06, "objective/entropy": 53.74085998535156, "objective/kl": 9.671512603759766, "objective/non_score_reward": -0.48357561230659485, "objective/rlhf_reward": -11.81394100189209, "objective/scores": -11.330365180969238, "policy/approxkl_avg": 6.092892590459087e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7865381836891174, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999701976776123, "val/ratio_var": NaN }, { "episode": 142, "epoch": 0.02666666666666667, "eps": 0, "loss/policy_avg": 1.1075218026235234e-05, "loss/value_avg": 0.28803786635398865, "lr": 2.577e-06, "objective/entropy": 95.82167053222656, "objective/kl": 6.305478096008301, "objective/non_score_reward": -0.31527388095855713, "objective/rlhf_reward": -11.002294540405273, "objective/scores": -10.687020301818848, "policy/approxkl_avg": 9.293371761032176e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1108996868133545, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999732375144958, "val/ratio_var": NaN }, { "episode": 143, "epoch": 0.026854460093896713, "eps": 0, "loss/policy_avg": 2.113378286594525e-05, "loss/value_avg": 0.49003854393959045, "lr": 2.574e-06, "objective/entropy": 102.00337219238281, "objective/kl": 17.38307762145996, "objective/non_score_reward": -0.8691538572311401, "objective/rlhf_reward": -12.835332870483398, "objective/scores": -11.966178894042969, "policy/approxkl_avg": 9.223958130633036e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.149409532546997, "step": 143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000249147415161, "val/ratio_var": NaN }, { "episode": 144, "epoch": 0.027042253521126762, "eps": 0, "loss/policy_avg": 1.6473373761982657e-05, "loss/value_avg": 0.5211838483810425, "lr": 2.571e-06, "objective/entropy": 101.25142669677734, "objective/kl": 9.927888870239258, "objective/non_score_reward": -0.49639445543289185, "objective/rlhf_reward": -12.50876522064209, "objective/scores": -12.012371063232422, "policy/approxkl_avg": 1.0580450293673493e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.093994140625, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999344944953918, "val/ratio_var": NaN }, { "episode": 145, "epoch": 0.027230046948356807, "eps": 0, "loss/policy_avg": 3.9622467738809064e-05, "loss/value_avg": 0.32654479146003723, "lr": 2.568e-06, "objective/entropy": 79.36514282226562, "objective/kl": 11.414214134216309, "objective/non_score_reward": -0.5707107186317444, "objective/rlhf_reward": -12.863065719604492, "objective/scores": -12.292354583740234, "policy/approxkl_avg": 6.110789740887412e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5827245712280273, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999371767044067, "val/ratio_var": NaN }, { "episode": 146, "epoch": 0.027417840375586856, "eps": 0, "loss/policy_avg": 5.7449880841886625e-05, "loss/value_avg": 0.3799317181110382, "lr": 2.565e-06, "objective/entropy": 84.4321517944336, "objective/kl": 10.287858963012695, "objective/non_score_reward": -0.5143929719924927, "objective/rlhf_reward": -11.362838745117188, "objective/scores": -10.848445892333984, "policy/approxkl_avg": 6.1643284254842e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6493312120437622, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999800324440002, "val/ratio_var": NaN }, { "episode": 147, "epoch": 0.0276056338028169, "eps": 0, "loss/policy_avg": 4.397698285174556e-05, "loss/value_avg": 0.5025385022163391, "lr": 2.562e-06, "objective/entropy": 88.90082550048828, "objective/kl": 12.11947250366211, "objective/non_score_reward": -0.6059736013412476, "objective/rlhf_reward": -11.174012184143066, "objective/scores": -10.568038940429688, "policy/approxkl_avg": 3.753954302965212e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5282673835754395, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999747276306152, "val/ratio_var": NaN }, { "episode": 148, "epoch": 0.02779342723004695, "eps": 0, "loss/policy_avg": -4.8601403250359e-05, "loss/value_avg": 0.5441368818283081, "lr": 2.559e-06, "objective/entropy": 62.89722442626953, "objective/kl": 6.861681938171387, "objective/non_score_reward": -0.34308409690856934, "objective/rlhf_reward": -10.223848342895508, "objective/scores": -9.88076400756836, "policy/approxkl_avg": 3.647550883556505e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4700720310211182, "step": 148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999822974205017, "val/ratio_var": NaN }, { "episode": 149, "epoch": 0.027981220657276994, "eps": 0, "loss/policy_avg": -7.028849358903244e-05, "loss/value_avg": 0.5014904141426086, "lr": 2.556e-06, "objective/entropy": 81.2103271484375, "objective/kl": 13.679666519165039, "objective/non_score_reward": -0.683983325958252, "objective/rlhf_reward": -12.146825790405273, "objective/scores": -11.46284294128418, "policy/approxkl_avg": 6.042296263331082e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.786685585975647, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000576972961426, "val/ratio_var": NaN }, { "episode": 150, "epoch": 0.028169014084507043, "eps": 0, "loss/policy_avg": 4.838997483602725e-05, "loss/value_avg": 0.4705970585346222, "lr": 2.553e-06, "objective/entropy": 107.8565673828125, "objective/kl": 5.467012405395508, "objective/non_score_reward": -0.27335065603256226, "objective/rlhf_reward": -10.378904342651367, "objective/scores": -10.10555362701416, "policy/approxkl_avg": 9.105790610419717e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.149034023284912, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000239610671997, "val/ratio_var": NaN }, { "episode": 151, "epoch": 0.028356807511737088, "eps": 0, "loss/policy_avg": 0.0001742704916978255, "loss/value_avg": 0.4962337017059326, "lr": 2.55e-06, "objective/entropy": 139.8780517578125, "objective/kl": 21.835477828979492, "objective/non_score_reward": -1.0917737483978271, "objective/rlhf_reward": -12.227535247802734, "objective/scores": -11.135761260986328, "policy/approxkl_avg": 1.700281302419171e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.054461717605591, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998593926429749, "val/ratio_var": NaN }, { "episode": 152, "epoch": 0.028544600938967137, "eps": 0, "loss/policy_avg": -4.7863654799584765e-06, "loss/value_avg": 0.5008350610733032, "lr": 2.547e-06, "objective/entropy": 117.76811218261719, "objective/kl": 7.899986743927002, "objective/non_score_reward": -0.39499935507774353, "objective/rlhf_reward": -11.090338706970215, "objective/scores": -10.69533920288086, "policy/approxkl_avg": 1.0630145652612555e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.225116014480591, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999963104724884, "val/ratio_var": NaN }, { "episode": 153, "epoch": 0.02873239436619718, "eps": 0, "loss/policy_avg": 4.4606767914956436e-05, "loss/value_avg": 0.2578524053096771, "lr": 2.544e-06, "objective/entropy": 77.37584686279297, "objective/kl": 5.40702486038208, "objective/non_score_reward": -0.27035123109817505, "objective/rlhf_reward": -11.04019546508789, "objective/scores": -10.769844055175781, "policy/approxkl_avg": 1.1717559544877076e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7036455869674683, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000116229057312, "val/ratio_var": NaN }, { "episode": 154, "epoch": 0.02892018779342723, "eps": 0, "loss/policy_avg": -3.9918242691783234e-05, "loss/value_avg": 0.4935897886753082, "lr": 2.541e-06, "objective/entropy": 52.102989196777344, "objective/kl": 11.296867370605469, "objective/non_score_reward": -0.5648434162139893, "objective/rlhf_reward": -10.433706283569336, "objective/scores": -9.868863105773926, "policy/approxkl_avg": 1.2005668281744875e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1618629693984985, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999423027038574, "val/ratio_var": NaN }, { "episode": 155, "epoch": 0.02910798122065728, "eps": 0, "loss/policy_avg": 4.235303640598431e-05, "loss/value_avg": 0.6854602098464966, "lr": 2.538e-06, "objective/entropy": 79.57754516601562, "objective/kl": 20.118396759033203, "objective/non_score_reward": -1.0059196949005127, "objective/rlhf_reward": -12.263788223266602, "objective/scores": -11.257868766784668, "policy/approxkl_avg": 9.399078493288471e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5552011728286743, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000457763671875, "val/ratio_var": NaN }, { "episode": 156, "epoch": 0.029295774647887324, "eps": 0, "loss/policy_avg": -3.132730125798844e-05, "loss/value_avg": 0.6214393973350525, "lr": 2.535e-06, "objective/entropy": 103.73768615722656, "objective/kl": 18.905658721923828, "objective/non_score_reward": -0.9452829360961914, "objective/rlhf_reward": -11.64207935333252, "objective/scores": -10.696796417236328, "policy/approxkl_avg": 1.0532741612223617e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.815913438796997, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001016855239868, "val/ratio_var": NaN }, { "episode": 157, "epoch": 0.029483568075117372, "eps": 0, "loss/policy_avg": -1.5519699445576407e-05, "loss/value_avg": 1.2252824306488037, "lr": 2.532e-06, "objective/entropy": 117.50518798828125, "objective/kl": 12.192819595336914, "objective/non_score_reward": -0.6096409559249878, "objective/rlhf_reward": -9.943497657775879, "objective/scores": -9.333856582641602, "policy/approxkl_avg": 1.6135527403093874e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3715498447418213, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999994695186615, "val/ratio_var": NaN }, { "episode": 158, "epoch": 0.029671361502347417, "eps": 0, "loss/policy_avg": 9.356804753224424e-07, "loss/value_avg": 1.270674705505371, "lr": 2.529e-06, "objective/entropy": 91.32841491699219, "objective/kl": 6.301640510559082, "objective/non_score_reward": -0.31508201360702515, "objective/rlhf_reward": -10.272037506103516, "objective/scores": -9.956955909729004, "policy/approxkl_avg": 1.1416885570270097e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8193082809448242, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000065803527832, "val/ratio_var": NaN }, { "episode": 159, "epoch": 0.029859154929577466, "eps": 0, "loss/policy_avg": -3.7350742786657065e-05, "loss/value_avg": 0.5029188394546509, "lr": 2.526e-06, "objective/entropy": 119.34776306152344, "objective/kl": 20.081939697265625, "objective/non_score_reward": -1.0040969848632812, "objective/rlhf_reward": -11.714045524597168, "objective/scores": -10.709948539733887, "policy/approxkl_avg": 1.0749170797907937e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0439937114715576, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998984336853027, "val/ratio_var": NaN }, { "episode": 160, "epoch": 0.03004694835680751, "eps": 0, "loss/policy_avg": -3.6455548979574814e-05, "loss/value_avg": 0.3158326745033264, "lr": 2.523e-06, "objective/entropy": 81.31778717041016, "objective/kl": 17.283281326293945, "objective/non_score_reward": -0.8641641736030579, "objective/rlhf_reward": -11.83645248413086, "objective/scores": -10.972288131713867, "policy/approxkl_avg": 8.716394717112053e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7671105861663818, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000023365020752, "val/ratio_var": NaN }, { "episode": 161, "epoch": 0.03023474178403756, "eps": 0, "loss/policy_avg": 1.079631329048425e-05, "loss/value_avg": 0.4205116629600525, "lr": 2.52e-06, "objective/entropy": 41.48210144042969, "objective/kl": 11.037649154663086, "objective/non_score_reward": -0.5518824458122253, "objective/rlhf_reward": -11.274378776550293, "objective/scores": -10.722496032714844, "policy/approxkl_avg": 2.0894550090133635e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.023078203201294, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999595880508423, "val/ratio_var": NaN }, { "episode": 162, "epoch": 0.030422535211267605, "eps": 0, "loss/policy_avg": 8.07654214440845e-05, "loss/value_avg": 0.20774205029010773, "lr": 2.5169999999999998e-06, "objective/entropy": 106.37580871582031, "objective/kl": 5.964994430541992, "objective/non_score_reward": -0.2982497215270996, "objective/rlhf_reward": -11.242666244506836, "objective/scores": -10.944416046142578, "policy/approxkl_avg": 9.275944989894924e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7211430072784424, "step": 162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000205039978027, "val/ratio_var": NaN }, { "episode": 163, "epoch": 0.030610328638497653, "eps": 0, "loss/policy_avg": 4.523205279838294e-05, "loss/value_avg": 0.2571594715118408, "lr": 2.514e-06, "objective/entropy": 69.75611877441406, "objective/kl": 16.653892517089844, "objective/non_score_reward": -0.8326945900917053, "objective/rlhf_reward": -11.636560440063477, "objective/scores": -10.803865432739258, "policy/approxkl_avg": 8.98295056117604e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.793643593788147, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999741315841675, "val/ratio_var": NaN }, { "episode": 164, "epoch": 0.0307981220657277, "eps": 0, "loss/policy_avg": 4.679751873482019e-05, "loss/value_avg": 0.2608196437358856, "lr": 2.5109999999999998e-06, "objective/entropy": 89.03092956542969, "objective/kl": 18.933250427246094, "objective/non_score_reward": -0.9466625452041626, "objective/rlhf_reward": -11.354288101196289, "objective/scores": -10.407625198364258, "policy/approxkl_avg": 8.356625613714641e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.735296368598938, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999965250492096, "val/ratio_var": NaN }, { "episode": 165, "epoch": 0.030985915492957747, "eps": 0, "loss/policy_avg": 3.6905395973008126e-05, "loss/value_avg": 0.5160859823226929, "lr": 2.508e-06, "objective/entropy": 91.0670394897461, "objective/kl": 9.926922798156738, "objective/non_score_reward": -0.496346116065979, "objective/rlhf_reward": -10.853309631347656, "objective/scores": -10.356963157653809, "policy/approxkl_avg": 8.124883521531956e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9378316402435303, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999238848686218, "val/ratio_var": NaN }, { "episode": 166, "epoch": 0.031173708920187792, "eps": 0, "loss/policy_avg": 1.465599507355364e-05, "loss/value_avg": 0.285050630569458, "lr": 2.505e-06, "objective/entropy": 123.07917022705078, "objective/kl": 7.512008190155029, "objective/non_score_reward": -0.3756003975868225, "objective/rlhf_reward": -10.646194458007812, "objective/scores": -10.270593643188477, "policy/approxkl_avg": 1.4006050719217455e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1916112899780273, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999733567237854, "val/ratio_var": NaN }, { "episode": 167, "epoch": 0.03136150234741784, "eps": 0, "loss/policy_avg": -6.663574458798394e-05, "loss/value_avg": 0.2545795440673828, "lr": 2.502e-06, "objective/entropy": 95.22453308105469, "objective/kl": 8.46187686920166, "objective/non_score_reward": -0.4230938255786896, "objective/rlhf_reward": -10.709874153137207, "objective/scores": -10.28678035736084, "policy/approxkl_avg": 4.5209198162865505e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0069074630737305, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999004006385803, "val/ratio_var": NaN }, { "episode": 168, "epoch": 0.031549295774647886, "eps": 0, "loss/policy_avg": 8.455762144876644e-05, "loss/value_avg": 0.23792417347431183, "lr": 2.499e-06, "objective/entropy": 113.9044189453125, "objective/kl": 9.622343063354492, "objective/non_score_reward": -0.4811171591281891, "objective/rlhf_reward": -11.503866195678711, "objective/scores": -11.022748947143555, "policy/approxkl_avg": 9.294981140328673e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.072283983230591, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999533891677856, "val/ratio_var": NaN }, { "episode": 169, "epoch": 0.031737089201877934, "eps": 0, "loss/policy_avg": 5.831808448419906e-05, "loss/value_avg": 0.2808913588523865, "lr": 2.496e-06, "objective/entropy": 97.74860382080078, "objective/kl": 20.79623794555664, "objective/non_score_reward": -1.0398119688034058, "objective/rlhf_reward": -10.90333366394043, "objective/scores": -9.863521575927734, "policy/approxkl_avg": 8.837814391426946e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6838033199310303, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999719262123108, "val/ratio_var": NaN }, { "episode": 170, "epoch": 0.03192488262910798, "eps": 0, "loss/policy_avg": -3.3837444789242e-05, "loss/value_avg": 0.2741740047931671, "lr": 2.493e-06, "objective/entropy": 111.56925964355469, "objective/kl": 26.643177032470703, "objective/non_score_reward": -1.3321589231491089, "objective/rlhf_reward": -11.472410202026367, "objective/scores": -10.140251159667969, "policy/approxkl_avg": 9.966635872160623e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9328912496566772, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999904632568359, "val/ratio_var": NaN }, { "episode": 171, "epoch": 0.03211267605633803, "eps": 0, "loss/policy_avg": 8.812490705167875e-06, "loss/value_avg": 0.23669397830963135, "lr": 2.49e-06, "objective/entropy": 66.6216812133789, "objective/kl": 22.636714935302734, "objective/non_score_reward": -1.1318358182907104, "objective/rlhf_reward": -11.255050659179688, "objective/scores": -10.123214721679688, "policy/approxkl_avg": 8.616957813956105e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.344844102859497, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000346899032593, "val/ratio_var": NaN }, { "episode": 172, "epoch": 0.03230046948356807, "eps": 0, "loss/policy_avg": 4.628919214155758e-06, "loss/value_avg": 1.1704574823379517, "lr": 2.487e-06, "objective/entropy": 53.32468032836914, "objective/kl": 11.545422554016113, "objective/non_score_reward": -0.5772712230682373, "objective/rlhf_reward": -8.372638702392578, "objective/scores": -7.795367240905762, "policy/approxkl_avg": 4.61604940937832e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1824352741241455, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000216960906982, "val/ratio_var": NaN }, { "episode": 173, "epoch": 0.03248826291079812, "eps": 0, "loss/policy_avg": 4.487667320063338e-05, "loss/value_avg": 0.35960134863853455, "lr": 2.484e-06, "objective/entropy": 125.90808868408203, "objective/kl": 5.764248847961426, "objective/non_score_reward": -0.28821244835853577, "objective/rlhf_reward": -10.638680458068848, "objective/scores": -10.350467681884766, "policy/approxkl_avg": 2.3692450668022502e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.208346366882324, "step": 173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998576641082764, "val/ratio_var": NaN }, { "episode": 174, "epoch": 0.03267605633802817, "eps": 0, "loss/policy_avg": -2.821436464728322e-05, "loss/value_avg": 0.25455424189567566, "lr": 2.481e-06, "objective/entropy": 81.20153045654297, "objective/kl": 21.19855499267578, "objective/non_score_reward": -1.0599279403686523, "objective/rlhf_reward": -12.312171936035156, "objective/scores": -11.252243995666504, "policy/approxkl_avg": 5.99823692937207e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6064361333847046, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000265836715698, "val/ratio_var": NaN }, { "episode": 175, "epoch": 0.03286384976525822, "eps": 0, "loss/policy_avg": -8.3806378825102e-05, "loss/value_avg": 0.5716187357902527, "lr": 2.478e-06, "objective/entropy": 101.63897705078125, "objective/kl": 22.03339195251465, "objective/non_score_reward": -1.1016695499420166, "objective/rlhf_reward": -13.49070930480957, "objective/scores": -12.389039993286133, "policy/approxkl_avg": 9.973262393714322e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9440112113952637, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999741315841675, "val/ratio_var": NaN }, { "episode": 176, "epoch": 0.03305164319248826, "eps": 0, "loss/policy_avg": 3.8731774111511186e-05, "loss/value_avg": 0.32569989562034607, "lr": 2.475e-06, "objective/entropy": 87.48081970214844, "objective/kl": 15.715847969055176, "objective/non_score_reward": -0.7857924699783325, "objective/rlhf_reward": -12.10960865020752, "objective/scores": -11.323816299438477, "policy/approxkl_avg": 5.9448616696045065e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7172943353652954, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000892877578735, "val/ratio_var": NaN }, { "episode": 177, "epoch": 0.03323943661971831, "eps": 0, "loss/policy_avg": -5.587092164205387e-05, "loss/value_avg": 0.25147557258605957, "lr": 2.472e-06, "objective/entropy": 88.9833984375, "objective/kl": 13.066854476928711, "objective/non_score_reward": -0.6533426642417908, "objective/rlhf_reward": -12.021493911743164, "objective/scores": -11.368151664733887, "policy/approxkl_avg": 1.0101440039989029e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6449273824691772, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000028133392334, "val/ratio_var": NaN }, { "episode": 178, "epoch": 0.03342723004694836, "eps": 0, "loss/policy_avg": -1.3418917660601437e-05, "loss/value_avg": 0.3157516419887543, "lr": 2.469e-06, "objective/entropy": 84.34383392333984, "objective/kl": 24.47296905517578, "objective/non_score_reward": -1.2236485481262207, "objective/rlhf_reward": -12.318042755126953, "objective/scores": -11.094393730163574, "policy/approxkl_avg": 5.595708429950719e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7419594526290894, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000680685043335, "val/ratio_var": NaN }, { "episode": 179, "epoch": 0.033615023474178406, "eps": 0, "loss/policy_avg": 9.220500942319632e-05, "loss/value_avg": 0.260237455368042, "lr": 2.4659999999999998e-06, "objective/entropy": 149.8551788330078, "objective/kl": 10.21229362487793, "objective/non_score_reward": -0.5106146931648254, "objective/rlhf_reward": -12.349052429199219, "objective/scores": -11.838438034057617, "policy/approxkl_avg": 1.3233780293830932e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0702550411224365, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000085711479187, "val/ratio_var": NaN }, { "episode": 180, "epoch": 0.03380281690140845, "eps": 0, "loss/policy_avg": -2.3203076125355437e-05, "loss/value_avg": 0.19812393188476562, "lr": 2.463e-06, "objective/entropy": 99.29838562011719, "objective/kl": 15.894245147705078, "objective/non_score_reward": -0.7947123050689697, "objective/rlhf_reward": -12.15432357788086, "objective/scores": -11.359611511230469, "policy/approxkl_avg": 9.466057093732161e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8033884763717651, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999558925628662, "val/ratio_var": NaN }, { "episode": 181, "epoch": 0.033990610328638496, "eps": 0, "loss/policy_avg": -5.6284778111148626e-05, "loss/value_avg": 0.1963600367307663, "lr": 2.4599999999999997e-06, "objective/entropy": 77.6550521850586, "objective/kl": 7.0415873527526855, "objective/non_score_reward": -0.3520793318748474, "objective/rlhf_reward": -11.0802640914917, "objective/scores": -10.728184700012207, "policy/approxkl_avg": 3.350607613583634e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5506107807159424, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000156164169312, "val/ratio_var": NaN }, { "episode": 182, "epoch": 0.034178403755868544, "eps": 0, "loss/policy_avg": -9.636609320295975e-05, "loss/value_avg": 0.10419384390115738, "lr": 2.457e-06, "objective/entropy": 69.07745361328125, "objective/kl": 16.78192901611328, "objective/non_score_reward": -0.8390965461730957, "objective/rlhf_reward": -11.380563735961914, "objective/scores": -10.541467666625977, "policy/approxkl_avg": 4.909178485945631e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7402712106704712, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999973773956299, "val/ratio_var": NaN }, { "episode": 183, "epoch": 0.03436619718309859, "eps": 0, "loss/policy_avg": 1.1965913699896191e-06, "loss/value_avg": 0.2544725835323334, "lr": 2.4539999999999997e-06, "objective/entropy": 86.31712341308594, "objective/kl": 10.937726020812988, "objective/non_score_reward": -0.5468862652778625, "objective/rlhf_reward": -11.2022705078125, "objective/scores": -10.655384063720703, "policy/approxkl_avg": 9.063415973287192e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.010369062423706, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000040888786316, "val/ratio_var": NaN }, { "episode": 184, "epoch": 0.03455399061032864, "eps": 0, "loss/policy_avg": -7.516032928833738e-05, "loss/value_avg": 0.15757262706756592, "lr": 2.451e-06, "objective/entropy": 66.4777603149414, "objective/kl": 12.048249244689941, "objective/non_score_reward": -0.6024124622344971, "objective/rlhf_reward": -11.863114356994629, "objective/scores": -11.260702133178711, "policy/approxkl_avg": 1.0686941465110067e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3471126556396484, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000919103622437, "val/ratio_var": NaN }, { "episode": 185, "epoch": 0.03474178403755868, "eps": 0, "loss/policy_avg": 9.014921670313925e-05, "loss/value_avg": 0.4251910448074341, "lr": 2.448e-06, "objective/entropy": 108.03091430664062, "objective/kl": 14.342548370361328, "objective/non_score_reward": -0.7171273827552795, "objective/rlhf_reward": -11.253255844116211, "objective/scores": -10.536128044128418, "policy/approxkl_avg": 1.4195691733220883e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3938841819763184, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999908804893494, "val/ratio_var": NaN }, { "episode": 186, "epoch": 0.03492957746478873, "eps": 0, "loss/policy_avg": -3.456619378994219e-05, "loss/value_avg": 0.3956950604915619, "lr": 2.445e-06, "objective/entropy": 44.07251739501953, "objective/kl": 6.407146453857422, "objective/non_score_reward": -0.3203573226928711, "objective/rlhf_reward": -10.328758239746094, "objective/scores": -10.008400917053223, "policy/approxkl_avg": 6.459643486778077e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2259222269058228, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999728202819824, "val/ratio_var": NaN }, { "episode": 187, "epoch": 0.03511737089201878, "eps": 0, "loss/policy_avg": -1.0854792890313547e-05, "loss/value_avg": 0.13507291674613953, "lr": 2.442e-06, "objective/entropy": 71.40792846679688, "objective/kl": 13.959936141967773, "objective/non_score_reward": -0.6979968547821045, "objective/rlhf_reward": -11.937403678894043, "objective/scores": -11.23940658569336, "policy/approxkl_avg": 4.1771581749117104e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3423219919204712, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000008225440979, "val/ratio_var": NaN }, { "episode": 188, "epoch": 0.03530516431924883, "eps": 0, "loss/policy_avg": 2.2849946617498063e-05, "loss/value_avg": 0.2702501118183136, "lr": 2.439e-06, "objective/entropy": 117.69695281982422, "objective/kl": 15.945779800415039, "objective/non_score_reward": -0.7972890138626099, "objective/rlhf_reward": -11.48303508758545, "objective/scores": -10.685746192932129, "policy/approxkl_avg": 8.47770422751637e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.805853009223938, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998875260353088, "val/ratio_var": NaN }, { "episode": 189, "epoch": 0.03549295774647887, "eps": 0, "loss/policy_avg": 0.00010737383126979694, "loss/value_avg": 0.22182248532772064, "lr": 2.436e-06, "objective/entropy": 83.2739486694336, "objective/kl": 20.391347885131836, "objective/non_score_reward": -1.0195674896240234, "objective/rlhf_reward": -12.064057350158691, "objective/scores": -11.044489860534668, "policy/approxkl_avg": 8.143941698790513e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6520121097564697, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999412894248962, "val/ratio_var": NaN }, { "episode": 190, "epoch": 0.03568075117370892, "eps": 0, "loss/policy_avg": 8.751311543164775e-05, "loss/value_avg": 0.8453420400619507, "lr": 2.4330000000000003e-06, "objective/entropy": 94.20062255859375, "objective/kl": 4.954801559448242, "objective/non_score_reward": -0.24774010479450226, "objective/rlhf_reward": -9.86809253692627, "objective/scores": -9.620352745056152, "policy/approxkl_avg": 9.7561397183199e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4554604291915894, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000576972961426, "val/ratio_var": NaN }, { "episode": 191, "epoch": 0.03586854460093897, "eps": 0, "loss/policy_avg": 4.495314715313725e-05, "loss/value_avg": 0.30768322944641113, "lr": 2.43e-06, "objective/entropy": 80.4485092163086, "objective/kl": 18.293027877807617, "objective/non_score_reward": -0.9146513938903809, "objective/rlhf_reward": -12.301811218261719, "objective/scores": -11.38715934753418, "policy/approxkl_avg": 9.583308724359085e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.884334921836853, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000009298324585, "val/ratio_var": NaN }, { "episode": 192, "epoch": 0.036056338028169016, "eps": 0, "loss/policy_avg": 0.00010996269702445716, "loss/value_avg": 0.4747096300125122, "lr": 2.4270000000000002e-06, "objective/entropy": 113.67166137695312, "objective/kl": 21.293903350830078, "objective/non_score_reward": -1.0646952390670776, "objective/rlhf_reward": -10.878660202026367, "objective/scores": -9.81396484375, "policy/approxkl_avg": 1.318167193176123e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.865176796913147, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999591112136841, "val/ratio_var": NaN }, { "episode": 193, "epoch": 0.03624413145539906, "eps": 0, "loss/policy_avg": -9.968595804821234e-06, "loss/value_avg": 0.3488837778568268, "lr": 2.4240000000000004e-06, "objective/entropy": 95.339599609375, "objective/kl": 8.468063354492188, "objective/non_score_reward": -0.42340314388275146, "objective/rlhf_reward": -13.19472599029541, "objective/scores": -12.771323204040527, "policy/approxkl_avg": 1.04599187977783e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.753431797027588, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999201893806458, "val/ratio_var": NaN }, { "episode": 194, "epoch": 0.036431924882629106, "eps": 0, "loss/policy_avg": 2.9284998163348064e-05, "loss/value_avg": 0.33273348212242126, "lr": 2.421e-06, "objective/entropy": 114.15370178222656, "objective/kl": 21.104402542114258, "objective/non_score_reward": -1.055220127105713, "objective/rlhf_reward": -11.52410888671875, "objective/scores": -10.468889236450195, "policy/approxkl_avg": 7.900181486775182e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.028170585632324, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.99998939037323, "val/ratio_var": NaN }, { "episode": 195, "epoch": 0.036619718309859155, "eps": 0, "loss/policy_avg": 6.29425048828125e-05, "loss/value_avg": 0.25035297870635986, "lr": 2.4180000000000004e-06, "objective/entropy": 79.88825225830078, "objective/kl": 18.592693328857422, "objective/non_score_reward": -0.929634690284729, "objective/rlhf_reward": -12.065760612487793, "objective/scores": -11.136125564575195, "policy/approxkl_avg": 7.327733442252793e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.774593710899353, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999819993972778, "val/ratio_var": NaN }, { "episode": 196, "epoch": 0.0368075117370892, "eps": 0, "loss/policy_avg": -1.7804919480113313e-05, "loss/value_avg": 0.3321259617805481, "lr": 2.415e-06, "objective/entropy": 103.87841796875, "objective/kl": 18.583003997802734, "objective/non_score_reward": -0.9291501641273499, "objective/rlhf_reward": -11.325934410095215, "objective/scores": -10.396783828735352, "policy/approxkl_avg": 8.878829760305962e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.115126132965088, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999415278434753, "val/ratio_var": NaN }, { "episode": 197, "epoch": 0.03699530516431925, "eps": 0, "loss/policy_avg": 0.00010492216824786738, "loss/value_avg": 0.43154749274253845, "lr": 2.4120000000000004e-06, "objective/entropy": 87.95869445800781, "objective/kl": 10.851791381835938, "objective/non_score_reward": -0.5425896644592285, "objective/rlhf_reward": -10.535955429077148, "objective/scores": -9.993366241455078, "policy/approxkl_avg": 1.694311748678956e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4357702732086182, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999679327011108, "val/ratio_var": NaN }, { "episode": 198, "epoch": 0.03718309859154929, "eps": 0, "loss/policy_avg": -2.4822522391332313e-05, "loss/value_avg": 0.3976421356201172, "lr": 2.409e-06, "objective/entropy": 126.4654541015625, "objective/kl": 12.036211013793945, "objective/non_score_reward": -0.6018105149269104, "objective/rlhf_reward": -11.012462615966797, "objective/scores": -10.410652160644531, "policy/approxkl_avg": 2.428053278435982e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.345594882965088, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9997303485870361, "val/ratio_var": NaN }, { "episode": 199, "epoch": 0.03737089201877934, "eps": 0, "loss/policy_avg": -1.7721697076922283e-05, "loss/value_avg": 0.21694175899028778, "lr": 2.4060000000000003e-06, "objective/entropy": 51.68236541748047, "objective/kl": 16.759708404541016, "objective/non_score_reward": -0.8379853367805481, "objective/rlhf_reward": -11.375014305114746, "objective/scores": -10.537029266357422, "policy/approxkl_avg": 4.265550046511635e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1591520309448242, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000029802322388, "val/ratio_var": NaN }, { "episode": 200, "epoch": 0.03755868544600939, "eps": 0, "loss/policy_avg": -8.4301209426485e-05, "loss/value_avg": 0.44758662581443787, "lr": 2.403e-06, "objective/entropy": 138.48028564453125, "objective/kl": 13.40565299987793, "objective/non_score_reward": -0.6702826619148254, "objective/rlhf_reward": -11.21584701538086, "objective/scores": -10.545564651489258, "policy/approxkl_avg": 8.868831002928346e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3500077724456787, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000184774398804, "val/ratio_var": NaN }, { "episode": 201, "epoch": 0.03774647887323944, "eps": 0, "loss/policy_avg": -2.4530123482691124e-05, "loss/value_avg": 0.24215291440486908, "lr": 2.4000000000000003e-06, "objective/entropy": 64.27466583251953, "objective/kl": 14.186724662780762, "objective/non_score_reward": -0.7093362808227539, "objective/rlhf_reward": -11.117931365966797, "objective/scores": -10.408595085144043, "policy/approxkl_avg": 5.875983077885394e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4601843357086182, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000020146369934, "val/ratio_var": NaN }, { "episode": 202, "epoch": 0.03793427230046948, "eps": 0, "loss/policy_avg": -3.197058322257362e-05, "loss/value_avg": 0.267713338136673, "lr": 2.397e-06, "objective/entropy": 96.88801574707031, "objective/kl": 20.77935791015625, "objective/non_score_reward": -1.0389679670333862, "objective/rlhf_reward": -11.91568660736084, "objective/scores": -10.876718521118164, "policy/approxkl_avg": 8.486077973657302e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8171502351760864, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000485181808472, "val/ratio_var": NaN }, { "episode": 203, "epoch": 0.03812206572769953, "eps": 0, "loss/policy_avg": 6.558310269610956e-05, "loss/value_avg": 0.29992732405662537, "lr": 2.3940000000000003e-06, "objective/entropy": 57.112266540527344, "objective/kl": 8.889095306396484, "objective/non_score_reward": -0.44445475935935974, "objective/rlhf_reward": -10.641634941101074, "objective/scores": -10.197179794311523, "policy/approxkl_avg": 3.848199980893696e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1120444536209106, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999902248382568, "val/ratio_var": NaN }, { "episode": 204, "epoch": 0.03830985915492958, "eps": 0, "loss/policy_avg": -4.12779045291245e-05, "loss/value_avg": 0.21914097666740417, "lr": 2.391e-06, "objective/entropy": 81.79727935791016, "objective/kl": 17.890493392944336, "objective/non_score_reward": -0.8945246934890747, "objective/rlhf_reward": -12.133591651916504, "objective/scores": -11.239067077636719, "policy/approxkl_avg": 5.266126734682075e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2558109760284424, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000686645507812, "val/ratio_var": NaN }, { "episode": 205, "epoch": 0.038497652582159626, "eps": 0, "loss/policy_avg": -0.0001270951033802703, "loss/value_avg": 0.25641047954559326, "lr": 2.3880000000000003e-06, "objective/entropy": 85.21084594726562, "objective/kl": 7.048683166503906, "objective/non_score_reward": -0.3524341583251953, "objective/rlhf_reward": -10.942026138305664, "objective/scores": -10.589591979980469, "policy/approxkl_avg": 9.143791857013639e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7024133205413818, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999930202960968, "val/ratio_var": NaN }, { "episode": 206, "epoch": 0.03868544600938967, "eps": 0, "loss/policy_avg": -6.572255642822711e-06, "loss/value_avg": 1.1334062814712524, "lr": 2.385e-06, "objective/entropy": 123.40298461914062, "objective/kl": 10.327808380126953, "objective/non_score_reward": -0.5163904428482056, "objective/rlhf_reward": -10.425338745117188, "objective/scores": -9.908947944641113, "policy/approxkl_avg": 1.1904932506467958e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4626810550689697, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001170635223389, "val/ratio_var": NaN }, { "episode": 207, "epoch": 0.038873239436619716, "eps": 0, "loss/policy_avg": 3.0256667741923593e-05, "loss/value_avg": 0.20807111263275146, "lr": 2.3820000000000002e-06, "objective/entropy": 150.4099884033203, "objective/kl": 12.508431434631348, "objective/non_score_reward": -0.6254215836524963, "objective/rlhf_reward": -11.83718204498291, "objective/scores": -11.211760520935059, "policy/approxkl_avg": 9.050373961372316e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1404247283935547, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998810291290283, "val/ratio_var": NaN }, { "episode": 208, "epoch": 0.039061032863849765, "eps": 0, "loss/policy_avg": 1.5483712559216656e-05, "loss/value_avg": 0.32125982642173767, "lr": 2.379e-06, "objective/entropy": 102.1695556640625, "objective/kl": 11.446231842041016, "objective/non_score_reward": -0.572311520576477, "objective/rlhf_reward": -10.806981086730957, "objective/scores": -10.23466968536377, "policy/approxkl_avg": 1.1251151477154053e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.067758321762085, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000070333480835, "val/ratio_var": NaN }, { "episode": 209, "epoch": 0.039248826291079814, "eps": 0, "loss/policy_avg": 5.540757774724625e-05, "loss/value_avg": 0.28024765849113464, "lr": 2.376e-06, "objective/entropy": 118.48003387451172, "objective/kl": 17.216121673583984, "objective/non_score_reward": -0.8608059883117676, "objective/rlhf_reward": -11.38800048828125, "objective/scores": -10.52719497680664, "policy/approxkl_avg": 1.1787907538973741e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8451480865478516, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999936044216156, "val/ratio_var": NaN }, { "episode": 210, "epoch": 0.03943661971830986, "eps": 0, "loss/policy_avg": 3.418832420720719e-06, "loss/value_avg": 0.4915725290775299, "lr": 2.373e-06, "objective/entropy": 130.6443634033203, "objective/kl": 21.8004150390625, "objective/non_score_reward": -1.090020775794983, "objective/rlhf_reward": -10.801827430725098, "objective/scores": -9.711806297302246, "policy/approxkl_avg": 1.1520631204575693e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.249769687652588, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000000238418579, "val/ratio_var": NaN }, { "episode": 211, "epoch": 0.039624413145539904, "eps": 0, "loss/policy_avg": -2.6003370294347405e-05, "loss/value_avg": 0.19709666073322296, "lr": 2.37e-06, "objective/entropy": 115.11572265625, "objective/kl": 10.282585144042969, "objective/non_score_reward": -0.5141292810440063, "objective/rlhf_reward": -11.110494613647461, "objective/scores": -10.596364974975586, "policy/approxkl_avg": 8.948868668312571e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7177457809448242, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000617504119873, "val/ratio_var": NaN }, { "episode": 212, "epoch": 0.03981220657276995, "eps": 0, "loss/policy_avg": 6.822145223850384e-05, "loss/value_avg": 0.20059999823570251, "lr": 2.367e-06, "objective/entropy": 123.00929260253906, "objective/kl": 21.327186584472656, "objective/non_score_reward": -1.066359281539917, "objective/rlhf_reward": -12.835022926330566, "objective/scores": -11.76866340637207, "policy/approxkl_avg": 8.132240481018016e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2189066410064697, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000076293945312, "val/ratio_var": NaN }, { "episode": 213, "epoch": 0.04, "eps": 0, "loss/policy_avg": 9.337011579191312e-05, "loss/value_avg": 0.3886176645755768, "lr": 2.364e-06, "objective/entropy": 130.47195434570312, "objective/kl": 37.67643737792969, "objective/non_score_reward": -1.883821964263916, "objective/rlhf_reward": -11.980201721191406, "objective/scores": -10.096379280090332, "policy/approxkl_avg": 8.014371388753716e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6772069931030273, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000227689743042, "val/ratio_var": NaN }, { "episode": 214, "epoch": 0.04018779342723005, "eps": 0, "loss/policy_avg": 6.907840725034475e-05, "loss/value_avg": 0.2940538823604584, "lr": 2.3610000000000003e-06, "objective/entropy": 107.00993347167969, "objective/kl": 17.424131393432617, "objective/non_score_reward": -0.8712066411972046, "objective/rlhf_reward": -12.384641647338867, "objective/scores": -11.513435363769531, "policy/approxkl_avg": 8.966112829966733e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.824343204498291, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000065565109253, "val/ratio_var": NaN }, { "episode": 215, "epoch": 0.04037558685446009, "eps": 0, "loss/policy_avg": 1.1030233508790843e-05, "loss/value_avg": 0.6989230513572693, "lr": 2.358e-06, "objective/entropy": 106.13154602050781, "objective/kl": 9.375249862670898, "objective/non_score_reward": -0.46876251697540283, "objective/rlhf_reward": -12.14388370513916, "objective/scores": -11.675121307373047, "policy/approxkl_avg": 1.0402751371429986e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0837035179138184, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000124216079712, "val/ratio_var": NaN }, { "episode": 216, "epoch": 0.04056338028169014, "eps": 0, "loss/policy_avg": 1.9964181774412282e-05, "loss/value_avg": 0.21609878540039062, "lr": 2.3550000000000003e-06, "objective/entropy": 112.9332275390625, "objective/kl": 18.597509384155273, "objective/non_score_reward": -0.9298754930496216, "objective/rlhf_reward": -11.495927810668945, "objective/scores": -10.566052436828613, "policy/approxkl_avg": 7.038781291157648e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3401777744293213, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000677108764648, "val/ratio_var": NaN }, { "episode": 217, "epoch": 0.04075117370892019, "eps": 0, "loss/policy_avg": -6.63793325657025e-05, "loss/value_avg": 0.1201852336525917, "lr": 2.352e-06, "objective/entropy": 56.785179138183594, "objective/kl": 4.989446640014648, "objective/non_score_reward": -0.24947234988212585, "objective/rlhf_reward": -11.345636367797852, "objective/scores": -11.096163749694824, "policy/approxkl_avg": 9.795254385380758e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.377858281135559, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000070333480835, "val/ratio_var": NaN }, { "episode": 218, "epoch": 0.04093896713615024, "eps": 0, "loss/policy_avg": 6.978017336223274e-05, "loss/value_avg": 0.26380619406700134, "lr": 2.3490000000000003e-06, "objective/entropy": 130.68812561035156, "objective/kl": 17.12250518798828, "objective/non_score_reward": -0.8561253547668457, "objective/rlhf_reward": -11.659444808959961, "objective/scores": -10.803318977355957, "policy/approxkl_avg": 9.377527021570131e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3730976581573486, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999970018863678, "val/ratio_var": NaN }, { "episode": 219, "epoch": 0.04112676056338028, "eps": 0, "loss/policy_avg": 5.5189400882227346e-05, "loss/value_avg": 0.1482914239168167, "lr": 2.346e-06, "objective/entropy": 113.89747619628906, "objective/kl": 14.709491729736328, "objective/non_score_reward": -0.7354745864868164, "objective/rlhf_reward": -12.224302291870117, "objective/scores": -11.4888277053833, "policy/approxkl_avg": 1.657247850062049e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9210320711135864, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999475479125977, "val/ratio_var": NaN }, { "episode": 220, "epoch": 0.04131455399061033, "eps": 0, "loss/policy_avg": -5.541657446883619e-05, "loss/value_avg": 0.21821658313274384, "lr": 2.3430000000000003e-06, "objective/entropy": 95.5645980834961, "objective/kl": 28.75519561767578, "objective/non_score_reward": -1.4377598762512207, "objective/rlhf_reward": -12.251487731933594, "objective/scores": -10.813727378845215, "policy/approxkl_avg": 6.373601024733944e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.033937931060791, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000313520431519, "val/ratio_var": NaN }, { "episode": 221, "epoch": 0.041502347417840375, "eps": 0, "loss/policy_avg": -3.336960435262881e-05, "loss/value_avg": 0.4078538715839386, "lr": 2.34e-06, "objective/entropy": 74.2306137084961, "objective/kl": 17.681331634521484, "objective/non_score_reward": -0.8840665817260742, "objective/rlhf_reward": -11.664179801940918, "objective/scores": -10.780113220214844, "policy/approxkl_avg": 4.445208645620369e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4517062902450562, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999540448188782, "val/ratio_var": NaN }, { "episode": 222, "epoch": 0.041690140845070424, "eps": 0, "loss/policy_avg": 7.992870087036863e-05, "loss/value_avg": 0.49588897824287415, "lr": 2.3370000000000002e-06, "objective/entropy": 106.88037109375, "objective/kl": 25.799182891845703, "objective/non_score_reward": -1.289959192276001, "objective/rlhf_reward": -11.718819618225098, "objective/scores": -10.428860664367676, "policy/approxkl_avg": 1.146089445569487e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1644701957702637, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000094175338745, "val/ratio_var": NaN }, { "episode": 223, "epoch": 0.04187793427230047, "eps": 0, "loss/policy_avg": -5.47013187315315e-05, "loss/value_avg": 0.2867770195007324, "lr": 2.334e-06, "objective/entropy": 77.8486328125, "objective/kl": 22.622982025146484, "objective/non_score_reward": -1.131149172782898, "objective/rlhf_reward": -12.568194389343262, "objective/scores": -11.437045097351074, "policy/approxkl_avg": 5.80008787665065e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5359578132629395, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999319314956665, "val/ratio_var": NaN }, { "episode": 224, "epoch": 0.042065727699530514, "eps": 0, "loss/policy_avg": 6.37139892205596e-05, "loss/value_avg": 0.3020741045475006, "lr": 2.3310000000000002e-06, "objective/entropy": 79.0317153930664, "objective/kl": 20.611434936523438, "objective/non_score_reward": -1.030571699142456, "objective/rlhf_reward": -12.943833351135254, "objective/scores": -11.913261413574219, "policy/approxkl_avg": 7.6631799572624e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5179259777069092, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000080943107605, "val/ratio_var": NaN }, { "episode": 225, "epoch": 0.04225352112676056, "eps": 0, "loss/policy_avg": -9.793155186343938e-06, "loss/value_avg": 0.15448841452598572, "lr": 2.328e-06, "objective/entropy": 109.014404296875, "objective/kl": 19.970149993896484, "objective/non_score_reward": -0.998507559299469, "objective/rlhf_reward": -12.40622615814209, "objective/scores": -11.407718658447266, "policy/approxkl_avg": 1.2230638901655766e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.874942421913147, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998888969421387, "val/ratio_var": NaN }, { "episode": 226, "epoch": 0.04244131455399061, "eps": 0, "loss/policy_avg": -2.8875638236058876e-05, "loss/value_avg": 0.49689802527427673, "lr": 2.325e-06, "objective/entropy": 79.0527572631836, "objective/kl": 4.965512275695801, "objective/non_score_reward": -0.24827557802200317, "objective/rlhf_reward": -9.516345977783203, "objective/scores": -9.268070220947266, "policy/approxkl_avg": 8.631348435983455e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4039974212646484, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000406503677368, "val/ratio_var": NaN }, { "episode": 227, "epoch": 0.04262910798122066, "eps": 0, "loss/policy_avg": -4.433685899130069e-05, "loss/value_avg": 0.1749654859304428, "lr": 2.322e-06, "objective/entropy": 99.55097961425781, "objective/kl": 16.34319496154785, "objective/non_score_reward": -0.8171596527099609, "objective/rlhf_reward": -11.67340087890625, "objective/scores": -10.856241226196289, "policy/approxkl_avg": 9.028423875179215e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.696349024772644, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000232458114624, "val/ratio_var": NaN }, { "episode": 228, "epoch": 0.0428169014084507, "eps": 0, "loss/policy_avg": 4.1943676478695124e-05, "loss/value_avg": 0.36977070569992065, "lr": 2.319e-06, "objective/entropy": 109.76190185546875, "objective/kl": 15.187460899353027, "objective/non_score_reward": -0.7593730688095093, "objective/rlhf_reward": -10.746458053588867, "objective/scores": -9.987085342407227, "policy/approxkl_avg": 7.631552279008247e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0932388305664062, "step": 228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000035762786865, "val/ratio_var": NaN }, { "episode": 229, "epoch": 0.04300469483568075, "eps": 0, "loss/policy_avg": -7.899302545411047e-06, "loss/value_avg": 0.5703476071357727, "lr": 2.316e-06, "objective/entropy": 119.81922149658203, "objective/kl": 14.56661605834961, "objective/non_score_reward": -0.7283308506011963, "objective/rlhf_reward": -11.410261154174805, "objective/scores": -10.681930541992188, "policy/approxkl_avg": 1.3208320126523176e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2175891399383545, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999672770500183, "val/ratio_var": NaN }, { "episode": 230, "epoch": 0.0431924882629108, "eps": 0, "loss/policy_avg": -0.00012744597916025668, "loss/value_avg": 0.4021327495574951, "lr": 2.313e-06, "objective/entropy": 121.71968078613281, "objective/kl": 22.24272918701172, "objective/non_score_reward": -1.1121364831924438, "objective/rlhf_reward": -12.933022499084473, "objective/scores": -11.82088565826416, "policy/approxkl_avg": 1.2017113704132498e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8723697662353516, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999989867210388, "val/ratio_var": NaN }, { "episode": 231, "epoch": 0.04338028169014085, "eps": 0, "loss/policy_avg": -2.7341662644175813e-05, "loss/value_avg": 7.397233963012695, "lr": 2.31e-06, "objective/entropy": 85.66588592529297, "objective/kl": 21.503488540649414, "objective/non_score_reward": -1.0751745700836182, "objective/rlhf_reward": -12.519591331481934, "objective/scores": -11.444416999816895, "policy/approxkl_avg": 7.479543739918881e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.567002773284912, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999743700027466, "val/ratio_var": NaN }, { "episode": 232, "epoch": 0.04356807511737089, "eps": 0, "loss/policy_avg": 3.4300785046070814e-05, "loss/value_avg": 0.2698543965816498, "lr": 2.307e-06, "objective/entropy": 56.5830078125, "objective/kl": 6.878105640411377, "objective/non_score_reward": -0.3439052700996399, "objective/rlhf_reward": -11.36417007446289, "objective/scores": -11.020264625549316, "policy/approxkl_avg": 6.61544987679008e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.190791368484497, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000181198120117, "val/ratio_var": NaN }, { "episode": 233, "epoch": 0.04375586854460094, "eps": 0, "loss/policy_avg": -7.080581781337969e-06, "loss/value_avg": 0.33561912178993225, "lr": 2.3040000000000003e-06, "objective/entropy": 139.5666046142578, "objective/kl": 16.746967315673828, "objective/non_score_reward": -0.8373484015464783, "objective/rlhf_reward": -11.817666053771973, "objective/scores": -10.980318069458008, "policy/approxkl_avg": 1.368988762351364e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1014864444732666, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000802278518677, "val/ratio_var": NaN }, { "episode": 234, "epoch": 0.043943661971830986, "eps": 0, "loss/policy_avg": -2.7256191970082e-05, "loss/value_avg": 0.39115041494369507, "lr": 2.301e-06, "objective/entropy": 118.1261215209961, "objective/kl": 26.426210403442383, "objective/non_score_reward": -1.3213105201721191, "objective/rlhf_reward": -11.984102249145508, "objective/scores": -10.66279125213623, "policy/approxkl_avg": 1.2768569490617665e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.347032070159912, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000035762786865, "val/ratio_var": NaN }, { "episode": 235, "epoch": 0.044131455399061034, "eps": 0, "loss/policy_avg": -7.884007936809212e-05, "loss/value_avg": 0.5598492622375488, "lr": 2.2980000000000003e-06, "objective/entropy": 99.9340591430664, "objective/kl": 9.457916259765625, "objective/non_score_reward": -0.4728958010673523, "objective/rlhf_reward": -11.167378425598145, "objective/scores": -10.694482803344727, "policy/approxkl_avg": 9.037682957568904e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8542734384536743, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999669194221497, "val/ratio_var": NaN }, { "episode": 236, "epoch": 0.04431924882629108, "eps": 0, "loss/policy_avg": 1.5527572031714953e-05, "loss/value_avg": 0.22785870730876923, "lr": 2.295e-06, "objective/entropy": 119.13226318359375, "objective/kl": 19.122692108154297, "objective/non_score_reward": -0.956134557723999, "objective/rlhf_reward": -10.716713905334473, "objective/scores": -9.760579109191895, "policy/approxkl_avg": 1.0259133631507211e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4507710933685303, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999955296516418, "val/ratio_var": NaN }, { "episode": 237, "epoch": 0.044507042253521124, "eps": 0, "loss/policy_avg": 4.470573185244575e-05, "loss/value_avg": 0.20961883664131165, "lr": 2.2920000000000002e-06, "objective/entropy": 84.73722839355469, "objective/kl": 18.18035125732422, "objective/non_score_reward": -0.9090176224708557, "objective/rlhf_reward": -10.50731086730957, "objective/scores": -9.59829330444336, "policy/approxkl_avg": 1.2651531733354204e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8910235166549683, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000069499015808, "val/ratio_var": NaN }, { "episode": 238, "epoch": 0.04469483568075117, "eps": 0, "loss/policy_avg": 1.9752755179069936e-05, "loss/value_avg": 0.8207483887672424, "lr": 2.289e-06, "objective/entropy": 129.86502075195312, "objective/kl": 15.430672645568848, "objective/non_score_reward": -0.7715336084365845, "objective/rlhf_reward": -9.744234085083008, "objective/scores": -8.972700119018555, "policy/approxkl_avg": 2.6510477368901775e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2334952354431152, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999029636383057, "val/ratio_var": NaN }, { "episode": 239, "epoch": 0.04488262910798122, "eps": 0, "loss/policy_avg": 7.56101799197495e-05, "loss/value_avg": 0.298221230506897, "lr": 2.2860000000000002e-06, "objective/entropy": 104.87044525146484, "objective/kl": 19.383445739746094, "objective/non_score_reward": -0.9691722393035889, "objective/rlhf_reward": -11.849845886230469, "objective/scores": -10.8806734085083, "policy/approxkl_avg": 9.998814931577726e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9156103134155273, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999408721923828, "val/ratio_var": NaN }, { "episode": 240, "epoch": 0.04507042253521127, "eps": 0, "loss/policy_avg": -6.910540105309337e-05, "loss/value_avg": 0.8986638784408569, "lr": 2.283e-06, "objective/entropy": 118.30313873291016, "objective/kl": 22.30738067626953, "objective/non_score_reward": -1.1153690814971924, "objective/rlhf_reward": -10.566060066223145, "objective/scores": -9.450691223144531, "policy/approxkl_avg": 1.0310826326076494e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3378446102142334, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001044273376465, "val/ratio_var": NaN }, { "episode": 241, "epoch": 0.04525821596244131, "eps": 0, "loss/policy_avg": 7.320912118302658e-05, "loss/value_avg": 0.16556909680366516, "lr": 2.28e-06, "objective/entropy": 93.94596862792969, "objective/kl": 17.538219451904297, "objective/non_score_reward": -0.8769110441207886, "objective/rlhf_reward": -11.604119300842285, "objective/scores": -10.727208137512207, "policy/approxkl_avg": 8.809092122419315e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.109241485595703, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000582933425903, "val/ratio_var": NaN }, { "episode": 242, "epoch": 0.04544600938967136, "eps": 0, "loss/policy_avg": -4.857890962739475e-05, "loss/value_avg": 0.4984230101108551, "lr": 2.277e-06, "objective/entropy": 95.99846649169922, "objective/kl": 16.526941299438477, "objective/non_score_reward": -0.8263469934463501, "objective/rlhf_reward": -10.905091285705566, "objective/scores": -10.078743934631348, "policy/approxkl_avg": 1.3007665700115467e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.765143632888794, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998459815979004, "val/ratio_var": NaN }, { "episode": 243, "epoch": 0.04563380281690141, "eps": 0, "loss/policy_avg": -0.0001303834724240005, "loss/value_avg": 0.3698982000350952, "lr": 2.274e-06, "objective/entropy": 162.93064880371094, "objective/kl": 24.61795425415039, "objective/non_score_reward": -1.2308979034423828, "objective/rlhf_reward": -11.808052062988281, "objective/scores": -10.577154159545898, "policy/approxkl_avg": 1.5416041776461498e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.5021166801452637, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999672174453735, "val/ratio_var": NaN }, { "episode": 244, "epoch": 0.04582159624413146, "eps": 0, "loss/policy_avg": 2.14756655623205e-05, "loss/value_avg": 0.2678453028202057, "lr": 2.271e-06, "objective/entropy": 107.80564880371094, "objective/kl": 19.25516700744629, "objective/non_score_reward": -0.9627583622932434, "objective/rlhf_reward": -11.548535346984863, "objective/scores": -10.585777282714844, "policy/approxkl_avg": 5.673276959328177e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.104137420654297, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000036597251892, "val/ratio_var": NaN }, { "episode": 245, "epoch": 0.046009389671361506, "eps": 0, "loss/policy_avg": -3.332012056489475e-05, "loss/value_avg": 0.24541445076465607, "lr": 2.268e-06, "objective/entropy": 53.912967681884766, "objective/kl": 18.268138885498047, "objective/non_score_reward": -0.9134069681167603, "objective/rlhf_reward": -11.430041313171387, "objective/scores": -10.516633987426758, "policy/approxkl_avg": 5.423388316216915e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3131909370422363, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000020146369934, "val/ratio_var": NaN }, { "episode": 246, "epoch": 0.04619718309859155, "eps": 0, "loss/policy_avg": 2.7825248253066093e-05, "loss/value_avg": 0.12893441319465637, "lr": 2.265e-06, "objective/entropy": 99.71572875976562, "objective/kl": 20.67214584350586, "objective/non_score_reward": -1.0336073637008667, "objective/rlhf_reward": -12.046154022216797, "objective/scores": -11.01254653930664, "policy/approxkl_avg": 7.8569215133939e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0127367973327637, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998950958251953, "val/ratio_var": NaN }, { "episode": 247, "epoch": 0.046384976525821596, "eps": 0, "loss/policy_avg": -2.7917465558857657e-05, "loss/value_avg": 0.2979557514190674, "lr": 2.262e-06, "objective/entropy": 107.9608383178711, "objective/kl": 13.795696258544922, "objective/non_score_reward": -0.6897848844528198, "objective/rlhf_reward": -10.608423233032227, "objective/scores": -9.918638229370117, "policy/approxkl_avg": 1.1461365545528679e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.096304178237915, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000015377998352, "val/ratio_var": NaN }, { "episode": 248, "epoch": 0.046572769953051645, "eps": 0, "loss/policy_avg": -1.3747305274591781e-05, "loss/value_avg": 0.31084147095680237, "lr": 2.259e-06, "objective/entropy": 79.8472671508789, "objective/kl": 18.691539764404297, "objective/non_score_reward": -0.9345769882202148, "objective/rlhf_reward": -11.249999046325684, "objective/scores": -10.315422058105469, "policy/approxkl_avg": 9.711976645121467e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5955027341842651, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999967634677887, "val/ratio_var": NaN }, { "episode": 249, "epoch": 0.04676056338028169, "eps": 0, "loss/policy_avg": 9.495582344243303e-05, "loss/value_avg": 0.17312836647033691, "lr": 2.256e-06, "objective/entropy": 76.0282974243164, "objective/kl": 19.391368865966797, "objective/non_score_reward": -0.9695684909820557, "objective/rlhf_reward": -11.505380630493164, "objective/scores": -10.535812377929688, "policy/approxkl_avg": 9.608016426909671e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6311196088790894, "step": 249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999669194221497, "val/ratio_var": NaN }, { "episode": 250, "epoch": 0.046948356807511735, "eps": 0, "loss/policy_avg": 3.6455548979574814e-05, "loss/value_avg": 0.24262148141860962, "lr": 2.253e-06, "objective/entropy": 115.2911605834961, "objective/kl": 15.440498352050781, "objective/non_score_reward": -0.7720248699188232, "objective/rlhf_reward": -11.488998413085938, "objective/scores": -10.716973304748535, "policy/approxkl_avg": 6.84068055534226e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.339261054992676, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000438690185547, "val/ratio_var": NaN }, { "episode": 251, "epoch": 0.04713615023474178, "eps": 0, "loss/policy_avg": -1.3963231140223797e-05, "loss/value_avg": 0.2400258332490921, "lr": 2.25e-06, "objective/entropy": 80.03707122802734, "objective/kl": 13.186511039733887, "objective/non_score_reward": -0.6593255400657654, "objective/rlhf_reward": -12.305158615112305, "objective/scores": -11.645833015441895, "policy/approxkl_avg": 8.272403562159525e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5246742963790894, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999955952167511, "val/ratio_var": NaN }, { "episode": 252, "epoch": 0.04732394366197183, "eps": 0, "loss/policy_avg": 0.000132002925965935, "loss/value_avg": 0.34294575452804565, "lr": 2.2470000000000003e-06, "objective/entropy": 140.82472229003906, "objective/kl": 21.11013412475586, "objective/non_score_reward": -1.055506706237793, "objective/rlhf_reward": -13.05988597869873, "objective/scores": -12.004379272460938, "policy/approxkl_avg": 1.3440092061500764e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3753983974456787, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000011920928955, "val/ratio_var": NaN }, { "episode": 253, "epoch": 0.04751173708920188, "eps": 0, "loss/policy_avg": 1.7692458641249686e-05, "loss/value_avg": 0.2814905345439911, "lr": 2.244e-06, "objective/entropy": 140.42538452148438, "objective/kl": 3.3656787872314453, "objective/non_score_reward": -0.16828395426273346, "objective/rlhf_reward": -9.71604061126709, "objective/scores": -9.54775619506836, "policy/approxkl_avg": 1.070235029487776e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3627238273620605, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999423027038574, "val/ratio_var": NaN }, { "episode": 254, "epoch": 0.04769953051643192, "eps": 0, "loss/policy_avg": 8.077891106950119e-05, "loss/value_avg": 0.3594406545162201, "lr": 2.2410000000000002e-06, "objective/entropy": 133.3474578857422, "objective/kl": 27.735692977905273, "objective/non_score_reward": -1.3867846727371216, "objective/rlhf_reward": -12.306174278259277, "objective/scores": -10.919389724731445, "policy/approxkl_avg": 8.458162881197495e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2233381271362305, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999278783798218, "val/ratio_var": NaN }, { "episode": 255, "epoch": 0.04788732394366197, "eps": 0, "loss/policy_avg": 6.189886335050687e-05, "loss/value_avg": 0.30639776587486267, "lr": 2.238e-06, "objective/entropy": 77.5257568359375, "objective/kl": 12.552241325378418, "objective/non_score_reward": -0.6276121139526367, "objective/rlhf_reward": -10.472887992858887, "objective/scores": -9.84527587890625, "policy/approxkl_avg": 1.0283594065185753e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5708353519439697, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999958336353302, "val/ratio_var": NaN }, { "episode": 256, "epoch": 0.04807511737089202, "eps": 0, "loss/policy_avg": 0.00011521465057739988, "loss/value_avg": 0.3278931677341461, "lr": 2.235e-06, "objective/entropy": 97.70217895507812, "objective/kl": 27.218095779418945, "objective/non_score_reward": -1.3609049320220947, "objective/rlhf_reward": -12.404216766357422, "objective/scores": -11.043312072753906, "policy/approxkl_avg": 1.720714237762877e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9291808605194092, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999647736549377, "val/ratio_var": NaN }, { "episode": 257, "epoch": 0.04826291079812207, "eps": 0, "loss/policy_avg": -3.8599067920586094e-05, "loss/value_avg": 0.16366660594940186, "lr": 2.232e-06, "objective/entropy": 111.62004852294922, "objective/kl": 8.994698524475098, "objective/non_score_reward": -0.4497348666191101, "objective/rlhf_reward": -11.143132209777832, "objective/scores": -10.693397521972656, "policy/approxkl_avg": 1.5413107234962808e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.844728946685791, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000182390213013, "val/ratio_var": NaN }, { "episode": 258, "epoch": 0.048450704225352116, "eps": 0, "loss/policy_avg": -0.00013680278789252043, "loss/value_avg": 0.14445362985134125, "lr": 2.229e-06, "objective/entropy": 110.33682250976562, "objective/kl": 27.395837783813477, "objective/non_score_reward": -1.3697919845581055, "objective/rlhf_reward": -13.30858325958252, "objective/scores": -11.938791275024414, "policy/approxkl_avg": 1.0093909708075444e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.011126756668091, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000006914138794, "val/ratio_var": NaN }, { "episode": 259, "epoch": 0.04863849765258216, "eps": 0, "loss/policy_avg": 3.473039032542147e-05, "loss/value_avg": 0.26900002360343933, "lr": 2.226e-06, "objective/entropy": 97.95916748046875, "objective/kl": 24.112957000732422, "objective/non_score_reward": -1.2056479454040527, "objective/rlhf_reward": -11.96058464050293, "objective/scores": -10.754936218261719, "policy/approxkl_avg": 1.0527624283440673e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0372314453125, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999327659606934, "val/ratio_var": NaN }, { "episode": 260, "epoch": 0.048826291079812206, "eps": 0, "loss/policy_avg": -2.8313330403761938e-05, "loss/value_avg": 0.2712470293045044, "lr": 2.223e-06, "objective/entropy": 75.4831771850586, "objective/kl": 8.0484619140625, "objective/non_score_reward": -0.40242308378219604, "objective/rlhf_reward": -11.855975151062012, "objective/scores": -11.45355224609375, "policy/approxkl_avg": 6.124200524482148e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6386604309082031, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999476075172424, "val/ratio_var": NaN }, { "episode": 261, "epoch": 0.049014084507042255, "eps": 0, "loss/policy_avg": 4.120592620893149e-06, "loss/value_avg": 0.20553959906101227, "lr": 2.22e-06, "objective/entropy": 87.23855590820312, "objective/kl": 28.052160263061523, "objective/non_score_reward": -1.402608036994934, "objective/rlhf_reward": -12.284942626953125, "objective/scores": -10.88233470916748, "policy/approxkl_avg": 7.991882000624173e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7335296869277954, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000108480453491, "val/ratio_var": NaN }, { "episode": 262, "epoch": 0.0492018779342723, "eps": 0, "loss/policy_avg": -4.557618376566097e-05, "loss/value_avg": 0.3065522015094757, "lr": 2.217e-06, "objective/entropy": 91.69232940673828, "objective/kl": 18.01368522644043, "objective/non_score_reward": -0.9006842970848083, "objective/rlhf_reward": -12.942110061645508, "objective/scores": -12.041425704956055, "policy/approxkl_avg": 5.693195959111108e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.817873477935791, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000066757202148, "val/ratio_var": NaN }, { "episode": 263, "epoch": 0.049389671361502345, "eps": 0, "loss/policy_avg": 0.00011120202543679625, "loss/value_avg": 0.38370755314826965, "lr": 2.214e-06, "objective/entropy": 103.82330322265625, "objective/kl": 11.153785705566406, "objective/non_score_reward": -0.5576893091201782, "objective/rlhf_reward": -10.439900398254395, "objective/scores": -9.882210731506348, "policy/approxkl_avg": 9.68322524386167e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.931334376335144, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000202655792236, "val/ratio_var": NaN }, { "episode": 264, "epoch": 0.049577464788732394, "eps": 0, "loss/policy_avg": -6.127132655819878e-05, "loss/value_avg": 0.30821412801742554, "lr": 2.211e-06, "objective/entropy": 115.01385498046875, "objective/kl": 10.425859451293945, "objective/non_score_reward": -0.5212929844856262, "objective/rlhf_reward": -11.252250671386719, "objective/scores": -10.730957984924316, "policy/approxkl_avg": 6.000892227575605e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2173588275909424, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999504685401917, "val/ratio_var": NaN }, { "episode": 265, "epoch": 0.04976525821596244, "eps": 0, "loss/policy_avg": -7.823728083167225e-05, "loss/value_avg": 0.10457803308963776, "lr": 2.208e-06, "objective/entropy": 90.44403839111328, "objective/kl": 16.410860061645508, "objective/non_score_reward": -0.8205429911613464, "objective/rlhf_reward": -11.461723327636719, "objective/scores": -10.641180038452148, "policy/approxkl_avg": 5.258619140136034e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.950635313987732, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000278949737549, "val/ratio_var": NaN }, { "episode": 266, "epoch": 0.04995305164319249, "eps": 0, "loss/policy_avg": -7.701370122958906e-06, "loss/value_avg": 0.1745627075433731, "lr": 2.205e-06, "objective/entropy": 108.02021789550781, "objective/kl": 14.171207427978516, "objective/non_score_reward": -0.7085604071617126, "objective/rlhf_reward": -12.464210510253906, "objective/scores": -11.755650520324707, "policy/approxkl_avg": 1.0252530557863793e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.67046320438385, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999904990196228, "val/ratio_var": NaN }, { "episode": 267, "epoch": 0.05014084507042253, "eps": 0, "loss/policy_avg": 3.451221346040256e-05, "loss/value_avg": 0.20133833587169647, "lr": 2.202e-06, "objective/entropy": 106.9229736328125, "objective/kl": 15.956695556640625, "objective/non_score_reward": -0.7978348731994629, "objective/rlhf_reward": -12.14028549194336, "objective/scores": -11.342451095581055, "policy/approxkl_avg": 8.49915338108076e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7632710933685303, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999579787254333, "val/ratio_var": NaN }, { "episode": 268, "epoch": 0.05032863849765258, "eps": 0, "loss/policy_avg": -6.151199340820312e-05, "loss/value_avg": 0.13117900490760803, "lr": 2.199e-06, "objective/entropy": 134.3262939453125, "objective/kl": 10.854525566101074, "objective/non_score_reward": -0.5427262783050537, "objective/rlhf_reward": -12.15324592590332, "objective/scores": -11.610519409179688, "policy/approxkl_avg": 1.0901570846044706e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2521812915802, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000871419906616, "val/ratio_var": NaN }, { "episode": 269, "epoch": 0.05051643192488263, "eps": 0, "loss/policy_avg": -3.846636536763981e-05, "loss/value_avg": 0.37927719950675964, "lr": 2.196e-06, "objective/entropy": 114.13118743896484, "objective/kl": 18.631668090820312, "objective/non_score_reward": -0.9315835237503052, "objective/rlhf_reward": -11.316747665405273, "objective/scores": -10.385164260864258, "policy/approxkl_avg": 1.3985295765905903e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2168221473693848, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001169443130493, "val/ratio_var": NaN }, { "episode": 270, "epoch": 0.05070422535211268, "eps": 0, "loss/policy_avg": -6.006798503221944e-05, "loss/value_avg": 0.16689404845237732, "lr": 2.193e-06, "objective/entropy": 115.433349609375, "objective/kl": 18.103660583496094, "objective/non_score_reward": -0.905182957649231, "objective/rlhf_reward": -12.220995903015137, "objective/scores": -11.315813064575195, "policy/approxkl_avg": 6.79115430557431e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0597407817840576, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999756813049316, "val/ratio_var": NaN }, { "episode": 271, "epoch": 0.050892018779342726, "eps": 0, "loss/policy_avg": 9.486360067967325e-05, "loss/value_avg": 0.43156835436820984, "lr": 2.19e-06, "objective/entropy": 107.82562255859375, "objective/kl": 12.178421020507812, "objective/non_score_reward": -0.6089209914207458, "objective/rlhf_reward": -10.390684127807617, "objective/scores": -9.781763076782227, "policy/approxkl_avg": 1.155682838316352e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1218032836914062, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999606013298035, "val/ratio_var": NaN }, { "episode": 272, "epoch": 0.05107981220657277, "eps": 0, "loss/policy_avg": -1.9613302356447093e-05, "loss/value_avg": 0.42078348994255066, "lr": 2.187e-06, "objective/entropy": 98.63529968261719, "objective/kl": 10.292573928833008, "objective/non_score_reward": -0.5146286487579346, "objective/rlhf_reward": -12.991349220275879, "objective/scores": -12.476720809936523, "policy/approxkl_avg": 6.085355153118144e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.913465976715088, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001003742218018, "val/ratio_var": NaN }, { "episode": 273, "epoch": 0.05126760563380282, "eps": 0, "loss/policy_avg": 7.793138502165675e-05, "loss/value_avg": 0.36485856771469116, "lr": 2.184e-06, "objective/entropy": 122.08625030517578, "objective/kl": 10.771406173706055, "objective/non_score_reward": -0.5385704040527344, "objective/rlhf_reward": -11.559391975402832, "objective/scores": -11.020821571350098, "policy/approxkl_avg": 1.2354554712601384e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.120867967605591, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998831748962402, "val/ratio_var": NaN }, { "episode": 274, "epoch": 0.051455399061032865, "eps": 0, "loss/policy_avg": -0.00015147226804401726, "loss/value_avg": 0.331150621175766, "lr": 2.181e-06, "objective/entropy": 98.30414581298828, "objective/kl": 11.799871444702148, "objective/non_score_reward": -0.5899935364723206, "objective/rlhf_reward": -13.375889778137207, "objective/scores": -12.785896301269531, "policy/approxkl_avg": 1.2949193717304297e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7612673044204712, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000022053718567, "val/ratio_var": NaN }, { "episode": 275, "epoch": 0.051643192488262914, "eps": 0, "loss/policy_avg": -1.7705953723634593e-05, "loss/value_avg": 0.26516708731651306, "lr": 2.178e-06, "objective/entropy": 145.3992462158203, "objective/kl": 22.555858612060547, "objective/non_score_reward": -1.1277928352355957, "objective/rlhf_reward": -12.150089263916016, "objective/scores": -11.022296905517578, "policy/approxkl_avg": 1.2047573250129062e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4137215614318848, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999076128005981, "val/ratio_var": NaN }, { "episode": 276, "epoch": 0.051830985915492955, "eps": 0, "loss/policy_avg": -2.3571949441247853e-06, "loss/value_avg": 0.21696323156356812, "lr": 2.175e-06, "objective/entropy": 74.35282135009766, "objective/kl": 17.75287437438965, "objective/non_score_reward": -0.8876436948776245, "objective/rlhf_reward": -12.251785278320312, "objective/scores": -11.364141464233398, "policy/approxkl_avg": 4.540979148259794e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7067018747329712, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999945342540741, "val/ratio_var": NaN }, { "episode": 277, "epoch": 0.052018779342723004, "eps": 0, "loss/policy_avg": -2.7811751351691782e-05, "loss/value_avg": 0.26115500926971436, "lr": 2.172e-06, "objective/entropy": 50.90238571166992, "objective/kl": 7.657000541687012, "objective/non_score_reward": -0.3828500211238861, "objective/rlhf_reward": -10.890006065368652, "objective/scores": -10.507156372070312, "policy/approxkl_avg": 4.6856833080255456e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1453304290771484, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000052809715271, "val/ratio_var": NaN }, { "episode": 278, "epoch": 0.05220657276995305, "eps": 0, "loss/policy_avg": 2.8736187232425436e-05, "loss/value_avg": 0.5237569808959961, "lr": 2.169e-06, "objective/entropy": 99.10174560546875, "objective/kl": 22.477622985839844, "objective/non_score_reward": -1.1238811016082764, "objective/rlhf_reward": -13.13638687133789, "objective/scores": -12.012505531311035, "policy/approxkl_avg": 6.369084104562717e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8460900783538818, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999381899833679, "val/ratio_var": NaN }, { "episode": 279, "epoch": 0.0523943661971831, "eps": 0, "loss/policy_avg": -1.3081532415526453e-05, "loss/value_avg": 1.2208433151245117, "lr": 2.166e-06, "objective/entropy": 145.72048950195312, "objective/kl": 19.36878204345703, "objective/non_score_reward": -0.9684391617774963, "objective/rlhf_reward": -12.401973724365234, "objective/scores": -11.433534622192383, "policy/approxkl_avg": 8.861887579314498e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.492300510406494, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999842047691345, "val/ratio_var": NaN }, { "episode": 280, "epoch": 0.05258215962441314, "eps": 0, "loss/policy_avg": 9.090045932680368e-05, "loss/value_avg": 0.21569886803627014, "lr": 2.163e-06, "objective/entropy": 76.51856994628906, "objective/kl": -0.947142481803894, "objective/non_score_reward": 0.04735712707042694, "objective/rlhf_reward": -11.710285186767578, "objective/scores": -11.75764274597168, "policy/approxkl_avg": 6.736862445677616e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4870283603668213, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000454187393188, "val/ratio_var": NaN }, { "episode": 281, "epoch": 0.05276995305164319, "eps": 0, "loss/policy_avg": -2.2825204723631032e-05, "loss/value_avg": 0.2836819291114807, "lr": 2.16e-06, "objective/entropy": 112.15441131591797, "objective/kl": 23.01726531982422, "objective/non_score_reward": -1.1508632898330688, "objective/rlhf_reward": -12.342395782470703, "objective/scores": -11.191532135009766, "policy/approxkl_avg": 1.1848420200522014e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1886401176452637, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999340772628784, "val/ratio_var": NaN }, { "episode": 282, "epoch": 0.05295774647887324, "eps": 0, "loss/policy_avg": -1.5438727132277563e-05, "loss/value_avg": 3.031697988510132, "lr": 2.157e-06, "objective/entropy": 92.70404052734375, "objective/kl": 8.007779121398926, "objective/non_score_reward": -0.4003889262676239, "objective/rlhf_reward": -7.1187214851379395, "objective/scores": -6.718332767486572, "policy/approxkl_avg": 5.460224627995558e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7726037502288818, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999512434005737, "val/ratio_var": NaN }, { "episode": 283, "epoch": 0.05314553990610329, "eps": 0, "loss/policy_avg": 8.816988952276006e-07, "loss/value_avg": 0.2824263274669647, "lr": 2.154e-06, "objective/entropy": 130.45623779296875, "objective/kl": 14.686119079589844, "objective/non_score_reward": -0.7343059778213501, "objective/rlhf_reward": -12.363890647888184, "objective/scores": -11.629584312438965, "policy/approxkl_avg": 1.2011773264930525e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.131580352783203, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999672174453735, "val/ratio_var": NaN }, { "episode": 284, "epoch": 0.05333333333333334, "eps": 0, "loss/policy_avg": -7.770196680212393e-05, "loss/value_avg": 0.29920312762260437, "lr": 2.151e-06, "objective/entropy": 108.65821838378906, "objective/kl": 13.393362045288086, "objective/non_score_reward": -0.6696681976318359, "objective/rlhf_reward": -12.40462589263916, "objective/scores": -11.734957695007324, "policy/approxkl_avg": 1.4185343388817273e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.341209650039673, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999221563339233, "val/ratio_var": NaN }, { "episode": 285, "epoch": 0.05352112676056338, "eps": 0, "loss/policy_avg": 5.300539851305075e-05, "loss/value_avg": 0.12342122197151184, "lr": 2.148e-06, "objective/entropy": 109.71591186523438, "objective/kl": 5.612714767456055, "objective/non_score_reward": -0.2806357741355896, "objective/rlhf_reward": -12.445533752441406, "objective/scores": -12.164897918701172, "policy/approxkl_avg": 7.309159144597288e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8846435546875, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999876081943512, "val/ratio_var": NaN }, { "episode": 286, "epoch": 0.05370892018779343, "eps": 0, "loss/policy_avg": 1.2235821486683562e-05, "loss/value_avg": 0.22449533641338348, "lr": 2.145e-06, "objective/entropy": 66.32128143310547, "objective/kl": 9.206890106201172, "objective/non_score_reward": -0.46034449338912964, "objective/rlhf_reward": -11.725584983825684, "objective/scores": -11.265240669250488, "policy/approxkl_avg": 5.2350113577404045e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6029835939407349, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999628067016602, "val/ratio_var": NaN }, { "episode": 287, "epoch": 0.053896713615023475, "eps": 0, "loss/policy_avg": -3.532531263772398e-05, "loss/value_avg": 0.14065881073474884, "lr": 2.142e-06, "objective/entropy": 78.03199768066406, "objective/kl": 5.120072841644287, "objective/non_score_reward": -0.2560036778450012, "objective/rlhf_reward": -12.57677936553955, "objective/scores": -12.320775985717773, "policy/approxkl_avg": 4.650226870239749e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.629753828048706, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000044822692871, "val/ratio_var": NaN }, { "episode": 288, "epoch": 0.054084507042253524, "eps": 0, "loss/policy_avg": -9.529995440971106e-05, "loss/value_avg": 1.163883924484253, "lr": 2.1389999999999998e-06, "objective/entropy": 140.29672241210938, "objective/kl": 13.507523536682129, "objective/non_score_reward": -0.6753761768341064, "objective/rlhf_reward": -12.616202354431152, "objective/scores": -11.940826416015625, "policy/approxkl_avg": 7.405375157532035e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3286709785461426, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000892877578735, "val/ratio_var": NaN }, { "episode": 289, "epoch": 0.054272300469483566, "eps": 0, "loss/policy_avg": 6.71080852043815e-05, "loss/value_avg": 1.3906610012054443, "lr": 2.136e-06, "objective/entropy": 124.04959869384766, "objective/kl": 10.87600040435791, "objective/non_score_reward": -0.5438000559806824, "objective/rlhf_reward": -10.438055038452148, "objective/scores": -9.894254684448242, "policy/approxkl_avg": 1.602552401891444e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0679125785827637, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998889565467834, "val/ratio_var": NaN }, { "episode": 290, "epoch": 0.054460093896713614, "eps": 0, "loss/policy_avg": -5.396806955104694e-05, "loss/value_avg": 0.6070965528488159, "lr": 2.133e-06, "objective/entropy": 97.56748962402344, "objective/kl": 10.16738510131836, "objective/non_score_reward": -0.5083692669868469, "objective/rlhf_reward": -10.111307144165039, "objective/scores": -9.602937698364258, "policy/approxkl_avg": 8.210323443336165e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6963582038879395, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999683499336243, "val/ratio_var": NaN }, { "episode": 291, "epoch": 0.05464788732394366, "eps": 0, "loss/policy_avg": 5.751735807280056e-05, "loss/value_avg": 0.6195428967475891, "lr": 2.13e-06, "objective/entropy": 125.65296936035156, "objective/kl": 21.63423728942871, "objective/non_score_reward": -1.081712007522583, "objective/rlhf_reward": -10.114699363708496, "objective/scores": -9.032987594604492, "policy/approxkl_avg": 9.80776775350023e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2776131629943848, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001055002212524, "val/ratio_var": NaN }, { "episode": 292, "epoch": 0.05483568075117371, "eps": 0, "loss/policy_avg": -3.809299232671037e-05, "loss/value_avg": 0.3458562195301056, "lr": 2.127e-06, "objective/entropy": 106.83460235595703, "objective/kl": 17.608749389648438, "objective/non_score_reward": -0.880437433719635, "objective/rlhf_reward": -11.487760543823242, "objective/scores": -10.607322692871094, "policy/approxkl_avg": 1.0079986623168224e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.024552345275879, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999479055404663, "val/ratio_var": NaN }, { "episode": 293, "epoch": 0.05502347417840375, "eps": 0, "loss/policy_avg": 1.394973605783889e-05, "loss/value_avg": 0.23620721697807312, "lr": 2.124e-06, "objective/entropy": 111.61634063720703, "objective/kl": 18.307411193847656, "objective/non_score_reward": -0.9153706431388855, "objective/rlhf_reward": -12.04785442352295, "objective/scores": -11.13248348236084, "policy/approxkl_avg": 1.3438980772662035e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.87541925907135, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001510381698608, "val/ratio_var": NaN }, { "episode": 294, "epoch": 0.0552112676056338, "eps": 0, "loss/policy_avg": -3.9541497244499624e-05, "loss/value_avg": 0.2875356376171112, "lr": 2.121e-06, "objective/entropy": 94.27328491210938, "objective/kl": 19.339641571044922, "objective/non_score_reward": -0.9669820070266724, "objective/rlhf_reward": -10.416654586791992, "objective/scores": -9.44967269897461, "policy/approxkl_avg": 8.715292665328889e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.868410587310791, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000048875808716, "val/ratio_var": NaN }, { "episode": 295, "epoch": 0.05539906103286385, "eps": 0, "loss/policy_avg": -4.500263094087131e-05, "loss/value_avg": 0.269153892993927, "lr": 2.118e-06, "objective/entropy": 128.9630126953125, "objective/kl": 10.43917465209961, "objective/non_score_reward": -0.5219587087631226, "objective/rlhf_reward": -10.538108825683594, "objective/scores": -10.01615047454834, "policy/approxkl_avg": 8.338182055922516e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.849063515663147, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999672174453735, "val/ratio_var": NaN }, { "episode": 296, "epoch": 0.0555868544600939, "eps": 0, "loss/policy_avg": 1.7184131593239726e-06, "loss/value_avg": 0.2173244208097458, "lr": 2.115e-06, "objective/entropy": 75.63229370117188, "objective/kl": 20.87623405456543, "objective/non_score_reward": -1.0438117980957031, "objective/rlhf_reward": -12.882070541381836, "objective/scores": -11.838258743286133, "policy/approxkl_avg": 8.485879732234025e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8048211336135864, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999212026596069, "val/ratio_var": NaN }, { "episode": 297, "epoch": 0.05577464788732395, "eps": 0, "loss/policy_avg": -0.0001151246833615005, "loss/value_avg": 0.636832594871521, "lr": 2.112e-06, "objective/entropy": 112.9948501586914, "objective/kl": 13.371078491210938, "objective/non_score_reward": -0.6685539484024048, "objective/rlhf_reward": -9.39988899230957, "objective/scores": -8.731334686279297, "policy/approxkl_avg": 6.846595113074727e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.988887071609497, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999160766601562, "val/ratio_var": NaN }, { "episode": 298, "epoch": 0.05596244131455399, "eps": 0, "loss/policy_avg": 0.0001506805419921875, "loss/value_avg": 0.7446457743644714, "lr": 2.109e-06, "objective/entropy": 162.38616943359375, "objective/kl": 15.885584831237793, "objective/non_score_reward": -0.7942793369293213, "objective/rlhf_reward": -11.26923942565918, "objective/scores": -10.474960327148438, "policy/approxkl_avg": 1.766115644841193e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3112170696258545, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000511407852173, "val/ratio_var": NaN }, { "episode": 299, "epoch": 0.05615023474178404, "eps": 0, "loss/policy_avg": 0.00018870830535888672, "loss/value_avg": 0.1866167038679123, "lr": 2.106e-06, "objective/entropy": 55.352455139160156, "objective/kl": 7.639952659606934, "objective/non_score_reward": -0.38199758529663086, "objective/rlhf_reward": -11.210023880004883, "objective/scores": -10.828025817871094, "policy/approxkl_avg": 7.380049993344073e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0052813291549683, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999989926815033, "val/ratio_var": NaN }, { "episode": 300, "epoch": 0.056338028169014086, "eps": 0, "loss/policy_avg": 4.081006409251131e-05, "loss/value_avg": 0.3752688765525818, "lr": 2.103e-06, "objective/entropy": 69.08626556396484, "objective/kl": 15.149430274963379, "objective/non_score_reward": -0.7574715614318848, "objective/rlhf_reward": -10.636190414428711, "objective/scores": -9.878718376159668, "policy/approxkl_avg": 5.0386624650400336e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.420115351676941, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999852776527405, "val/ratio_var": NaN }, { "episode": 301, "epoch": 0.056525821596244134, "eps": 0, "loss/policy_avg": -5.784124732599594e-05, "loss/value_avg": 0.3621582090854645, "lr": 2.1e-06, "objective/entropy": 76.19731903076172, "objective/kl": 14.151154518127441, "objective/non_score_reward": -0.707557737827301, "objective/rlhf_reward": -10.621854782104492, "objective/scores": -9.914297103881836, "policy/approxkl_avg": 7.689673253707952e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9053586721420288, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000475645065308, "val/ratio_var": NaN }, { "episode": 302, "epoch": 0.056713615023474176, "eps": 0, "loss/policy_avg": 5.6932556617539376e-05, "loss/value_avg": 0.4704884886741638, "lr": 2.097e-06, "objective/entropy": 105.55192565917969, "objective/kl": 22.49460220336914, "objective/non_score_reward": -1.124730110168457, "objective/rlhf_reward": -13.519640922546387, "objective/scores": -12.39491081237793, "policy/approxkl_avg": 8.274864171653462e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1592350006103516, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001243352890015, "val/ratio_var": NaN }, { "episode": 303, "epoch": 0.056901408450704224, "eps": 0, "loss/policy_avg": -6.093618867453188e-05, "loss/value_avg": 0.31144917011260986, "lr": 2.0939999999999998e-06, "objective/entropy": 116.04998779296875, "objective/kl": 21.203035354614258, "objective/non_score_reward": -1.0601519346237183, "objective/rlhf_reward": -10.818822860717773, "objective/scores": -9.758670806884766, "policy/approxkl_avg": 1.2476871802391543e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.206328868865967, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999385476112366, "val/ratio_var": NaN }, { "episode": 304, "epoch": 0.05708920187793427, "eps": 0, "loss/policy_avg": 6.60014629829675e-05, "loss/value_avg": 0.47730323672294617, "lr": 2.091e-06, "objective/entropy": 46.22013473510742, "objective/kl": 7.560131072998047, "objective/non_score_reward": -0.3780065178871155, "objective/rlhf_reward": -11.818822860717773, "objective/scores": -11.440815925598145, "policy/approxkl_avg": 6.516913941823077e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9046078324317932, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000008463859558, "val/ratio_var": NaN }, { "episode": 305, "epoch": 0.05727699530516432, "eps": 0, "loss/policy_avg": -1.6680303815519437e-05, "loss/value_avg": 0.29301685094833374, "lr": 2.0879999999999997e-06, "objective/entropy": 105.03376770019531, "objective/kl": 20.55617904663086, "objective/non_score_reward": -1.0278089046478271, "objective/rlhf_reward": -11.670660972595215, "objective/scores": -10.642851829528809, "policy/approxkl_avg": 7.223352582741427e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0155744552612305, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000568628311157, "val/ratio_var": NaN }, { "episode": 306, "epoch": 0.05746478873239436, "eps": 0, "loss/policy_avg": -0.00014340202324092388, "loss/value_avg": 0.16689713299274445, "lr": 2.085e-06, "objective/entropy": 124.86930084228516, "objective/kl": 29.660202026367188, "objective/non_score_reward": -1.4830100536346436, "objective/rlhf_reward": -12.191205024719238, "objective/scores": -10.708194732666016, "policy/approxkl_avg": 1.0217976864623779e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8814882040023804, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000529289245605, "val/ratio_var": NaN }, { "episode": 307, "epoch": 0.05765258215962441, "eps": 0, "loss/policy_avg": 5.621280433842912e-05, "loss/value_avg": 0.2894185483455658, "lr": 2.0819999999999997e-06, "objective/entropy": 90.61984252929688, "objective/kl": 35.15495681762695, "objective/non_score_reward": -1.7577478885650635, "objective/rlhf_reward": -11.623421669006348, "objective/scores": -9.865674018859863, "policy/approxkl_avg": 7.854093553305574e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.214083671569824, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999871253967285, "val/ratio_var": NaN }, { "episode": 308, "epoch": 0.05784037558685446, "eps": 0, "loss/policy_avg": -8.962739229900762e-05, "loss/value_avg": 0.14829105138778687, "lr": 2.079e-06, "objective/entropy": 115.61469268798828, "objective/kl": 29.094850540161133, "objective/non_score_reward": -1.4547425508499146, "objective/rlhf_reward": -11.727081298828125, "objective/scores": -10.2723388671875, "policy/approxkl_avg": 1.300248868574272e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0129992961883545, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.99996018409729, "val/ratio_var": NaN }, { "episode": 309, "epoch": 0.05802816901408451, "eps": 0, "loss/policy_avg": 1.1498073035909329e-05, "loss/value_avg": 0.2207784801721573, "lr": 2.0759999999999997e-06, "objective/entropy": 109.25921630859375, "objective/kl": 22.892404556274414, "objective/non_score_reward": -1.1446201801300049, "objective/rlhf_reward": -12.18674087524414, "objective/scores": -11.042120933532715, "policy/approxkl_avg": 1.0863100641245182e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.820918321609497, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000171661376953, "val/ratio_var": NaN }, { "episode": 310, "epoch": 0.05821596244131456, "eps": 0, "loss/policy_avg": 1.4125175766821485e-06, "loss/value_avg": 0.12959228456020355, "lr": 2.073e-06, "objective/entropy": 85.93333435058594, "objective/kl": 10.380951881408691, "objective/non_score_reward": -0.5190476179122925, "objective/rlhf_reward": -11.03056812286377, "objective/scores": -10.511520385742188, "policy/approxkl_avg": 6.685500864023197e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7973518371582031, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000012993812561, "val/ratio_var": NaN }, { "episode": 311, "epoch": 0.0584037558685446, "eps": 0, "loss/policy_avg": -4.6145240048645064e-05, "loss/value_avg": 0.160446435213089, "lr": 2.07e-06, "objective/entropy": 54.366241455078125, "objective/kl": 18.42224884033203, "objective/non_score_reward": -0.9211124181747437, "objective/rlhf_reward": -11.81851577758789, "objective/scores": -10.897403717041016, "policy/approxkl_avg": 4.1820147345106307e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3217958211898804, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999123215675354, "val/ratio_var": NaN }, { "episode": 312, "epoch": 0.05859154929577465, "eps": 0, "loss/policy_avg": 5.150740980752744e-05, "loss/value_avg": 0.10065846890211105, "lr": 2.067e-06, "objective/entropy": 77.57946014404297, "objective/kl": 19.38521385192871, "objective/non_score_reward": -0.9692608118057251, "objective/rlhf_reward": -11.614994049072266, "objective/scores": -10.645732879638672, "policy/approxkl_avg": 8.367188542024451e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5222467184066772, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000903606414795, "val/ratio_var": NaN }, { "episode": 313, "epoch": 0.058779342723004696, "eps": 0, "loss/policy_avg": 5.9910540585406125e-05, "loss/value_avg": 0.1185358539223671, "lr": 2.064e-06, "objective/entropy": 101.03329467773438, "objective/kl": 16.57684326171875, "objective/non_score_reward": -0.8288422226905823, "objective/rlhf_reward": -12.203924179077148, "objective/scores": -11.375082015991211, "policy/approxkl_avg": 1.18576096497236e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7892606258392334, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000057578086853, "val/ratio_var": NaN }, { "episode": 314, "epoch": 0.058967136150234745, "eps": 0, "loss/policy_avg": -8.754010195843875e-05, "loss/value_avg": 0.2595737874507904, "lr": 2.0610000000000003e-06, "objective/entropy": 110.18350219726562, "objective/kl": 16.38006591796875, "objective/non_score_reward": -0.8190033435821533, "objective/rlhf_reward": -11.280675888061523, "objective/scores": -10.46167278289795, "policy/approxkl_avg": 6.164612642578504e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9132195711135864, "step": 314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000455379486084, "val/ratio_var": NaN }, { "episode": 315, "epoch": 0.059154929577464786, "eps": 0, "loss/policy_avg": 1.8551663742982782e-05, "loss/value_avg": 0.16135787963867188, "lr": 2.058e-06, "objective/entropy": 116.27442169189453, "objective/kl": 18.39554786682129, "objective/non_score_reward": -0.9197773337364197, "objective/rlhf_reward": -12.238415718078613, "objective/scores": -11.318638801574707, "policy/approxkl_avg": 8.860320832582147e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.067758321762085, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000083446502686, "val/ratio_var": NaN }, { "episode": 316, "epoch": 0.059342723004694835, "eps": 0, "loss/policy_avg": -7.802810432622209e-05, "loss/value_avg": 0.2064758986234665, "lr": 2.0550000000000002e-06, "objective/entropy": 95.53726959228516, "objective/kl": 9.384368896484375, "objective/non_score_reward": -0.4692184627056122, "objective/rlhf_reward": -12.2537260055542, "objective/scores": -11.784507751464844, "policy/approxkl_avg": 1.015535531223577e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8361401557922363, "step": 316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999002814292908, "val/ratio_var": NaN }, { "episode": 317, "epoch": 0.05953051643192488, "eps": 0, "loss/policy_avg": 2.2829703084426e-05, "loss/value_avg": 0.3114243149757385, "lr": 2.052e-06, "objective/entropy": 109.42609405517578, "objective/kl": 16.22074317932129, "objective/non_score_reward": -0.8110370635986328, "objective/rlhf_reward": -10.575434684753418, "objective/scores": -9.764397621154785, "policy/approxkl_avg": 1.2111797786928946e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2906033992767334, "step": 317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000081181526184, "val/ratio_var": NaN }, { "episode": 318, "epoch": 0.05971830985915493, "eps": 0, "loss/policy_avg": -4.236428139847703e-05, "loss/value_avg": 0.24300165474414825, "lr": 2.049e-06, "objective/entropy": 98.23355102539062, "objective/kl": 12.391650199890137, "objective/non_score_reward": -0.6195825338363647, "objective/rlhf_reward": -12.662110328674316, "objective/scores": -12.04252815246582, "policy/approxkl_avg": 7.652784006495494e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.848280429840088, "step": 318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000050067901611, "val/ratio_var": NaN }, { "episode": 319, "epoch": 0.059906103286384974, "eps": 0, "loss/policy_avg": -6.297849353131824e-08, "loss/value_avg": 0.30994105339050293, "lr": 2.0460000000000004e-06, "objective/entropy": 112.32173919677734, "objective/kl": 11.153640747070312, "objective/non_score_reward": -0.5576820373535156, "objective/rlhf_reward": -11.299226760864258, "objective/scores": -10.741544723510742, "policy/approxkl_avg": 1.2216655420616007e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.027700662612915, "step": 319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00010085105896, "val/ratio_var": NaN }, { "episode": 320, "epoch": 0.06009389671361502, "eps": 0, "loss/policy_avg": 0.00013177799701225013, "loss/value_avg": 0.3437172472476959, "lr": 2.043e-06, "objective/entropy": 72.93425750732422, "objective/kl": 19.20391273498535, "objective/non_score_reward": -0.9601956009864807, "objective/rlhf_reward": -10.63747501373291, "objective/scores": -9.677279472351074, "policy/approxkl_avg": 6.278959574501641e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.867438554763794, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000346899032593, "val/ratio_var": NaN }, { "episode": 321, "epoch": 0.06028169014084507, "eps": 0, "loss/policy_avg": -7.444957645930117e-06, "loss/value_avg": 0.1838231384754181, "lr": 2.0400000000000004e-06, "objective/entropy": 116.7230453491211, "objective/kl": 24.478073120117188, "objective/non_score_reward": -1.2239036560058594, "objective/rlhf_reward": -12.796274185180664, "objective/scores": -11.572370529174805, "policy/approxkl_avg": 1.1177515091276291e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0826025009155273, "step": 321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000449419021606, "val/ratio_var": NaN }, { "episode": 322, "epoch": 0.06046948356807512, "eps": 0, "loss/policy_avg": -3.455719706835225e-05, "loss/value_avg": 0.14527973532676697, "lr": 2.037e-06, "objective/entropy": 109.50104522705078, "objective/kl": 19.468685150146484, "objective/non_score_reward": -0.9734342694282532, "objective/rlhf_reward": -11.936362266540527, "objective/scores": -10.96292781829834, "policy/approxkl_avg": 7.29176008462673e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.212908983230591, "step": 322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000253915786743, "val/ratio_var": NaN }, { "episode": 323, "epoch": 0.06065727699530517, "eps": 0, "loss/policy_avg": -0.0001627071906113997, "loss/value_avg": 0.08600179105997086, "lr": 2.0340000000000003e-06, "objective/entropy": 119.7093505859375, "objective/kl": 26.39267349243164, "objective/non_score_reward": -1.3196337223052979, "objective/rlhf_reward": -12.355619430541992, "objective/scores": -11.035985946655273, "policy/approxkl_avg": 8.136803586467067e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0903987884521484, "step": 323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999849796295166, "val/ratio_var": NaN }, { "episode": 324, "epoch": 0.06084507042253521, "eps": 0, "loss/policy_avg": -8.070243893598672e-06, "loss/value_avg": 0.11834752559661865, "lr": 2.031e-06, "objective/entropy": 104.31472778320312, "objective/kl": 10.584217071533203, "objective/non_score_reward": -0.5292108654975891, "objective/rlhf_reward": -11.265326499938965, "objective/scores": -10.736115455627441, "policy/approxkl_avg": 6.814238417973684e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0101778507232666, "step": 324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000555515289307, "val/ratio_var": NaN }, { "episode": 325, "epoch": 0.06103286384976526, "eps": 0, "loss/policy_avg": -1.0751327863545157e-06, "loss/value_avg": 0.19862768054008484, "lr": 2.0280000000000003e-06, "objective/entropy": 104.06597900390625, "objective/kl": 13.631956100463867, "objective/non_score_reward": -0.6815977096557617, "objective/rlhf_reward": -11.829935073852539, "objective/scores": -11.148337364196777, "policy/approxkl_avg": 9.19796008247431e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9673864841461182, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999558925628662, "val/ratio_var": NaN }, { "episode": 326, "epoch": 0.061220657276995306, "eps": 0, "loss/policy_avg": -1.7026685554810683e-06, "loss/value_avg": 0.2056485265493393, "lr": 2.025e-06, "objective/entropy": 93.19398498535156, "objective/kl": 15.863722801208496, "objective/non_score_reward": -0.7931861281394958, "objective/rlhf_reward": -11.36833381652832, "objective/scores": -10.57514762878418, "policy/approxkl_avg": 7.3530912914066e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7055249214172363, "step": 326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999995827674866, "val/ratio_var": NaN }, { "episode": 327, "epoch": 0.061408450704225355, "eps": 0, "loss/policy_avg": 1.3747305274591781e-05, "loss/value_avg": 0.23446354269981384, "lr": 2.0220000000000003e-06, "objective/entropy": 126.43692779541016, "objective/kl": 21.98818588256836, "objective/non_score_reward": -1.0994093418121338, "objective/rlhf_reward": -11.991082191467285, "objective/scores": -10.89167308807373, "policy/approxkl_avg": 1.2446977848412644e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.040036678314209, "step": 327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000078678131104, "val/ratio_var": NaN }, { "episode": 328, "epoch": 0.0615962441314554, "eps": 0, "loss/policy_avg": -9.662699994805735e-06, "loss/value_avg": 1.2891931533813477, "lr": 2.019e-06, "objective/entropy": 88.28544616699219, "objective/kl": 21.085311889648438, "objective/non_score_reward": -1.0542656183242798, "objective/rlhf_reward": -8.832748413085938, "objective/scores": -7.778482437133789, "policy/approxkl_avg": 6.370497374064144e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.81804621219635, "step": 328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000206232070923, "val/ratio_var": NaN }, { "episode": 329, "epoch": 0.061784037558685445, "eps": 0, "loss/policy_avg": 6.315843347692862e-05, "loss/value_avg": 0.17737093567848206, "lr": 2.0160000000000003e-06, "objective/entropy": 139.966064453125, "objective/kl": 13.234186172485352, "objective/non_score_reward": -0.6617093682289124, "objective/rlhf_reward": -11.630959510803223, "objective/scores": -10.969249725341797, "policy/approxkl_avg": 1.2881352517979394e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.333645820617676, "step": 329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000483989715576, "val/ratio_var": NaN }, { "episode": 330, "epoch": 0.061971830985915494, "eps": 0, "loss/policy_avg": 2.879241765185725e-05, "loss/value_avg": 0.28473198413848877, "lr": 2.013e-06, "objective/entropy": 99.98576354980469, "objective/kl": 22.707847595214844, "objective/non_score_reward": -1.135392427444458, "objective/rlhf_reward": -11.484192848205566, "objective/scores": -10.348800659179688, "policy/approxkl_avg": 8.612739321733898e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9874590635299683, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999416470527649, "val/ratio_var": NaN }, { "episode": 331, "epoch": 0.06215962441314554, "eps": 0, "loss/policy_avg": 3.8673293602187186e-05, "loss/value_avg": 0.6737788915634155, "lr": 2.0100000000000002e-06, "objective/entropy": 132.73377990722656, "objective/kl": 6.534013748168945, "objective/non_score_reward": -0.32670074701309204, "objective/rlhf_reward": -10.127314567565918, "objective/scores": -9.800613403320312, "policy/approxkl_avg": 1.0840451381000094e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2289879322052, "step": 331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999812841415405, "val/ratio_var": NaN }, { "episode": 332, "epoch": 0.062347417840375584, "eps": 0, "loss/policy_avg": 6.542565824929625e-05, "loss/value_avg": 0.10708345472812653, "lr": 2.007e-06, "objective/entropy": 109.70713806152344, "objective/kl": 17.444812774658203, "objective/non_score_reward": -0.8722406029701233, "objective/rlhf_reward": -12.666994094848633, "objective/scores": -11.794753074645996, "policy/approxkl_avg": 6.498405724641998e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0492749214172363, "step": 332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998621344566345, "val/ratio_var": NaN }, { "episode": 333, "epoch": 0.06253521126760564, "eps": 0, "loss/policy_avg": -2.7287680495646782e-05, "loss/value_avg": 0.19735120236873627, "lr": 2.004e-06, "objective/entropy": 88.80266571044922, "objective/kl": 25.791894912719727, "objective/non_score_reward": -1.2895946502685547, "objective/rlhf_reward": -12.917926788330078, "objective/scores": -11.628332138061523, "policy/approxkl_avg": 1.1558631740626879e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9697611331939697, "step": 333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999111890792847, "val/ratio_var": NaN }, { "episode": 334, "epoch": 0.06272300469483567, "eps": 0, "loss/policy_avg": 7.759850268485025e-05, "loss/value_avg": 0.31862080097198486, "lr": 2.001e-06, "objective/entropy": 105.14110565185547, "objective/kl": 14.621528625488281, "objective/non_score_reward": -0.7310763597488403, "objective/rlhf_reward": -11.060364723205566, "objective/scores": -10.329288482666016, "policy/approxkl_avg": 2.1785479020763887e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9543180465698242, "step": 334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998273253440857, "val/ratio_var": NaN }, { "episode": 335, "epoch": 0.06291079812206572, "eps": 0, "loss/policy_avg": -3.042760909011122e-05, "loss/value_avg": 0.21673069894313812, "lr": 1.998e-06, "objective/entropy": 93.67596435546875, "objective/kl": 9.817207336425781, "objective/non_score_reward": -0.49086037278175354, "objective/rlhf_reward": -10.80578327178955, "objective/scores": -10.314923286437988, "policy/approxkl_avg": 1.045323898551942e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9169830083847046, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999572038650513, "val/ratio_var": NaN }, { "episode": 336, "epoch": 0.06309859154929577, "eps": 0, "loss/policy_avg": 8.953742508310825e-05, "loss/value_avg": 0.13986703753471375, "lr": 1.995e-06, "objective/entropy": 41.171348571777344, "objective/kl": 12.86909294128418, "objective/non_score_reward": -0.6434546113014221, "objective/rlhf_reward": -11.079334259033203, "objective/scores": -10.435879707336426, "policy/approxkl_avg": 5.5041230240249206e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1071478128433228, "step": 336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00009286403656, "val/ratio_var": NaN }, { "episode": 337, "epoch": 0.06328638497652582, "eps": 0, "loss/policy_avg": -7.016253948677331e-05, "loss/value_avg": 0.16413457691669464, "lr": 1.992e-06, "objective/entropy": 118.90145874023438, "objective/kl": 24.713003158569336, "objective/non_score_reward": -1.2356501817703247, "objective/rlhf_reward": -11.666977882385254, "objective/scores": -10.431327819824219, "policy/approxkl_avg": 1.6034316274726734e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3214550018310547, "step": 337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999780058860779, "val/ratio_var": NaN }, { "episode": 338, "epoch": 0.06347417840375587, "eps": 0, "loss/policy_avg": 8.70542717166245e-05, "loss/value_avg": 0.09194015711545944, "lr": 1.9890000000000004e-06, "objective/entropy": 99.39573669433594, "objective/kl": 19.484397888183594, "objective/non_score_reward": -0.9742199182510376, "objective/rlhf_reward": -11.96489143371582, "objective/scores": -10.990671157836914, "policy/approxkl_avg": 1.4541762993758311e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6603519916534424, "step": 338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000078678131104, "val/ratio_var": NaN }, { "episode": 339, "epoch": 0.06366197183098592, "eps": 0, "loss/policy_avg": 9.378397226100788e-05, "loss/value_avg": 1.1213847398757935, "lr": 1.986e-06, "objective/entropy": 77.70364379882812, "objective/kl": 12.618650436401367, "objective/non_score_reward": -0.6309325695037842, "objective/rlhf_reward": -10.27947998046875, "objective/scores": -9.648547172546387, "policy/approxkl_avg": 8.36525302361224e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6374765634536743, "step": 339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000253915786743, "val/ratio_var": NaN }, { "episode": 340, "epoch": 0.06384976525821597, "eps": 0, "loss/policy_avg": 3.757116792257875e-05, "loss/value_avg": 0.21783630549907684, "lr": 1.9830000000000003e-06, "objective/entropy": 83.5167236328125, "objective/kl": 22.622703552246094, "objective/non_score_reward": -1.1311352252960205, "objective/rlhf_reward": -12.23100757598877, "objective/scores": -11.099872589111328, "policy/approxkl_avg": 7.734202966958037e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.409287929534912, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998835325241089, "val/ratio_var": NaN }, { "episode": 341, "epoch": 0.06403755868544601, "eps": 0, "loss/policy_avg": -3.40263795806095e-05, "loss/value_avg": 0.15510261058807373, "lr": 1.98e-06, "objective/entropy": 88.54486083984375, "objective/kl": 6.3221940994262695, "objective/non_score_reward": -0.31610971689224243, "objective/rlhf_reward": -12.223749160766602, "objective/scores": -11.907639503479004, "policy/approxkl_avg": 5.915083889362904e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6562039852142334, "step": 341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000636577606201, "val/ratio_var": NaN }, { "episode": 342, "epoch": 0.06422535211267606, "eps": 0, "loss/policy_avg": -6.4993801061064e-05, "loss/value_avg": 0.15332335233688354, "lr": 1.9770000000000003e-06, "objective/entropy": 39.82926940917969, "objective/kl": 9.175437927246094, "objective/non_score_reward": -0.4587719440460205, "objective/rlhf_reward": -12.78089427947998, "objective/scores": -12.322122573852539, "policy/approxkl_avg": 3.981064367053477e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.835628867149353, "step": 342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999899864196777, "val/ratio_var": NaN }, { "episode": 343, "epoch": 0.0644131455399061, "eps": 0, "loss/policy_avg": -4.849343895330094e-05, "loss/value_avg": 0.1575630009174347, "lr": 1.974e-06, "objective/entropy": 54.492919921875, "objective/kl": 11.97981071472168, "objective/non_score_reward": -0.5989905595779419, "objective/rlhf_reward": -10.92995834350586, "objective/scores": -10.330967903137207, "policy/approxkl_avg": 6.111731210012294e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.277095079421997, "step": 343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999950647354126, "val/ratio_var": NaN }, { "episode": 344, "epoch": 0.06460093896713615, "eps": 0, "loss/policy_avg": 8.209696534322575e-05, "loss/value_avg": 0.18327149748802185, "lr": 1.9710000000000003e-06, "objective/entropy": 125.25948333740234, "objective/kl": 15.904701232910156, "objective/non_score_reward": -0.7952351570129395, "objective/rlhf_reward": -12.299369812011719, "objective/scores": -11.504135131835938, "policy/approxkl_avg": 1.6076374720341846e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9977221488952637, "step": 344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000780820846558, "val/ratio_var": NaN }, { "episode": 345, "epoch": 0.0647887323943662, "eps": 0, "loss/policy_avg": 0.000156690482981503, "loss/value_avg": 0.1842001974582672, "lr": 1.968e-06, "objective/entropy": 92.740478515625, "objective/kl": 17.197975158691406, "objective/non_score_reward": -0.8598987460136414, "objective/rlhf_reward": -11.407084465026855, "objective/scores": -10.547185897827148, "policy/approxkl_avg": 1.2426228579442977e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8086490631103516, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999423027038574, "val/ratio_var": NaN }, { "episode": 346, "epoch": 0.06497652582159624, "eps": 0, "loss/policy_avg": 2.4392918930971064e-05, "loss/value_avg": 0.14106081426143646, "lr": 1.9650000000000002e-06, "objective/entropy": 95.7754135131836, "objective/kl": 17.710771560668945, "objective/non_score_reward": -0.8855385780334473, "objective/rlhf_reward": -11.64472770690918, "objective/scores": -10.75918960571289, "policy/approxkl_avg": 9.591305882850065e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9439259767532349, "step": 346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001230239868164, "val/ratio_var": NaN }, { "episode": 347, "epoch": 0.06516431924882629, "eps": 0, "loss/policy_avg": 1.7597989426576532e-05, "loss/value_avg": 0.17697274684906006, "lr": 1.962e-06, "objective/entropy": 106.55509948730469, "objective/kl": 23.57567024230957, "objective/non_score_reward": -1.178783655166626, "objective/rlhf_reward": -12.447393417358398, "objective/scores": -11.268610000610352, "policy/approxkl_avg": 6.997457546731312e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1949808597564697, "step": 347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000042200088501, "val/ratio_var": NaN }, { "episode": 348, "epoch": 0.06535211267605634, "eps": 0, "loss/policy_avg": 4.317625644034706e-05, "loss/value_avg": 0.10970550775527954, "lr": 1.9590000000000002e-06, "objective/entropy": 87.21656036376953, "objective/kl": 15.508825302124023, "objective/non_score_reward": -0.7754412293434143, "objective/rlhf_reward": -11.038070678710938, "objective/scores": -10.262629508972168, "policy/approxkl_avg": 7.128132750722216e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7274330854415894, "step": 348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000593662261963, "val/ratio_var": NaN }, { "episode": 349, "epoch": 0.06553990610328639, "eps": 0, "loss/policy_avg": -1.8637136236066e-05, "loss/value_avg": 0.12610411643981934, "lr": 1.956e-06, "objective/entropy": 89.1832046508789, "objective/kl": 13.556708335876465, "objective/non_score_reward": -0.6778354644775391, "objective/rlhf_reward": -12.148648262023926, "objective/scores": -11.470812797546387, "policy/approxkl_avg": 9.832786673769078e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6674344539642334, "step": 349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999503493309021, "val/ratio_var": NaN }, { "episode": 350, "epoch": 0.06572769953051644, "eps": 0, "loss/policy_avg": -2.103481710946653e-05, "loss/value_avg": 0.1733969897031784, "lr": 1.953e-06, "objective/entropy": 77.1535415649414, "objective/kl": 17.23231315612793, "objective/non_score_reward": -0.8616156578063965, "objective/rlhf_reward": -11.561912536621094, "objective/scores": -10.700296401977539, "policy/approxkl_avg": 1.0108938397479506e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6845334768295288, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000652074813843, "val/ratio_var": NaN }, { "episode": 351, "epoch": 0.06591549295774649, "eps": 0, "loss/policy_avg": 2.4804528948152438e-05, "loss/value_avg": 0.11714835464954376, "lr": 1.95e-06, "objective/entropy": 105.48739624023438, "objective/kl": 25.294902801513672, "objective/non_score_reward": -1.2647451162338257, "objective/rlhf_reward": -11.909392356872559, "objective/scores": -10.644647598266602, "policy/approxkl_avg": 1.0200677280636228e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1970906257629395, "step": 351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000065803527832, "val/ratio_var": NaN }, { "episode": 352, "epoch": 0.06610328638497652, "eps": 0, "loss/policy_avg": -6.140402888377139e-07, "loss/value_avg": 0.14413012564182281, "lr": 1.947e-06, "objective/entropy": 111.27120971679688, "objective/kl": 27.18610191345215, "objective/non_score_reward": -1.3593051433563232, "objective/rlhf_reward": -12.909260749816895, "objective/scores": -11.549955368041992, "policy/approxkl_avg": 7.510158894774577e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6138041019439697, "step": 352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999135136604309, "val/ratio_var": NaN }, { "episode": 353, "epoch": 0.06629107981220657, "eps": 0, "loss/policy_avg": 1.5825595255591907e-05, "loss/value_avg": 0.21331319212913513, "lr": 1.944e-06, "objective/entropy": 97.41116333007812, "objective/kl": 15.11487102508545, "objective/non_score_reward": -0.7557436227798462, "objective/rlhf_reward": -10.76768970489502, "objective/scores": -10.011945724487305, "policy/approxkl_avg": 8.266350448593585e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.847161054611206, "step": 353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999752640724182, "val/ratio_var": NaN }, { "episode": 354, "epoch": 0.06647887323943662, "eps": 0, "loss/policy_avg": 1.2343784874246921e-05, "loss/value_avg": 0.15079335868358612, "lr": 1.941e-06, "objective/entropy": 17.948049545288086, "objective/kl": 11.56285285949707, "objective/non_score_reward": -0.5781425833702087, "objective/rlhf_reward": -12.453828811645508, "objective/scores": -11.875686645507812, "policy/approxkl_avg": 1.8643737220713774e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6217778325080872, "step": 354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000112056732178, "val/ratio_var": NaN }, { "episode": 355, "epoch": 0.06666666666666667, "eps": 0, "loss/policy_avg": 9.64290666161105e-05, "loss/value_avg": 0.26194146275520325, "lr": 1.938e-06, "objective/entropy": 92.04432678222656, "objective/kl": 20.520401000976562, "objective/non_score_reward": -1.0260200500488281, "objective/rlhf_reward": -12.656539916992188, "objective/scores": -11.63051986694336, "policy/approxkl_avg": 9.329647099320937e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9629689455032349, "step": 355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999778270721436, "val/ratio_var": NaN }, { "episode": 356, "epoch": 0.06685446009389671, "eps": 0, "loss/policy_avg": 7.142210961319506e-05, "loss/value_avg": 0.2566719055175781, "lr": 1.935e-06, "objective/entropy": 107.46831512451172, "objective/kl": 17.88418960571289, "objective/non_score_reward": -0.8942095041275024, "objective/rlhf_reward": -12.328539848327637, "objective/scores": -11.434329986572266, "policy/approxkl_avg": 8.116602856489408e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8410276174545288, "step": 356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000407695770264, "val/ratio_var": NaN }, { "episode": 357, "epoch": 0.06704225352112676, "eps": 0, "loss/policy_avg": 8.39953136164695e-05, "loss/value_avg": 0.20364686846733093, "lr": 1.9320000000000003e-06, "objective/entropy": 98.07715606689453, "objective/kl": 19.43822479248047, "objective/non_score_reward": -0.9719113111495972, "objective/rlhf_reward": -12.011107444763184, "objective/scores": -11.039196014404297, "policy/approxkl_avg": 1.2859308640145173e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.005352735519409, "step": 357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998507499694824, "val/ratio_var": NaN }, { "episode": 358, "epoch": 0.06723004694835681, "eps": 0, "loss/policy_avg": -0.00011117503163404763, "loss/value_avg": 0.6335130929946899, "lr": 1.929e-06, "objective/entropy": 121.08682250976562, "objective/kl": 7.801116466522217, "objective/non_score_reward": -0.3900558352470398, "objective/rlhf_reward": -10.045238494873047, "objective/scores": -9.655182838439941, "policy/approxkl_avg": 1.0905139902206429e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.144524335861206, "step": 358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999713897705078, "val/ratio_var": NaN }, { "episode": 359, "epoch": 0.06741784037558686, "eps": 0, "loss/policy_avg": -3.874976755469106e-05, "loss/value_avg": 0.5734544992446899, "lr": 1.9260000000000003e-06, "objective/entropy": 79.31658172607422, "objective/kl": 28.342008590698242, "objective/non_score_reward": -1.417100429534912, "objective/rlhf_reward": -10.318595886230469, "objective/scores": -8.901494979858398, "policy/approxkl_avg": 5.070779707239126e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8302485942840576, "step": 359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999002814292908, "val/ratio_var": NaN }, { "episode": 360, "epoch": 0.0676056338028169, "eps": 0, "loss/policy_avg": 4.11609426009818e-06, "loss/value_avg": 1.1067767143249512, "lr": 1.923e-06, "objective/entropy": 52.45967102050781, "objective/kl": 17.15128517150879, "objective/non_score_reward": -0.8575642704963684, "objective/rlhf_reward": -8.895776748657227, "objective/scores": -8.038212776184082, "policy/approxkl_avg": 3.1446131032453195e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3310662508010864, "step": 360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999565482139587, "val/ratio_var": NaN }, { "episode": 361, "epoch": 0.06779342723004694, "eps": 0, "loss/policy_avg": 4.3086285586468875e-05, "loss/value_avg": 0.14358662068843842, "lr": 1.9200000000000003e-06, "objective/entropy": 93.87188720703125, "objective/kl": 17.269365310668945, "objective/non_score_reward": -0.8634682297706604, "objective/rlhf_reward": -11.41604232788086, "objective/scores": -10.552574157714844, "policy/approxkl_avg": 1.343457256552938e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9181207418441772, "step": 361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000685453414917, "val/ratio_var": NaN }, { "episode": 362, "epoch": 0.06798122065727699, "eps": 0, "loss/policy_avg": -2.267225681862328e-06, "loss/value_avg": 0.18536505103111267, "lr": 1.917e-06, "objective/entropy": 119.77464294433594, "objective/kl": 23.676219940185547, "objective/non_score_reward": -1.1838109493255615, "objective/rlhf_reward": -11.879217147827148, "objective/scores": -10.695405960083008, "policy/approxkl_avg": 1.1125500520847709e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2441914081573486, "step": 362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999244213104248, "val/ratio_var": NaN }, { "episode": 363, "epoch": 0.06816901408450704, "eps": 0, "loss/policy_avg": 3.195708632119931e-05, "loss/value_avg": 0.13429994881153107, "lr": 1.9140000000000002e-06, "objective/entropy": 111.18460083007812, "objective/kl": 17.367687225341797, "objective/non_score_reward": -0.8683844208717346, "objective/rlhf_reward": -11.543015480041504, "objective/scores": -10.674631118774414, "policy/approxkl_avg": 8.630897951888983e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9168493747711182, "step": 363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999997019767761, "val/ratio_var": NaN }, { "episode": 364, "epoch": 0.06835680751173709, "eps": 0, "loss/policy_avg": -2.9644876121892594e-05, "loss/value_avg": 0.14832572638988495, "lr": 1.911e-06, "objective/entropy": 90.40093231201172, "objective/kl": 21.04558563232422, "objective/non_score_reward": -1.0522794723510742, "objective/rlhf_reward": -11.717867851257324, "objective/scores": -10.66558837890625, "policy/approxkl_avg": 8.710637899866924e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8276851177215576, "step": 364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000025749206543, "val/ratio_var": NaN }, { "episode": 365, "epoch": 0.06854460093896714, "eps": 0, "loss/policy_avg": 6.385569577105343e-05, "loss/value_avg": 0.36757004261016846, "lr": 1.908e-06, "objective/entropy": 118.4820556640625, "objective/kl": 22.031005859375, "objective/non_score_reward": -1.1015503406524658, "objective/rlhf_reward": -10.58051586151123, "objective/scores": -9.478965759277344, "policy/approxkl_avg": 8.275595320128559e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9265966415405273, "step": 365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00002121925354, "val/ratio_var": NaN }, { "episode": 366, "epoch": 0.06873239436619719, "eps": 0, "loss/policy_avg": 2.088186920445878e-05, "loss/value_avg": 0.2165653258562088, "lr": 1.905e-06, "objective/entropy": 54.49293518066406, "objective/kl": 16.79452133178711, "objective/non_score_reward": -0.8397260308265686, "objective/rlhf_reward": -10.596758842468262, "objective/scores": -9.75703239440918, "policy/approxkl_avg": 3.4214522770525946e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4197261333465576, "step": 366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000028133392334, "val/ratio_var": NaN }, { "episode": 367, "epoch": 0.06892018779342723, "eps": 0, "loss/policy_avg": -1.4979884326749016e-05, "loss/value_avg": 0.3255368769168854, "lr": 1.9020000000000002e-06, "objective/entropy": 87.25562286376953, "objective/kl": 18.187536239624023, "objective/non_score_reward": -0.909376859664917, "objective/rlhf_reward": -10.706585884094238, "objective/scores": -9.797208786010742, "policy/approxkl_avg": 4.7593882612773086e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8448601961135864, "step": 367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999697804450989, "val/ratio_var": NaN }, { "episode": 368, "epoch": 0.06910798122065728, "eps": 0, "loss/policy_avg": 3.6079927667742595e-05, "loss/value_avg": 0.13576920330524445, "lr": 1.8990000000000002e-06, "objective/entropy": 127.78015899658203, "objective/kl": 2.4967360496520996, "objective/non_score_reward": -0.12483682483434677, "objective/rlhf_reward": -11.721183776855469, "objective/scores": -11.596346855163574, "policy/approxkl_avg": 6.982443068181965e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2296948432922363, "step": 368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000028610229492, "val/ratio_var": NaN }, { "episode": 369, "epoch": 0.06929577464788732, "eps": 0, "loss/policy_avg": -4.200665352982469e-05, "loss/value_avg": 0.12653657793998718, "lr": 1.8960000000000001e-06, "objective/entropy": 119.47264099121094, "objective/kl": 25.929649353027344, "objective/non_score_reward": -1.2964825630187988, "objective/rlhf_reward": -11.651063919067383, "objective/scores": -10.354581832885742, "policy/approxkl_avg": 9.127786881890643e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9985398054122925, "step": 369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000272989273071, "val/ratio_var": NaN }, { "episode": 370, "epoch": 0.06948356807511737, "eps": 0, "loss/policy_avg": 2.681084424693836e-06, "loss/value_avg": 0.09405173361301422, "lr": 1.8930000000000001e-06, "objective/entropy": 68.8045883178711, "objective/kl": 16.792978286743164, "objective/non_score_reward": -0.8396489024162292, "objective/rlhf_reward": -12.445840835571289, "objective/scores": -11.606191635131836, "policy/approxkl_avg": 7.220749154157602e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.484015703201294, "step": 370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999970018863678, "val/ratio_var": NaN }, { "episode": 371, "epoch": 0.06967136150234741, "eps": 0, "loss/policy_avg": 2.2267395252129063e-05, "loss/value_avg": 0.09185933321714401, "lr": 1.8900000000000001e-06, "objective/entropy": 92.62086486816406, "objective/kl": 17.225337982177734, "objective/non_score_reward": -0.8612668514251709, "objective/rlhf_reward": -10.773884773254395, "objective/scores": -9.912617683410645, "policy/approxkl_avg": 6.766797611135189e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8429784774780273, "step": 371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999905228614807, "val/ratio_var": NaN }, { "episode": 372, "epoch": 0.06985915492957746, "eps": 0, "loss/policy_avg": -3.3983644243562594e-05, "loss/value_avg": 0.4706655442714691, "lr": 1.887e-06, "objective/entropy": 125.04110717773438, "objective/kl": 28.504024505615234, "objective/non_score_reward": -1.425201177597046, "objective/rlhf_reward": -13.834808349609375, "objective/scores": -12.40960693359375, "policy/approxkl_avg": 9.079005280909769e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0610835552215576, "step": 372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999620318412781, "val/ratio_var": NaN }, { "episode": 373, "epoch": 0.07004694835680751, "eps": 0, "loss/policy_avg": 3.497105717542581e-05, "loss/value_avg": 0.1366083323955536, "lr": 1.884e-06, "objective/entropy": 75.95612335205078, "objective/kl": 7.768867492675781, "objective/non_score_reward": -0.38844338059425354, "objective/rlhf_reward": -11.76013469696045, "objective/scores": -11.371691703796387, "policy/approxkl_avg": 1.1226099161376624e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.972951054573059, "step": 373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999193549156189, "val/ratio_var": NaN }, { "episode": 374, "epoch": 0.07023474178403756, "eps": 0, "loss/policy_avg": 1.9536828403943218e-05, "loss/value_avg": 0.13411036133766174, "lr": 1.881e-06, "objective/entropy": 51.03824996948242, "objective/kl": 16.398880004882812, "objective/non_score_reward": -0.8199440240859985, "objective/rlhf_reward": -11.05185317993164, "objective/scores": -10.231908798217773, "policy/approxkl_avg": 3.734641751407253e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2041015625, "step": 374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000049352645874, "val/ratio_var": NaN }, { "episode": 375, "epoch": 0.07042253521126761, "eps": 0, "loss/policy_avg": 6.267259595915675e-05, "loss/value_avg": 0.19538144767284393, "lr": 1.878e-06, "objective/entropy": 107.43186950683594, "objective/kl": 21.321125030517578, "objective/non_score_reward": -1.0660563707351685, "objective/rlhf_reward": -11.602822303771973, "objective/scores": -10.536766052246094, "policy/approxkl_avg": 1.3116505215293728e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.257579803466797, "step": 375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000219345092773, "val/ratio_var": NaN }, { "episode": 376, "epoch": 0.07061032863849766, "eps": 0, "loss/policy_avg": 8.511093619745225e-05, "loss/value_avg": 0.24279683828353882, "lr": 1.875e-06, "objective/entropy": 128.26280212402344, "objective/kl": 12.835189819335938, "objective/non_score_reward": -0.64175945520401, "objective/rlhf_reward": -10.877547264099121, "objective/scores": -10.235787391662598, "policy/approxkl_avg": 9.733297190450685e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1677982807159424, "step": 376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000277757644653, "val/ratio_var": NaN }, { "episode": 377, "epoch": 0.0707981220657277, "eps": 0, "loss/policy_avg": 7.388726771750953e-06, "loss/value_avg": 0.7606155872344971, "lr": 1.872e-06, "objective/entropy": 31.832406997680664, "objective/kl": 8.04755687713623, "objective/non_score_reward": -0.4023779034614563, "objective/rlhf_reward": -8.705284118652344, "objective/scores": -8.302906036376953, "policy/approxkl_avg": 2.4873806125924602e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8697509765625, "step": 377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000274181365967, "val/ratio_var": NaN }, { "episode": 378, "epoch": 0.07098591549295774, "eps": 0, "loss/policy_avg": 6.491283420473337e-05, "loss/value_avg": 0.10807924717664719, "lr": 1.869e-06, "objective/entropy": 41.36906433105469, "objective/kl": 4.79929256439209, "objective/non_score_reward": -0.23996463418006897, "objective/rlhf_reward": -11.413307189941406, "objective/scores": -11.17334270477295, "policy/approxkl_avg": 3.1623837770666796e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.202500820159912, "step": 378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999826550483704, "val/ratio_var": NaN }, { "episode": 379, "epoch": 0.07117370892018779, "eps": 0, "loss/policy_avg": 8.1944017438218e-05, "loss/value_avg": 0.12314139306545258, "lr": 1.866e-06, "objective/entropy": 89.91000366210938, "objective/kl": 14.516233444213867, "objective/non_score_reward": -0.7258116602897644, "objective/rlhf_reward": -10.836909294128418, "objective/scores": -10.11109733581543, "policy/approxkl_avg": 6.294542487239596e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.861639142036438, "step": 379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000317096710205, "val/ratio_var": NaN }, { "episode": 380, "epoch": 0.07136150234741784, "eps": 0, "loss/policy_avg": -3.87722575396765e-05, "loss/value_avg": 0.08833552151918411, "lr": 1.863e-06, "objective/entropy": 87.2801742553711, "objective/kl": 10.23237133026123, "objective/non_score_reward": -0.5116186141967773, "objective/rlhf_reward": -11.772082328796387, "objective/scores": -11.26046371459961, "policy/approxkl_avg": 5.0312525701201594e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5008361339569092, "step": 380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999688267707825, "val/ratio_var": NaN }, { "episode": 381, "epoch": 0.07154929577464789, "eps": 0, "loss/policy_avg": 6.616340397158638e-05, "loss/value_avg": 0.17606692016124725, "lr": 1.86e-06, "objective/entropy": 104.36798858642578, "objective/kl": 20.077564239501953, "objective/non_score_reward": -1.0038782358169556, "objective/rlhf_reward": -12.289064407348633, "objective/scores": -11.285185813903809, "policy/approxkl_avg": 8.339670642953934e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0921308994293213, "step": 381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999552369117737, "val/ratio_var": NaN }, { "episode": 382, "epoch": 0.07173708920187793, "eps": 0, "loss/policy_avg": 6.695963293168461e-06, "loss/value_avg": 0.08466286957263947, "lr": 1.857e-06, "objective/entropy": 81.63018035888672, "objective/kl": 21.129411697387695, "objective/non_score_reward": -1.056470513343811, "objective/rlhf_reward": -10.891043663024902, "objective/scores": -9.834572792053223, "policy/approxkl_avg": 7.412702984765929e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6487668752670288, "step": 382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000512599945068, "val/ratio_var": NaN }, { "episode": 383, "epoch": 0.07192488262910798, "eps": 0, "loss/policy_avg": -5.0513248424977064e-05, "loss/value_avg": 0.22872933745384216, "lr": 1.854e-06, "objective/entropy": 125.1011962890625, "objective/kl": 38.681182861328125, "objective/non_score_reward": -1.9340593814849854, "objective/rlhf_reward": -11.04796028137207, "objective/scores": -9.113901138305664, "policy/approxkl_avg": 1.461828418314326e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4533069133758545, "step": 383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998778104782104, "val/ratio_var": NaN }, { "episode": 384, "epoch": 0.07211267605633803, "eps": 0, "loss/policy_avg": -6.288852455327287e-05, "loss/value_avg": 0.2413790374994278, "lr": 1.851e-06, "objective/entropy": 62.36920166015625, "objective/kl": 13.012612342834473, "objective/non_score_reward": -0.6506307125091553, "objective/rlhf_reward": -12.502799987792969, "objective/scores": -11.852169036865234, "policy/approxkl_avg": 6.362729010334078e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.251529335975647, "step": 384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000334978103638, "val/ratio_var": NaN }, { "episode": 385, "epoch": 0.07230046948356808, "eps": 0, "loss/policy_avg": -0.00013326249609235674, "loss/value_avg": 0.26707586646080017, "lr": 1.848e-06, "objective/entropy": 139.49517822265625, "objective/kl": 23.778972625732422, "objective/non_score_reward": -1.188948631286621, "objective/rlhf_reward": -11.765488624572754, "objective/scores": -10.576539993286133, "policy/approxkl_avg": 1.8046020500150917e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.7069575786590576, "step": 385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000070333480835, "val/ratio_var": NaN }, { "episode": 386, "epoch": 0.07248826291079812, "eps": 0, "loss/policy_avg": 1.4817939700151328e-05, "loss/value_avg": 0.2079947143793106, "lr": 1.8450000000000001e-06, "objective/entropy": 103.80860900878906, "objective/kl": 5.672878742218018, "objective/non_score_reward": -0.283643901348114, "objective/rlhf_reward": -11.992891311645508, "objective/scores": -11.709247589111328, "policy/approxkl_avg": 1.0324607302436561e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9612599611282349, "step": 386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999933242797852, "val/ratio_var": NaN }, { "episode": 387, "epoch": 0.07267605633802816, "eps": 0, "loss/policy_avg": 3.9190617826534435e-05, "loss/value_avg": 0.15857373178005219, "lr": 1.8420000000000001e-06, "objective/entropy": 93.10989379882812, "objective/kl": 14.546370506286621, "objective/non_score_reward": -0.7273185849189758, "objective/rlhf_reward": -11.128599166870117, "objective/scores": -10.401280403137207, "policy/approxkl_avg": 6.572307142960199e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7075679302215576, "step": 387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999135136604309, "val/ratio_var": NaN }, { "episode": 388, "epoch": 0.07286384976525821, "eps": 0, "loss/policy_avg": -4.7404810175066814e-05, "loss/value_avg": 0.2618570923805237, "lr": 1.839e-06, "objective/entropy": 101.35771179199219, "objective/kl": 28.384435653686523, "objective/non_score_reward": -1.4192218780517578, "objective/rlhf_reward": -10.697081565856934, "objective/scores": -9.277859687805176, "policy/approxkl_avg": 5.3096137264674326e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4455244541168213, "step": 388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999750256538391, "val/ratio_var": NaN }, { "episode": 389, "epoch": 0.07305164319248826, "eps": 0, "loss/policy_avg": 7.006357191130519e-05, "loss/value_avg": 0.18316581845283508, "lr": 1.836e-06, "objective/entropy": 118.5138931274414, "objective/kl": 26.347827911376953, "objective/non_score_reward": -1.3173913955688477, "objective/rlhf_reward": -12.110636711120605, "objective/scores": -10.793245315551758, "policy/approxkl_avg": 2.248353467848574e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8303338289260864, "step": 389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000115633010864, "val/ratio_var": NaN }, { "episode": 390, "epoch": 0.07323943661971831, "eps": 0, "loss/policy_avg": 4.881170389126055e-05, "loss/value_avg": 0.1928633153438568, "lr": 1.833e-06, "objective/entropy": 112.5538330078125, "objective/kl": 11.713822364807129, "objective/non_score_reward": -0.5856910943984985, "objective/rlhf_reward": -12.279748916625977, "objective/scores": -11.69405746459961, "policy/approxkl_avg": 1.6650049872168893e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0383808612823486, "step": 390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999577403068542, "val/ratio_var": NaN }, { "episode": 391, "epoch": 0.07342723004694836, "eps": 0, "loss/policy_avg": -3.171866774209775e-05, "loss/value_avg": 0.17396102845668793, "lr": 1.83e-06, "objective/entropy": 106.31515502929688, "objective/kl": 20.46021842956543, "objective/non_score_reward": -1.0230108499526978, "objective/rlhf_reward": -12.226780891418457, "objective/scores": -11.20376968383789, "policy/approxkl_avg": 7.106498145503792e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0946805477142334, "step": 391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999393820762634, "val/ratio_var": NaN }, { "episode": 392, "epoch": 0.0736150234741784, "eps": 0, "loss/policy_avg": -9.357254748465493e-05, "loss/value_avg": 0.34127873182296753, "lr": 1.827e-06, "objective/entropy": 95.57344055175781, "objective/kl": 16.045089721679688, "objective/non_score_reward": -0.8022545576095581, "objective/rlhf_reward": -10.454841613769531, "objective/scores": -9.652586936950684, "policy/approxkl_avg": 1.4493473088350584e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7841037511825562, "step": 392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000014305114746, "val/ratio_var": NaN }, { "episode": 393, "epoch": 0.07380281690140846, "eps": 0, "loss/policy_avg": 9.5367431640625e-05, "loss/value_avg": 0.10603903979063034, "lr": 1.824e-06, "objective/entropy": 104.6521987915039, "objective/kl": 17.672561645507812, "objective/non_score_reward": -0.8836281299591064, "objective/rlhf_reward": -11.617555618286133, "objective/scores": -10.733927726745605, "policy/approxkl_avg": 8.574273380190789e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.789292812347412, "step": 393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999439716339111, "val/ratio_var": NaN }, { "episode": 394, "epoch": 0.0739906103286385, "eps": 0, "loss/policy_avg": -6.264560943236575e-05, "loss/value_avg": 0.2619468867778778, "lr": 1.821e-06, "objective/entropy": 107.70478057861328, "objective/kl": 17.318248748779297, "objective/non_score_reward": -0.8659125566482544, "objective/rlhf_reward": -12.74501895904541, "objective/scores": -11.879106521606445, "policy/approxkl_avg": 8.258828643192828e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8288758993148804, "step": 394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000759363174438, "val/ratio_var": NaN }, { "episode": 395, "epoch": 0.07417840375586854, "eps": 0, "loss/policy_avg": -2.6990783226210624e-06, "loss/value_avg": 0.2550525963306427, "lr": 1.818e-06, "objective/entropy": 112.80101013183594, "objective/kl": 10.695595741271973, "objective/non_score_reward": -0.5347797870635986, "objective/rlhf_reward": -11.576695442199707, "objective/scores": -11.041915893554688, "policy/approxkl_avg": 9.402325673590894e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1394758224487305, "step": 395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000473260879517, "val/ratio_var": NaN }, { "episode": 396, "epoch": 0.07436619718309859, "eps": 0, "loss/policy_avg": -4.711690780823119e-05, "loss/value_avg": 0.07873846590518951, "lr": 1.815e-06, "objective/entropy": 72.32992553710938, "objective/kl": 19.009279251098633, "objective/non_score_reward": -0.9504640102386475, "objective/rlhf_reward": -11.912002563476562, "objective/scores": -10.961538314819336, "policy/approxkl_avg": 5.320666929264917e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6275842189788818, "step": 396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000380277633667, "val/ratio_var": NaN }, { "episode": 397, "epoch": 0.07455399061032864, "eps": 0, "loss/policy_avg": 0.00013668132305610925, "loss/value_avg": 0.19570735096931458, "lr": 1.812e-06, "objective/entropy": 95.70882415771484, "objective/kl": 14.90481185913086, "objective/non_score_reward": -0.7452405691146851, "objective/rlhf_reward": -11.954400062561035, "objective/scores": -11.209159851074219, "policy/approxkl_avg": 8.008413487914368e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.897771954536438, "step": 397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999487400054932, "val/ratio_var": NaN }, { "episode": 398, "epoch": 0.07474178403755868, "eps": 0, "loss/policy_avg": 3.3738477213773876e-05, "loss/value_avg": 0.34380683302879333, "lr": 1.809e-06, "objective/entropy": 87.85105895996094, "objective/kl": 13.421937942504883, "objective/non_score_reward": -0.6710968613624573, "objective/rlhf_reward": -10.380558967590332, "objective/scores": -9.70946216583252, "policy/approxkl_avg": 6.930507367997052e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7466557025909424, "step": 398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999150633811951, "val/ratio_var": NaN }, { "episode": 399, "epoch": 0.07492957746478873, "eps": 0, "loss/policy_avg": -3.059855225728825e-05, "loss/value_avg": 0.35521942377090454, "lr": 1.806e-06, "objective/entropy": 89.25804138183594, "objective/kl": 24.681074142456055, "objective/non_score_reward": -1.234053611755371, "objective/rlhf_reward": -10.760666847229004, "objective/scores": -9.526613235473633, "policy/approxkl_avg": 8.289492825497291e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6503560543060303, "step": 399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000125527381897, "val/ratio_var": NaN }, { "episode": 400, "epoch": 0.07511737089201878, "eps": 0, "loss/policy_avg": 4.862389323534444e-05, "loss/value_avg": 1.4206515550613403, "lr": 1.803e-06, "objective/entropy": 124.14198303222656, "objective/kl": 11.919878005981445, "objective/non_score_reward": -0.5959939956665039, "objective/rlhf_reward": -9.24152946472168, "objective/scores": -8.645535469055176, "policy/approxkl_avg": 8.351021563157701e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.296027421951294, "step": 400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999080896377563, "val/ratio_var": NaN }, { "episode": 401, "epoch": 0.07530516431924883, "eps": 0, "loss/policy_avg": -2.2492318407785206e-07, "loss/value_avg": 0.22648631036281586, "lr": 1.8e-06, "objective/entropy": 101.82011413574219, "objective/kl": 22.137622833251953, "objective/non_score_reward": -1.1068811416625977, "objective/rlhf_reward": -12.476191520690918, "objective/scores": -11.36931037902832, "policy/approxkl_avg": 9.247968080217106e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8717156648635864, "step": 401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999576807022095, "val/ratio_var": NaN }, { "episode": 402, "epoch": 0.07549295774647888, "eps": 0, "loss/policy_avg": 1.367308050248539e-05, "loss/value_avg": 0.26550164818763733, "lr": 1.797e-06, "objective/entropy": 88.95469665527344, "objective/kl": 12.474691390991211, "objective/non_score_reward": -0.6237345933914185, "objective/rlhf_reward": -11.82309341430664, "objective/scores": -11.199358940124512, "policy/approxkl_avg": 8.191084788222724e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7474181652069092, "step": 402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000023365020752, "val/ratio_var": NaN }, { "episode": 403, "epoch": 0.07568075117370893, "eps": 0, "loss/policy_avg": 0.00012090970994904637, "loss/value_avg": 0.42484259605407715, "lr": 1.7939999999999999e-06, "objective/entropy": 102.27558135986328, "objective/kl": 17.459030151367188, "objective/non_score_reward": -0.8729516267776489, "objective/rlhf_reward": -11.654223442077637, "objective/scores": -10.781271934509277, "policy/approxkl_avg": 1.555601158997888e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9129846096038818, "step": 403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000673532485962, "val/ratio_var": NaN }, { "episode": 404, "epoch": 0.07586854460093896, "eps": 0, "loss/policy_avg": 4.413667556946166e-05, "loss/value_avg": 0.5576642751693726, "lr": 1.7909999999999999e-06, "objective/entropy": 103.5799789428711, "objective/kl": 11.902374267578125, "objective/non_score_reward": -0.5951187610626221, "objective/rlhf_reward": -9.60029411315918, "objective/scores": -9.005175590515137, "policy/approxkl_avg": 1.2048108999351825e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.151827812194824, "step": 404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999910831451416, "val/ratio_var": NaN }, { "episode": 405, "epoch": 0.07605633802816901, "eps": 0, "loss/policy_avg": -6.198433402460068e-05, "loss/value_avg": 0.17254941165447235, "lr": 1.7879999999999999e-06, "objective/entropy": 116.93441772460938, "objective/kl": 26.868663787841797, "objective/non_score_reward": -1.343433141708374, "objective/rlhf_reward": -11.594191551208496, "objective/scores": -10.250758171081543, "policy/approxkl_avg": 1.0803772454437421e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0940287113189697, "step": 405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999672174453735, "val/ratio_var": NaN }, { "episode": 406, "epoch": 0.07624413145539906, "eps": 0, "loss/policy_avg": -2.1637610188918188e-05, "loss/value_avg": 0.3581877052783966, "lr": 1.785e-06, "objective/entropy": 14.984992027282715, "objective/kl": 3.467427968978882, "objective/non_score_reward": -0.17337140440940857, "objective/rlhf_reward": -10.635687828063965, "objective/scores": -10.462316513061523, "policy/approxkl_avg": 1.2966250295676218e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.36031702160835266, "step": 406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999731183052063, "val/ratio_var": NaN }, { "episode": 407, "epoch": 0.07643192488262911, "eps": 0, "loss/policy_avg": -2.499796391930431e-05, "loss/value_avg": 0.18773841857910156, "lr": 1.782e-06, "objective/entropy": 101.81560516357422, "objective/kl": 17.1997127532959, "objective/non_score_reward": -0.8599857091903687, "objective/rlhf_reward": -10.863560676574707, "objective/scores": -10.003575325012207, "policy/approxkl_avg": 2.443723303713341e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9059667587280273, "step": 407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000680685043335, "val/ratio_var": NaN }, { "episode": 408, "epoch": 0.07661971830985916, "eps": 0, "loss/policy_avg": 8.918654202716425e-05, "loss/value_avg": 0.08463366329669952, "lr": 1.779e-06, "objective/entropy": 110.76629638671875, "objective/kl": 16.366323471069336, "objective/non_score_reward": -0.8183162212371826, "objective/rlhf_reward": -12.309005737304688, "objective/scores": -11.490689277648926, "policy/approxkl_avg": 1.3515978025679942e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0978267192840576, "step": 408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000046491622925, "val/ratio_var": NaN }, { "episode": 409, "epoch": 0.0768075117370892, "eps": 0, "loss/policy_avg": -4.550645826384425e-05, "loss/value_avg": 0.613831639289856, "lr": 1.776e-06, "objective/entropy": 55.730281829833984, "objective/kl": 23.86318588256836, "objective/non_score_reward": -1.1931592226028442, "objective/rlhf_reward": -14.553403854370117, "objective/scores": -13.360244750976562, "policy/approxkl_avg": 5.983937256814897e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.355404257774353, "step": 409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999772906303406, "val/ratio_var": NaN }, { "episode": 410, "epoch": 0.07699530516431925, "eps": 0, "loss/policy_avg": 4.512858868110925e-05, "loss/value_avg": 0.05774000287055969, "lr": 1.773e-06, "objective/entropy": 85.17607116699219, "objective/kl": 5.576093673706055, "objective/non_score_reward": -0.2788047194480896, "objective/rlhf_reward": -12.136629104614258, "objective/scores": -11.857824325561523, "policy/approxkl_avg": 6.596325619057097e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3563348054885864, "step": 410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999052882194519, "val/ratio_var": NaN }, { "episode": 411, "epoch": 0.0771830985915493, "eps": 0, "loss/policy_avg": 9.219826461048797e-05, "loss/value_avg": 0.12727713584899902, "lr": 1.77e-06, "objective/entropy": 101.53096008300781, "objective/kl": 13.923772811889648, "objective/non_score_reward": -0.6961886286735535, "objective/rlhf_reward": -12.176325798034668, "objective/scores": -11.48013687133789, "policy/approxkl_avg": 1.1479743733389114e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9143412113189697, "step": 411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001391172409058, "val/ratio_var": NaN }, { "episode": 412, "epoch": 0.07737089201877934, "eps": 0, "loss/policy_avg": 1.651835918892175e-05, "loss/value_avg": 0.2632623612880707, "lr": 1.767e-06, "objective/entropy": 85.4378890991211, "objective/kl": 11.762782096862793, "objective/non_score_reward": -0.5881391167640686, "objective/rlhf_reward": -10.890652656555176, "objective/scores": -10.302513122558594, "policy/approxkl_avg": 6.919574246921911e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5812758207321167, "step": 412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000125169754028, "val/ratio_var": NaN }, { "episode": 413, "epoch": 0.07755868544600938, "eps": 0, "loss/policy_avg": 4.574937520374078e-06, "loss/value_avg": 0.30216163396835327, "lr": 1.764e-06, "objective/entropy": 102.92317199707031, "objective/kl": 15.22615909576416, "objective/non_score_reward": -0.761307954788208, "objective/rlhf_reward": -11.609417915344238, "objective/scores": -10.84811019897461, "policy/approxkl_avg": 1.0763120172896379e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6174800395965576, "step": 413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999485611915588, "val/ratio_var": NaN }, { "episode": 414, "epoch": 0.07774647887323943, "eps": 0, "loss/policy_avg": 6.731501343892887e-05, "loss/value_avg": 1.3839118480682373, "lr": 1.761e-06, "objective/entropy": 107.86971282958984, "objective/kl": 8.387266159057617, "objective/non_score_reward": -0.4193633794784546, "objective/rlhf_reward": -9.214956283569336, "objective/scores": -8.79559326171875, "policy/approxkl_avg": 6.252506778992029e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9238166809082031, "step": 414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999975562095642, "val/ratio_var": NaN }, { "episode": 415, "epoch": 0.07793427230046948, "eps": 0, "loss/policy_avg": -3.343258140375838e-05, "loss/value_avg": 0.15949685871601105, "lr": 1.758e-06, "objective/entropy": 57.64238357543945, "objective/kl": 7.777488708496094, "objective/non_score_reward": -0.38887447118759155, "objective/rlhf_reward": -12.341827392578125, "objective/scores": -11.952953338623047, "policy/approxkl_avg": 4.175912948767291e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2245149612426758, "step": 415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999575614929199, "val/ratio_var": NaN }, { "episode": 416, "epoch": 0.07812206572769953, "eps": 0, "loss/policy_avg": 3.701336027006619e-05, "loss/value_avg": 0.14492681622505188, "lr": 1.755e-06, "objective/entropy": 54.25376892089844, "objective/kl": 16.227035522460938, "objective/non_score_reward": -0.8113518953323364, "objective/rlhf_reward": -11.325294494628906, "objective/scores": -10.51394271850586, "policy/approxkl_avg": 6.402220975587625e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4306825399398804, "step": 416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999992847442627, "val/ratio_var": NaN }, { "episode": 417, "epoch": 0.07830985915492958, "eps": 0, "loss/policy_avg": 1.1889439520018641e-05, "loss/value_avg": 0.38071125745773315, "lr": 1.752e-06, "objective/entropy": 76.00961303710938, "objective/kl": 13.170860290527344, "objective/non_score_reward": -0.658543050289154, "objective/rlhf_reward": -9.945536613464355, "objective/scores": -9.286993980407715, "policy/approxkl_avg": 7.126676848656643e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.016387462615967, "step": 417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999947547912598, "val/ratio_var": NaN }, { "episode": 418, "epoch": 0.07849765258215963, "eps": 0, "loss/policy_avg": 5.1687347877305e-06, "loss/value_avg": 0.1321437656879425, "lr": 1.749e-06, "objective/entropy": 113.05072784423828, "objective/kl": 11.77057933807373, "objective/non_score_reward": -0.5885289311408997, "objective/rlhf_reward": -11.959439277648926, "objective/scores": -11.37091064453125, "policy/approxkl_avg": 6.69520190399453e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9559234380722046, "step": 418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000016689300537, "val/ratio_var": NaN }, { "episode": 419, "epoch": 0.07868544600938968, "eps": 0, "loss/policy_avg": -1.1408104001020547e-05, "loss/value_avg": 0.2727203965187073, "lr": 1.7459999999999999e-06, "objective/entropy": 52.09267044067383, "objective/kl": 10.21623420715332, "objective/non_score_reward": -0.5108116865158081, "objective/rlhf_reward": -10.606037139892578, "objective/scores": -10.09522533416748, "policy/approxkl_avg": 7.23690973813973e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.062730312347412, "step": 419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999919533729553, "val/ratio_var": NaN }, { "episode": 420, "epoch": 0.07887323943661972, "eps": 0, "loss/policy_avg": 4.9303162086289376e-05, "loss/value_avg": 0.1304897964000702, "lr": 1.7429999999999999e-06, "objective/entropy": 38.87646484375, "objective/kl": 28.871963500976562, "objective/non_score_reward": -1.4435981512069702, "objective/rlhf_reward": -12.148405075073242, "objective/scores": -10.70480728149414, "policy/approxkl_avg": 7.149778724624412e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6657024025917053, "step": 420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999300837516785, "val/ratio_var": NaN }, { "episode": 421, "epoch": 0.07906103286384976, "eps": 0, "loss/policy_avg": 1.872260691015981e-05, "loss/value_avg": 0.14563597738742828, "lr": 1.7399999999999999e-06, "objective/entropy": 93.88475036621094, "objective/kl": 13.935758590698242, "objective/non_score_reward": -0.69678795337677, "objective/rlhf_reward": -11.241333961486816, "objective/scores": -10.544546127319336, "policy/approxkl_avg": 9.678733192686195e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.797114610671997, "step": 421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001059770584106, "val/ratio_var": NaN }, { "episode": 422, "epoch": 0.07924882629107981, "eps": 0, "loss/policy_avg": 0.00015891273505985737, "loss/value_avg": 0.08727020025253296, "lr": 1.7369999999999998e-06, "objective/entropy": 87.7840576171875, "objective/kl": 24.287700653076172, "objective/non_score_reward": -1.2143851518630981, "objective/rlhf_reward": -11.854912757873535, "objective/scores": -10.640527725219727, "policy/approxkl_avg": 1.454209694884412e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6056392192840576, "step": 422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999660849571228, "val/ratio_var": NaN }, { "episode": 423, "epoch": 0.07943661971830986, "eps": 0, "loss/policy_avg": 2.8745183954015374e-05, "loss/value_avg": 0.2556104362010956, "lr": 1.7339999999999998e-06, "objective/entropy": 102.68668365478516, "objective/kl": 9.909156799316406, "objective/non_score_reward": -0.4954577684402466, "objective/rlhf_reward": -12.854629516601562, "objective/scores": -12.359171867370605, "policy/approxkl_avg": 5.5429438816645415e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5388989448547363, "step": 423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999656081199646, "val/ratio_var": NaN }, { "episode": 424, "epoch": 0.0796244131455399, "eps": 0, "loss/policy_avg": -1.2339286513451952e-05, "loss/value_avg": 0.16541732847690582, "lr": 1.7309999999999998e-06, "objective/entropy": 64.883544921875, "objective/kl": 21.161163330078125, "objective/non_score_reward": -1.0580581426620483, "objective/rlhf_reward": -10.840007781982422, "objective/scores": -9.781949996948242, "policy/approxkl_avg": 7.245277799938776e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3121799230575562, "step": 424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999880790710449, "val/ratio_var": NaN }, { "episode": 425, "epoch": 0.07981220657276995, "eps": 0, "loss/policy_avg": 4.489466846280266e-06, "loss/value_avg": 0.25808748602867126, "lr": 1.728e-06, "objective/entropy": 47.72441101074219, "objective/kl": 17.92038345336914, "objective/non_score_reward": -0.8960191607475281, "objective/rlhf_reward": -10.914546012878418, "objective/scores": -10.018527030944824, "policy/approxkl_avg": 3.3818587041878345e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1235305070877075, "step": 425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000206232070923, "val/ratio_var": NaN }, { "episode": 426, "epoch": 0.08, "eps": 0, "loss/policy_avg": 1.1471082871139515e-05, "loss/value_avg": 0.4430774748325348, "lr": 1.725e-06, "objective/entropy": 90.94995880126953, "objective/kl": 17.583349227905273, "objective/non_score_reward": -0.8791675567626953, "objective/rlhf_reward": -10.132317543029785, "objective/scores": -9.25314998626709, "policy/approxkl_avg": 1.1049336023916112e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9312974214553833, "step": 426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999858140945435, "val/ratio_var": NaN }, { "episode": 427, "epoch": 0.08018779342723005, "eps": 0, "loss/policy_avg": -0.0002567093470133841, "loss/value_avg": 0.2887458801269531, "lr": 1.722e-06, "objective/entropy": 100.25233459472656, "objective/kl": 14.659246444702148, "objective/non_score_reward": -0.7329622507095337, "objective/rlhf_reward": -10.275179862976074, "objective/scores": -9.542217254638672, "policy/approxkl_avg": 1.8599345708025794e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7697454690933228, "step": 427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000540018081665, "val/ratio_var": NaN }, { "episode": 428, "epoch": 0.0803755868544601, "eps": 0, "loss/policy_avg": -3.5787528759101406e-05, "loss/value_avg": 0.051767390221357346, "lr": 1.719e-06, "objective/entropy": 8.974495887756348, "objective/kl": 4.731699466705322, "objective/non_score_reward": -0.23658499121665955, "objective/rlhf_reward": -11.519294738769531, "objective/scores": -11.282710075378418, "policy/approxkl_avg": 1.5553871079987402e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.37129414081573486, "step": 428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000067949295044, "val/ratio_var": NaN }, { "episode": 429, "epoch": 0.08056338028169015, "eps": 0, "loss/policy_avg": 3.832691072602756e-05, "loss/value_avg": 0.24741564691066742, "lr": 1.716e-06, "objective/entropy": 74.20677185058594, "objective/kl": 18.383934020996094, "objective/non_score_reward": -0.9191967248916626, "objective/rlhf_reward": -10.338033676147461, "objective/scores": -9.41883659362793, "policy/approxkl_avg": 6.640477323571758e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.687628984451294, "step": 429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000002145767212, "val/ratio_var": NaN }, { "episode": 430, "epoch": 0.08075117370892018, "eps": 0, "loss/policy_avg": -1.9307406546431594e-05, "loss/value_avg": 0.262092262506485, "lr": 1.713e-06, "objective/entropy": 178.7939453125, "objective/kl": 19.08355712890625, "objective/non_score_reward": -0.9541778564453125, "objective/rlhf_reward": -11.629754066467285, "objective/scores": -10.675576210021973, "policy/approxkl_avg": 8.843909427014296e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.018057107925415, "step": 430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000865459442139, "val/ratio_var": NaN }, { "episode": 431, "epoch": 0.08093896713615023, "eps": 0, "loss/policy_avg": 5.566398976952769e-05, "loss/value_avg": 0.09246385097503662, "lr": 1.71e-06, "objective/entropy": 62.2794189453125, "objective/kl": 9.921525001525879, "objective/non_score_reward": -0.49607622623443604, "objective/rlhf_reward": -11.62752628326416, "objective/scores": -11.131449699401855, "policy/approxkl_avg": 5.969615557432917e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3110443353652954, "step": 431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000250339508057, "val/ratio_var": NaN }, { "episode": 432, "epoch": 0.08112676056338028, "eps": 0, "loss/policy_avg": 7.62939453125e-06, "loss/value_avg": 0.23213079571723938, "lr": 1.707e-06, "objective/entropy": 114.41119384765625, "objective/kl": 9.520514488220215, "objective/non_score_reward": -0.4760257303714752, "objective/rlhf_reward": -10.065049171447754, "objective/scores": -9.58902359008789, "policy/approxkl_avg": 1.0697736030351734e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1699748039245605, "step": 432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000160932540894, "val/ratio_var": NaN }, { "episode": 433, "epoch": 0.08131455399061033, "eps": 0, "loss/policy_avg": 3.6662479487858946e-06, "loss/value_avg": 0.09448806196451187, "lr": 1.704e-06, "objective/entropy": 63.38655090332031, "objective/kl": 6.483899116516113, "objective/non_score_reward": -0.3241949677467346, "objective/rlhf_reward": -11.790431022644043, "objective/scores": -11.466236114501953, "policy/approxkl_avg": 4.6435815193035523e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2946593761444092, "step": 433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000066757202148, "val/ratio_var": NaN }, { "episode": 434, "epoch": 0.08150234741784038, "eps": 0, "loss/policy_avg": -1.2595698990480741e-06, "loss/value_avg": 0.15490196645259857, "lr": 1.7009999999999999e-06, "objective/entropy": 52.541473388671875, "objective/kl": 19.27520751953125, "objective/non_score_reward": -0.9637604355812073, "objective/rlhf_reward": -11.188016891479492, "objective/scores": -10.22425651550293, "policy/approxkl_avg": 4.884233817392669e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2891592979431152, "step": 434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000008344650269, "val/ratio_var": NaN }, { "episode": 435, "epoch": 0.08169014084507042, "eps": 0, "loss/policy_avg": 5.66806420465582e-07, "loss/value_avg": 0.8195602893829346, "lr": 1.6979999999999999e-06, "objective/entropy": 117.15310668945312, "objective/kl": 34.32935333251953, "objective/non_score_reward": -1.7164676189422607, "objective/rlhf_reward": -11.513453483581543, "objective/scores": -9.796985626220703, "policy/approxkl_avg": 1.4806481374307623e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9902251958847046, "step": 435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000791549682617, "val/ratio_var": NaN }, { "episode": 436, "epoch": 0.08187793427230047, "eps": 0, "loss/policy_avg": -0.00010039671178674325, "loss/value_avg": 0.07235024869441986, "lr": 1.6949999999999999e-06, "objective/entropy": 84.79206085205078, "objective/kl": 10.973495483398438, "objective/non_score_reward": -0.5486748218536377, "objective/rlhf_reward": -11.669857025146484, "objective/scores": -11.121182441711426, "policy/approxkl_avg": 8.664355277687719e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6121549606323242, "step": 436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999790787696838, "val/ratio_var": NaN }, { "episode": 437, "epoch": 0.08206572769953052, "eps": 0, "loss/policy_avg": 4.607776281773113e-05, "loss/value_avg": 0.1723821461200714, "lr": 1.6919999999999999e-06, "objective/entropy": 35.502193450927734, "objective/kl": 12.267802238464355, "objective/non_score_reward": -0.6133900880813599, "objective/rlhf_reward": -9.852343559265137, "objective/scores": -9.238953590393066, "policy/approxkl_avg": 5.262302238406846e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6665246486663818, "step": 437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999904632568359, "val/ratio_var": NaN }, { "episode": 438, "epoch": 0.08225352112676056, "eps": 0, "loss/policy_avg": -1.7831909644883126e-05, "loss/value_avg": 0.18223802745342255, "lr": 1.6889999999999998e-06, "objective/entropy": 79.80711364746094, "objective/kl": 21.06568145751953, "objective/non_score_reward": -1.0532841682434082, "objective/rlhf_reward": -11.911626815795898, "objective/scores": -10.858343124389648, "policy/approxkl_avg": 6.54810605738021e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.439158320426941, "step": 438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000743865966797, "val/ratio_var": NaN }, { "episode": 439, "epoch": 0.0824413145539906, "eps": 0, "loss/policy_avg": -0.00012027092452626675, "loss/value_avg": 0.11985595524311066, "lr": 1.6860000000000002e-06, "objective/entropy": 70.26412963867188, "objective/kl": 16.680015563964844, "objective/non_score_reward": -0.834000825881958, "objective/rlhf_reward": -12.036919593811035, "objective/scores": -11.202919006347656, "policy/approxkl_avg": 1.2767137036462373e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.605519413948059, "step": 439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000548362731934, "val/ratio_var": NaN }, { "episode": 440, "epoch": 0.08262910798122065, "eps": 0, "loss/policy_avg": 9.142678027274087e-05, "loss/value_avg": 0.15725941956043243, "lr": 1.6830000000000002e-06, "objective/entropy": 119.31925964355469, "objective/kl": 18.55134391784668, "objective/non_score_reward": -0.9275672435760498, "objective/rlhf_reward": -11.951837539672852, "objective/scores": -11.024270057678223, "policy/approxkl_avg": 1.1409427003172823e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8468961715698242, "step": 440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000406503677368, "val/ratio_var": NaN }, { "episode": 441, "epoch": 0.0828169014084507, "eps": 0, "loss/policy_avg": -3.296474096714519e-05, "loss/value_avg": 0.20132577419281006, "lr": 1.6800000000000002e-06, "objective/entropy": 115.89495086669922, "objective/kl": 28.860408782958984, "objective/non_score_reward": -1.4430203437805176, "objective/rlhf_reward": -12.726863861083984, "objective/scores": -11.283843994140625, "policy/approxkl_avg": 6.492468429541987e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9267117977142334, "step": 441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000611543655396, "val/ratio_var": NaN }, { "episode": 442, "epoch": 0.08300469483568075, "eps": 0, "loss/policy_avg": -2.3823864466976374e-05, "loss/value_avg": 0.0996832549571991, "lr": 1.6770000000000002e-06, "objective/entropy": 126.4712905883789, "objective/kl": 15.543734550476074, "objective/non_score_reward": -0.7771867513656616, "objective/rlhf_reward": -10.695682525634766, "objective/scores": -9.918496131896973, "policy/approxkl_avg": 1.7033728738624632e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.122178554534912, "step": 442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999256134033203, "val/ratio_var": NaN }, { "episode": 443, "epoch": 0.0831924882629108, "eps": 0, "loss/policy_avg": -4.920419814880006e-05, "loss/value_avg": 0.13776454329490662, "lr": 1.6740000000000002e-06, "objective/entropy": 92.67117309570312, "objective/kl": 18.048503875732422, "objective/non_score_reward": -0.9024251699447632, "objective/rlhf_reward": -11.778277397155762, "objective/scores": -10.875852584838867, "policy/approxkl_avg": 1.0246473891584174e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.777353048324585, "step": 443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000461339950562, "val/ratio_var": NaN }, { "episode": 444, "epoch": 0.08338028169014085, "eps": 0, "loss/policy_avg": -3.447622657404281e-05, "loss/value_avg": 0.12447330355644226, "lr": 1.6710000000000002e-06, "objective/entropy": 104.17217254638672, "objective/kl": 8.206493377685547, "objective/non_score_reward": -0.41032466292381287, "objective/rlhf_reward": -10.7443208694458, "objective/scores": -10.333995819091797, "policy/approxkl_avg": 9.332477191037469e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8754007816314697, "step": 444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000386238098145, "val/ratio_var": NaN }, { "episode": 445, "epoch": 0.0835680751173709, "eps": 0, "loss/policy_avg": -7.181347609730437e-05, "loss/value_avg": 0.360170841217041, "lr": 1.6680000000000002e-06, "objective/entropy": 140.77381896972656, "objective/kl": 16.50238609313965, "objective/non_score_reward": -0.8251193165779114, "objective/rlhf_reward": -12.706720352172852, "objective/scores": -11.881601333618164, "policy/approxkl_avg": 1.2992623510399426e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1031816005706787, "step": 445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998899698257446, "val/ratio_var": NaN }, { "episode": 446, "epoch": 0.08375586854460094, "eps": 0, "loss/policy_avg": 8.241185241786297e-06, "loss/value_avg": 8.646595001220703, "lr": 1.6650000000000002e-06, "objective/entropy": 94.6637191772461, "objective/kl": 24.760087966918945, "objective/non_score_reward": -1.238004446029663, "objective/rlhf_reward": -5.975689888000488, "objective/scores": -4.737685203552246, "policy/approxkl_avg": 5.379429524055013e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5305233001708984, "step": 446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000249147415161, "val/ratio_var": NaN }, { "episode": 447, "epoch": 0.08394366197183098, "eps": 0, "loss/policy_avg": -6.18797421338968e-05, "loss/value_avg": 0.25360754132270813, "lr": 1.6620000000000001e-06, "objective/entropy": 70.86831665039062, "objective/kl": 16.306533813476562, "objective/non_score_reward": -0.8153267502784729, "objective/rlhf_reward": -12.543957710266113, "objective/scores": -11.728631019592285, "policy/approxkl_avg": 7.893390829849523e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1728814840316772, "step": 447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000118017196655, "val/ratio_var": NaN }, { "episode": 448, "epoch": 0.08413145539906103, "eps": 0, "loss/policy_avg": 3.774211108975578e-06, "loss/value_avg": 0.3043513000011444, "lr": 1.6590000000000001e-06, "objective/entropy": 89.733642578125, "objective/kl": 8.179731369018555, "objective/non_score_reward": -0.4089866280555725, "objective/rlhf_reward": -12.291971206665039, "objective/scores": -11.882984161376953, "policy/approxkl_avg": 6.54290417401171e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8752233982086182, "step": 448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000025749206543, "val/ratio_var": NaN }, { "episode": 449, "epoch": 0.08431924882629108, "eps": 0, "loss/policy_avg": 1.98742127395235e-05, "loss/value_avg": 0.9167808890342712, "lr": 1.6560000000000001e-06, "objective/entropy": 121.89291381835938, "objective/kl": 20.05820655822754, "objective/non_score_reward": -1.0029103755950928, "objective/rlhf_reward": -11.685855865478516, "objective/scores": -10.682945251464844, "policy/approxkl_avg": 1.3287008471252193e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0359554290771484, "step": 449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000486373901367, "val/ratio_var": NaN }, { "episode": 450, "epoch": 0.08450704225352113, "eps": 0, "loss/policy_avg": -6.3743231294211e-05, "loss/value_avg": 0.1680765450000763, "lr": 1.653e-06, "objective/entropy": 102.67932891845703, "objective/kl": 14.472742080688477, "objective/non_score_reward": -0.7236371040344238, "objective/rlhf_reward": -12.037178039550781, "objective/scores": -11.313541412353516, "policy/approxkl_avg": 5.599714825166302e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0056543350219727, "step": 450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998950958251953, "val/ratio_var": NaN }, { "episode": 451, "epoch": 0.08469483568075117, "eps": 0, "loss/policy_avg": -5.865996718057431e-05, "loss/value_avg": 0.23069295287132263, "lr": 1.65e-06, "objective/entropy": 120.45909118652344, "objective/kl": 25.26806640625, "objective/non_score_reward": -1.2634034156799316, "objective/rlhf_reward": -10.449014663696289, "objective/scores": -9.185611724853516, "policy/approxkl_avg": 7.47153166003045e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3908185958862305, "step": 451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999408721923828, "val/ratio_var": NaN }, { "episode": 452, "epoch": 0.08488262910798122, "eps": 0, "loss/policy_avg": -5.1687347877305e-06, "loss/value_avg": 0.3469969630241394, "lr": 1.647e-06, "objective/entropy": 136.73876953125, "objective/kl": 33.00047302246094, "objective/non_score_reward": -1.6500236988067627, "objective/rlhf_reward": -13.213589668273926, "objective/scores": -11.563566207885742, "policy/approxkl_avg": 1.0274293771317389e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4924685955047607, "step": 452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000276565551758, "val/ratio_var": NaN }, { "episode": 453, "epoch": 0.08507042253521127, "eps": 0, "loss/policy_avg": 9.867829794529825e-05, "loss/value_avg": 0.24820931255817413, "lr": 1.6440000000000003e-06, "objective/entropy": 70.39038848876953, "objective/kl": 12.203564643859863, "objective/non_score_reward": -0.6101782917976379, "objective/rlhf_reward": -10.30146312713623, "objective/scores": -9.691285133361816, "policy/approxkl_avg": 4.4279719446649324e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3120208978652954, "step": 453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000669956207275, "val/ratio_var": NaN }, { "episode": 454, "epoch": 0.08525821596244132, "eps": 0, "loss/policy_avg": -4.600128886522725e-05, "loss/value_avg": 0.07746198773384094, "lr": 1.6410000000000003e-06, "objective/entropy": 116.78202819824219, "objective/kl": 17.5662841796875, "objective/non_score_reward": -0.8783142566680908, "objective/rlhf_reward": -11.945990562438965, "objective/scores": -11.067676544189453, "policy/approxkl_avg": 1.093345147751279e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4466068744659424, "step": 454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999443292617798, "val/ratio_var": NaN }, { "episode": 455, "epoch": 0.08544600938967137, "eps": 0, "loss/policy_avg": -7.653461216250435e-05, "loss/value_avg": 0.15594705939292908, "lr": 1.6380000000000002e-06, "objective/entropy": 48.46745300292969, "objective/kl": 19.330596923828125, "objective/non_score_reward": -0.9665298461914062, "objective/rlhf_reward": -12.039936065673828, "objective/scores": -11.073406219482422, "policy/approxkl_avg": 8.062734480063227e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9777440428733826, "step": 455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999743103981018, "val/ratio_var": NaN }, { "episode": 456, "epoch": 0.0856338028169014, "eps": 0, "loss/policy_avg": 9.554736607242376e-06, "loss/value_avg": 0.1040259301662445, "lr": 1.6350000000000002e-06, "objective/entropy": 55.58533477783203, "objective/kl": 14.982118606567383, "objective/non_score_reward": -0.7491059899330139, "objective/rlhf_reward": -11.47232723236084, "objective/scores": -10.723220825195312, "policy/approxkl_avg": 8.904758175276584e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5485172271728516, "step": 456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999753832817078, "val/ratio_var": NaN }, { "episode": 457, "epoch": 0.08582159624413145, "eps": 0, "loss/policy_avg": 0.00010312278027413413, "loss/value_avg": 1.7324175834655762, "lr": 1.6320000000000002e-06, "objective/entropy": 139.30990600585938, "objective/kl": 20.131149291992188, "objective/non_score_reward": -1.0065574645996094, "objective/rlhf_reward": -11.501253128051758, "objective/scores": -10.494695663452148, "policy/approxkl_avg": 9.387152033468737e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3950793743133545, "step": 457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000216960906982, "val/ratio_var": NaN }, { "episode": 458, "epoch": 0.0860093896713615, "eps": 0, "loss/policy_avg": -4.372057082946412e-05, "loss/value_avg": 3.122901678085327, "lr": 1.6290000000000002e-06, "objective/entropy": 58.221561431884766, "objective/kl": 13.022204399108887, "objective/non_score_reward": -0.6511102914810181, "objective/rlhf_reward": -3.9365782737731934, "objective/scores": -3.2854678630828857, "policy/approxkl_avg": 9.37895663355448e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3904476165771484, "step": 458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998776316642761, "val/ratio_var": NaN }, { "episode": 459, "epoch": 0.08619718309859155, "eps": 0, "loss/policy_avg": -7.041895150905475e-05, "loss/value_avg": 0.23709838092327118, "lr": 1.6260000000000002e-06, "objective/entropy": 95.24834442138672, "objective/kl": 10.187882423400879, "objective/non_score_reward": -0.5093941688537598, "objective/rlhf_reward": -11.329532623291016, "objective/scores": -10.820137977600098, "policy/approxkl_avg": 9.38996862487329e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7261617183685303, "step": 459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000047206878662, "val/ratio_var": NaN }, { "episode": 460, "epoch": 0.0863849765258216, "eps": 0, "loss/policy_avg": 5.128248403707403e-07, "loss/value_avg": 0.15103095769882202, "lr": 1.6230000000000002e-06, "objective/entropy": 86.265625, "objective/kl": 12.429916381835938, "objective/non_score_reward": -0.6214958429336548, "objective/rlhf_reward": -12.204307556152344, "objective/scores": -11.58281135559082, "policy/approxkl_avg": 5.618566945031489e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4147558212280273, "step": 460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999977707862854, "val/ratio_var": NaN }, { "episode": 461, "epoch": 0.08657276995305165, "eps": 0, "loss/policy_avg": -8.37613915791735e-05, "loss/value_avg": 0.08971694856882095, "lr": 1.6200000000000002e-06, "objective/entropy": 76.9791259765625, "objective/kl": 8.191218376159668, "objective/non_score_reward": -0.4095609486103058, "objective/rlhf_reward": -10.896661758422852, "objective/scores": -10.487100601196289, "policy/approxkl_avg": 7.733893170325246e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.327388048171997, "step": 461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000180006027222, "val/ratio_var": NaN }, { "episode": 462, "epoch": 0.0867605633802817, "eps": 0, "loss/policy_avg": 1.154755682364339e-05, "loss/value_avg": 0.126914381980896, "lr": 1.6170000000000001e-06, "objective/entropy": 101.94890594482422, "objective/kl": 12.441502571105957, "objective/non_score_reward": -0.6220752000808716, "objective/rlhf_reward": -12.048970222473145, "objective/scores": -11.426895141601562, "policy/approxkl_avg": 1.017979087691856e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9465608596801758, "step": 462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999421238899231, "val/ratio_var": NaN }, { "episode": 463, "epoch": 0.08694835680751174, "eps": 0, "loss/policy_avg": 2.5191397412527294e-07, "loss/value_avg": 0.27495190501213074, "lr": 1.6140000000000001e-06, "objective/entropy": 118.63507080078125, "objective/kl": 22.774822235107422, "objective/non_score_reward": -1.138741135597229, "objective/rlhf_reward": -12.428406715393066, "objective/scores": -11.289665222167969, "policy/approxkl_avg": 1.1499125918135178e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8988935947418213, "step": 463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999045133590698, "val/ratio_var": NaN }, { "episode": 464, "epoch": 0.08713615023474178, "eps": 0, "loss/policy_avg": -7.124667172320187e-05, "loss/value_avg": 0.1438300907611847, "lr": 1.6110000000000001e-06, "objective/entropy": 71.38264465332031, "objective/kl": 18.238338470458984, "objective/non_score_reward": -0.9119168519973755, "objective/rlhf_reward": -11.842425346374512, "objective/scores": -10.930508613586426, "policy/approxkl_avg": 6.464598101274532e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0043185949325562, "step": 464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999428391456604, "val/ratio_var": NaN }, { "episode": 465, "epoch": 0.08732394366197183, "eps": 0, "loss/policy_avg": -9.986589475374785e-07, "loss/value_avg": 0.26521697640419006, "lr": 1.608e-06, "objective/entropy": 114.7767333984375, "objective/kl": 24.91357421875, "objective/non_score_reward": -1.2456786632537842, "objective/rlhf_reward": -13.150422096252441, "objective/scores": -11.904743194580078, "policy/approxkl_avg": 8.439493370815399e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0625483989715576, "step": 465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000495910644531, "val/ratio_var": NaN }, { "episode": 466, "epoch": 0.08751173708920187, "eps": 0, "loss/policy_avg": 5.312685516400961e-06, "loss/value_avg": 0.20409314334392548, "lr": 1.605e-06, "objective/entropy": 127.17861938476562, "objective/kl": 16.04644203186035, "objective/non_score_reward": -0.8023221492767334, "objective/rlhf_reward": -11.604818344116211, "objective/scores": -10.802495956420898, "policy/approxkl_avg": 1.0570849440227903e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.273297071456909, "step": 466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999172687530518, "val/ratio_var": NaN }, { "episode": 467, "epoch": 0.08769953051643192, "eps": 0, "loss/policy_avg": 3.9671951526543126e-05, "loss/value_avg": 0.08490843325853348, "lr": 1.602e-06, "objective/entropy": 93.14631652832031, "objective/kl": 10.99929141998291, "objective/non_score_reward": -0.5499645471572876, "objective/rlhf_reward": -11.019684791564941, "objective/scores": -10.469719886779785, "policy/approxkl_avg": 5.148883275296612e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4594542980194092, "step": 467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999977946281433, "val/ratio_var": NaN }, { "episode": 468, "epoch": 0.08788732394366197, "eps": 0, "loss/policy_avg": -2.5191397980961483e-06, "loss/value_avg": 0.08372868597507477, "lr": 1.599e-06, "objective/entropy": 86.11360168457031, "objective/kl": 6.306999206542969, "objective/non_score_reward": -0.3153499662876129, "objective/rlhf_reward": -11.825650215148926, "objective/scores": -11.510300636291504, "policy/approxkl_avg": 9.919330068441923e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8366607427597046, "step": 468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000327825546265, "val/ratio_var": NaN }, { "episode": 469, "epoch": 0.08807511737089202, "eps": 0, "loss/policy_avg": 3.6923389416188e-05, "loss/value_avg": 0.34833651781082153, "lr": 1.596e-06, "objective/entropy": 119.3447036743164, "objective/kl": 32.7051887512207, "objective/non_score_reward": -1.6352593898773193, "objective/rlhf_reward": -11.084555625915527, "objective/scores": -9.449295997619629, "policy/approxkl_avg": 1.1379329833971497e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.95537531375885, "step": 469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000466108322144, "val/ratio_var": NaN }, { "episode": 470, "epoch": 0.08826291079812207, "eps": 0, "loss/policy_avg": -7.718914275756106e-05, "loss/value_avg": 0.4569690525531769, "lr": 1.593e-06, "objective/entropy": 95.94668579101562, "objective/kl": 15.142583847045898, "objective/non_score_reward": -0.7571291923522949, "objective/rlhf_reward": -9.559106826782227, "objective/scores": -8.801977157592773, "policy/approxkl_avg": 4.0269096501788226e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.854250431060791, "step": 470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999280571937561, "val/ratio_var": NaN }, { "episode": 471, "epoch": 0.08845070422535212, "eps": 0, "loss/policy_avg": 3.0508581403410062e-05, "loss/value_avg": 0.24125581979751587, "lr": 1.59e-06, "objective/entropy": 100.30134582519531, "objective/kl": 24.89280128479004, "objective/non_score_reward": -1.2446401119232178, "objective/rlhf_reward": -12.12216567993164, "objective/scores": -10.877525329589844, "policy/approxkl_avg": 8.223473457746877e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.162344217300415, "step": 471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999395608901978, "val/ratio_var": NaN }, { "episode": 472, "epoch": 0.08863849765258217, "eps": 0, "loss/policy_avg": -7.875010487623513e-05, "loss/value_avg": 0.4869254529476166, "lr": 1.5870000000000002e-06, "objective/entropy": 86.168701171875, "objective/kl": 17.705257415771484, "objective/non_score_reward": -0.8852629065513611, "objective/rlhf_reward": -10.314655303955078, "objective/scores": -9.42939281463623, "policy/approxkl_avg": 6.75459048693483e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8454337120056152, "step": 472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001248121261597, "val/ratio_var": NaN }, { "episode": 473, "epoch": 0.0888262910798122, "eps": 0, "loss/policy_avg": 8.982532744994387e-05, "loss/value_avg": 0.11824019998311996, "lr": 1.5840000000000002e-06, "objective/entropy": 99.60870361328125, "objective/kl": 15.685578346252441, "objective/non_score_reward": -0.7842788696289062, "objective/rlhf_reward": -10.961886405944824, "objective/scores": -10.177607536315918, "policy/approxkl_avg": 1.0063148891958917e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.818539023399353, "step": 473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000274181365967, "val/ratio_var": NaN }, { "episode": 474, "epoch": 0.08901408450704225, "eps": 0, "loss/policy_avg": 0.00012894396786578, "loss/value_avg": 0.1294858753681183, "lr": 1.5810000000000002e-06, "objective/entropy": 138.82000732421875, "objective/kl": 23.240215301513672, "objective/non_score_reward": -1.162010669708252, "objective/rlhf_reward": -11.969470977783203, "objective/scores": -10.80746078491211, "policy/approxkl_avg": 1.1722968196181682e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3550772666931152, "step": 474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999788403511047, "val/ratio_var": NaN }, { "episode": 475, "epoch": 0.0892018779342723, "eps": 0, "loss/policy_avg": 1.8812575945048593e-05, "loss/value_avg": 0.15006154775619507, "lr": 1.5780000000000002e-06, "objective/entropy": 40.57411575317383, "objective/kl": 13.920063018798828, "objective/non_score_reward": -0.6960031390190125, "objective/rlhf_reward": -12.53956413269043, "objective/scores": -11.843561172485352, "policy/approxkl_avg": 3.9997061662688793e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.004408359527588, "step": 475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000977516174316, "val/ratio_var": NaN }, { "episode": 476, "epoch": 0.08938967136150235, "eps": 0, "loss/policy_avg": 4.870486736763269e-05, "loss/value_avg": 0.6549510359764099, "lr": 1.5750000000000002e-06, "objective/entropy": 98.45263671875, "objective/kl": 27.160625457763672, "objective/non_score_reward": -1.3580312728881836, "objective/rlhf_reward": -10.450736045837402, "objective/scores": -9.092704772949219, "policy/approxkl_avg": 7.965882531379975e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6799086332321167, "step": 476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999622106552124, "val/ratio_var": NaN }, { "episode": 477, "epoch": 0.0895774647887324, "eps": 0, "loss/policy_avg": 4.3284217099426314e-05, "loss/value_avg": 1.077638030052185, "lr": 1.5720000000000002e-06, "objective/entropy": 93.7730941772461, "objective/kl": 20.154804229736328, "objective/non_score_reward": -1.0077402591705322, "objective/rlhf_reward": -8.4415864944458, "objective/scores": -7.433846473693848, "policy/approxkl_avg": 1.009021346476402e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4193460941314697, "step": 477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999087452888489, "val/ratio_var": NaN }, { "episode": 478, "epoch": 0.08976525821596244, "eps": 0, "loss/policy_avg": -4.708092092187144e-05, "loss/value_avg": 14.732449531555176, "lr": 1.5690000000000001e-06, "objective/entropy": 53.029518127441406, "objective/kl": 11.670083999633789, "objective/non_score_reward": -0.5835041999816895, "objective/rlhf_reward": -1.4857890605926514, "objective/scores": -0.9022848606109619, "policy/approxkl_avg": 2.116018471554071e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0695973634719849, "step": 478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000163316726685, "val/ratio_var": NaN }, { "episode": 479, "epoch": 0.08995305164319249, "eps": 0, "loss/policy_avg": -6.019844295224175e-05, "loss/value_avg": 0.23539359867572784, "lr": 1.5660000000000001e-06, "objective/entropy": 72.89903259277344, "objective/kl": 21.94855499267578, "objective/non_score_reward": -1.0974276065826416, "objective/rlhf_reward": -12.872030258178711, "objective/scores": -11.774602890014648, "policy/approxkl_avg": 8.389290684363004e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8289426565170288, "step": 479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999830722808838, "val/ratio_var": NaN }, { "episode": 480, "epoch": 0.09014084507042254, "eps": 0, "loss/policy_avg": 3.498005389701575e-05, "loss/value_avg": 0.9106916189193726, "lr": 1.5630000000000001e-06, "objective/entropy": 102.62167358398438, "objective/kl": 18.66549301147461, "objective/non_score_reward": -0.9332745671272278, "objective/rlhf_reward": -8.877883911132812, "objective/scores": -7.944609642028809, "policy/approxkl_avg": 6.999973578558638e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7091847658157349, "step": 480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000544786453247, "val/ratio_var": NaN }, { "episode": 481, "epoch": 0.09032863849765259, "eps": 0, "loss/policy_avg": -2.840329943865072e-05, "loss/value_avg": 0.18029935657978058, "lr": 1.56e-06, "objective/entropy": 83.58941650390625, "objective/kl": 29.576374053955078, "objective/non_score_reward": -1.4788188934326172, "objective/rlhf_reward": -13.168986320495605, "objective/scores": -11.690167427062988, "policy/approxkl_avg": 6.180609801731407e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.482795000076294, "step": 481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000125169754028, "val/ratio_var": NaN }, { "episode": 482, "epoch": 0.09051643192488262, "eps": 0, "loss/policy_avg": -2.0407280317158438e-05, "loss/value_avg": 0.10989631712436676, "lr": 1.557e-06, "objective/entropy": 76.18595886230469, "objective/kl": 15.148887634277344, "objective/non_score_reward": -0.7574443817138672, "objective/rlhf_reward": -12.161131858825684, "objective/scores": -11.403687477111816, "policy/approxkl_avg": 7.238642041329513e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.550486445426941, "step": 482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999215602874756, "val/ratio_var": NaN }, { "episode": 483, "epoch": 0.09070422535211267, "eps": 0, "loss/policy_avg": 9.157072781817988e-05, "loss/value_avg": 1.286600947380066, "lr": 1.554e-06, "objective/entropy": 103.87676239013672, "objective/kl": 9.97043228149414, "objective/non_score_reward": -0.4985215961933136, "objective/rlhf_reward": -10.622509002685547, "objective/scores": -10.123987197875977, "policy/approxkl_avg": 1.2221056522321305e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6915191411972046, "step": 483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000016689300537, "val/ratio_var": NaN }, { "episode": 484, "epoch": 0.09089201877934272, "eps": 0, "loss/policy_avg": -1.6824254771563574e-06, "loss/value_avg": 0.15773381292819977, "lr": 1.551e-06, "objective/entropy": 111.93312072753906, "objective/kl": 21.705472946166992, "objective/non_score_reward": -1.0852736234664917, "objective/rlhf_reward": -11.873276710510254, "objective/scores": -10.788002967834473, "policy/approxkl_avg": 1.213596476645762e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.438967227935791, "step": 484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000591278076172, "val/ratio_var": NaN }, { "episode": 485, "epoch": 0.09107981220657277, "eps": 0, "loss/policy_avg": -1.210086793435039e-05, "loss/value_avg": 0.11654810607433319, "lr": 1.548e-06, "objective/entropy": 93.58294677734375, "objective/kl": 13.38479995727539, "objective/non_score_reward": -0.6692399978637695, "objective/rlhf_reward": -11.17141342163086, "objective/scores": -10.50217342376709, "policy/approxkl_avg": 1.242320308847411e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.885318398475647, "step": 485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999579191207886, "val/ratio_var": NaN }, { "episode": 486, "epoch": 0.09126760563380282, "eps": 0, "loss/policy_avg": 9.633460285840556e-05, "loss/value_avg": 0.2088077813386917, "lr": 1.545e-06, "objective/entropy": 127.75273132324219, "objective/kl": 10.707294464111328, "objective/non_score_reward": -0.5353647470474243, "objective/rlhf_reward": -10.761602401733398, "objective/scores": -10.226237297058105, "policy/approxkl_avg": 2.066607578399271e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3129053115844727, "step": 486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999818801879883, "val/ratio_var": NaN }, { "episode": 487, "epoch": 0.09145539906103287, "eps": 0, "loss/policy_avg": 1.7328082321910188e-05, "loss/value_avg": 0.170135498046875, "lr": 1.542e-06, "objective/entropy": 119.47352600097656, "objective/kl": 29.615890502929688, "objective/non_score_reward": -1.4807945489883423, "objective/rlhf_reward": -11.006932258605957, "objective/scores": -9.526137351989746, "policy/approxkl_avg": 1.0192086108418152e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0657706260681152, "step": 487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999368190765381, "val/ratio_var": NaN }, { "episode": 488, "epoch": 0.09164319248826291, "eps": 0, "loss/policy_avg": -5.7589331845520064e-05, "loss/value_avg": 0.14251261949539185, "lr": 1.539e-06, "objective/entropy": 95.46951293945312, "objective/kl": 25.972667694091797, "objective/non_score_reward": -1.2986334562301636, "objective/rlhf_reward": -11.76696491241455, "objective/scores": -10.468331336975098, "policy/approxkl_avg": 1.533610856085943e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9220432043075562, "step": 488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999918341636658, "val/ratio_var": NaN }, { "episode": 489, "epoch": 0.09183098591549296, "eps": 0, "loss/policy_avg": 3.830216883216053e-05, "loss/value_avg": 0.22911056876182556, "lr": 1.536e-06, "objective/entropy": 113.52003479003906, "objective/kl": 25.566478729248047, "objective/non_score_reward": -1.2783238887786865, "objective/rlhf_reward": -12.771879196166992, "objective/scores": -11.493555068969727, "policy/approxkl_avg": 5.955830317816435e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5215028524398804, "step": 489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998629689216614, "val/ratio_var": NaN }, { "episode": 490, "epoch": 0.09201877934272301, "eps": 0, "loss/policy_avg": 8.709025860298425e-05, "loss/value_avg": 0.10096271336078644, "lr": 1.533e-06, "objective/entropy": 81.30481719970703, "objective/kl": 21.105846405029297, "objective/non_score_reward": -1.0552924871444702, "objective/rlhf_reward": -11.814615249633789, "objective/scores": -10.759323120117188, "policy/approxkl_avg": 8.358642844541464e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7997586727142334, "step": 490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999393820762634, "val/ratio_var": NaN }, { "episode": 491, "epoch": 0.09220657276995305, "eps": 0, "loss/policy_avg": -6.763215060345829e-05, "loss/value_avg": 0.12065355479717255, "lr": 1.53e-06, "objective/entropy": 82.89447021484375, "objective/kl": 14.676383018493652, "objective/non_score_reward": -0.7338191866874695, "objective/rlhf_reward": -12.1204252243042, "objective/scores": -11.386606216430664, "policy/approxkl_avg": 7.498193355104377e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.60027277469635, "step": 491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000003457069397, "val/ratio_var": NaN }, { "episode": 492, "epoch": 0.0923943661971831, "eps": 0, "loss/policy_avg": 4.5191565732238814e-05, "loss/value_avg": 0.13798578083515167, "lr": 1.5270000000000002e-06, "objective/entropy": 80.90894317626953, "objective/kl": 21.60472297668457, "objective/non_score_reward": -1.0802361965179443, "objective/rlhf_reward": -11.790628433227539, "objective/scores": -10.710391998291016, "policy/approxkl_avg": 7.647550148703885e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.532118320465088, "step": 492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999868869781494, "val/ratio_var": NaN }, { "episode": 493, "epoch": 0.09258215962441314, "eps": 0, "loss/policy_avg": -2.3877844796516e-05, "loss/value_avg": 0.08475194126367569, "lr": 1.5240000000000001e-06, "objective/entropy": 83.70562744140625, "objective/kl": 17.8057804107666, "objective/non_score_reward": -0.8902889490127563, "objective/rlhf_reward": -11.676443099975586, "objective/scores": -10.786153793334961, "policy/approxkl_avg": 9.86976402828077e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3371490240097046, "step": 493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999960660934448, "val/ratio_var": NaN }, { "episode": 494, "epoch": 0.09276995305164319, "eps": 0, "loss/policy_avg": 9.963647607946768e-05, "loss/value_avg": 0.3794405162334442, "lr": 1.5210000000000001e-06, "objective/entropy": 123.58428955078125, "objective/kl": 41.18065643310547, "objective/non_score_reward": -2.059032917022705, "objective/rlhf_reward": -11.898841857910156, "objective/scores": -9.83980941772461, "policy/approxkl_avg": 9.496010022758128e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1251888275146484, "step": 494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001009702682495, "val/ratio_var": NaN }, { "episode": 495, "epoch": 0.09295774647887324, "eps": 0, "loss/policy_avg": 4.5594180846819654e-05, "loss/value_avg": 0.08350428938865662, "lr": 1.5180000000000001e-06, "objective/entropy": 83.5281982421875, "objective/kl": 8.689857482910156, "objective/non_score_reward": -0.43449288606643677, "objective/rlhf_reward": -11.701571464538574, "objective/scores": -11.267078399658203, "policy/approxkl_avg": 6.310983735602349e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5166176557540894, "step": 495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999549388885498, "val/ratio_var": NaN }, { "episode": 496, "epoch": 0.09314553990610329, "eps": 0, "loss/policy_avg": 1.5672647350584157e-05, "loss/value_avg": 0.2095647156238556, "lr": 1.5150000000000001e-06, "objective/entropy": 138.8048858642578, "objective/kl": 16.432735443115234, "objective/non_score_reward": -0.8216367959976196, "objective/rlhf_reward": -12.224435806274414, "objective/scores": -11.402798652648926, "policy/approxkl_avg": 1.1654913123493316e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3942573070526123, "step": 496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000633001327515, "val/ratio_var": NaN }, { "episode": 497, "epoch": 0.09333333333333334, "eps": 0, "loss/policy_avg": 1.7955617295228876e-05, "loss/value_avg": 0.3347325623035431, "lr": 1.512e-06, "objective/entropy": 54.163818359375, "objective/kl": 15.320283889770508, "objective/non_score_reward": -0.7660142183303833, "objective/rlhf_reward": -12.187238693237305, "objective/scores": -11.421224594116211, "policy/approxkl_avg": 4.4130999299341056e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9896954298019409, "step": 497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000286102294922, "val/ratio_var": NaN }, { "episode": 498, "epoch": 0.09352112676056339, "eps": 0, "loss/policy_avg": -4.840347173740156e-06, "loss/value_avg": 0.3186908960342407, "lr": 1.509e-06, "objective/entropy": 92.13838958740234, "objective/kl": 20.508949279785156, "objective/non_score_reward": -1.0254474878311157, "objective/rlhf_reward": -10.828577995300293, "objective/scores": -9.803130149841309, "policy/approxkl_avg": 6.846320843578724e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.834788203239441, "step": 498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999839067459106, "val/ratio_var": NaN }, { "episode": 499, "epoch": 0.09370892018779342, "eps": 0, "loss/policy_avg": 1.590656756889075e-05, "loss/value_avg": 0.5989781022071838, "lr": 1.506e-06, "objective/entropy": 37.422237396240234, "objective/kl": 6.80887508392334, "objective/non_score_reward": -0.3404437303543091, "objective/rlhf_reward": -8.87389087677002, "objective/scores": -8.533447265625, "policy/approxkl_avg": 1.9450771659990096e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.912841796875, "step": 499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000061988830566, "val/ratio_var": NaN }, { "episode": 500, "epoch": 0.09389671361502347, "eps": 0, "loss/policy_avg": -7.336994258366758e-06, "loss/value_avg": 1.4201020002365112, "lr": 1.503e-06, "objective/entropy": 126.50419616699219, "objective/kl": 18.33389663696289, "objective/non_score_reward": -0.9166948795318604, "objective/rlhf_reward": -7.173223495483398, "objective/scores": -6.256528854370117, "policy/approxkl_avg": 7.173509430913327e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2721407413482666, "step": 500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000746250152588, "val/ratio_var": NaN }, { "episode": 501, "epoch": 0.09408450704225352, "eps": 0, "loss/policy_avg": -7.692373037571087e-05, "loss/value_avg": 0.45288902521133423, "lr": 1.5e-06, "objective/entropy": 144.72235107421875, "objective/kl": 19.906396865844727, "objective/non_score_reward": -0.9953198432922363, "objective/rlhf_reward": -9.921859741210938, "objective/scores": -8.92654037475586, "policy/approxkl_avg": 9.163132830281029e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3932852745056152, "step": 501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000255107879639, "val/ratio_var": NaN }, { "episode": 502, "epoch": 0.09427230046948357, "eps": 0, "loss/policy_avg": 3.854283568216488e-05, "loss/value_avg": 0.12053761631250381, "lr": 1.497e-06, "objective/entropy": 116.66473388671875, "objective/kl": 19.382675170898438, "objective/non_score_reward": -0.9691336750984192, "objective/rlhf_reward": -12.235542297363281, "objective/scores": -11.266408920288086, "policy/approxkl_avg": 6.845436928415438e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9537699222564697, "step": 502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999385476112366, "val/ratio_var": NaN }, { "episode": 503, "epoch": 0.09446009389671362, "eps": 0, "loss/policy_avg": 1.2744148079946171e-05, "loss/value_avg": 0.19690930843353271, "lr": 1.494e-06, "objective/entropy": 110.22508239746094, "objective/kl": 26.335819244384766, "objective/non_score_reward": -1.3167909383773804, "objective/rlhf_reward": -11.329935073852539, "objective/scores": -10.013144493103027, "policy/approxkl_avg": 9.903285302925724e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2727742195129395, "step": 503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999955952167511, "val/ratio_var": NaN }, { "episode": 504, "epoch": 0.09464788732394366, "eps": 0, "loss/policy_avg": -5.803018211736344e-05, "loss/value_avg": 0.6585785746574402, "lr": 1.491e-06, "objective/entropy": 132.10189819335938, "objective/kl": 36.69512939453125, "objective/non_score_reward": -1.8347567319869995, "objective/rlhf_reward": -11.03756046295166, "objective/scores": -9.202803611755371, "policy/approxkl_avg": 1.3248335051230242e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6678168773651123, "step": 504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999265074729919, "val/ratio_var": NaN }, { "episode": 505, "epoch": 0.09483568075117371, "eps": 0, "loss/policy_avg": -6.0299658798612654e-05, "loss/value_avg": 0.17370440065860748, "lr": 1.488e-06, "objective/entropy": 120.6812515258789, "objective/kl": 28.390167236328125, "objective/non_score_reward": -1.419508457183838, "objective/rlhf_reward": -11.395561218261719, "objective/scores": -9.976053237915039, "policy/approxkl_avg": 7.906366050747238e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7441959381103516, "step": 505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000027418136597, "val/ratio_var": NaN }, { "episode": 506, "epoch": 0.09502347417840376, "eps": 0, "loss/policy_avg": -1.340542212346918e-06, "loss/value_avg": 0.4897083044052124, "lr": 1.485e-06, "objective/entropy": 145.52560424804688, "objective/kl": 37.543243408203125, "objective/non_score_reward": -1.8771620988845825, "objective/rlhf_reward": -12.956381797790527, "objective/scores": -11.079219818115234, "policy/approxkl_avg": 1.3396203257798334e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6527099609375, "step": 506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999653100967407, "val/ratio_var": NaN }, { "episode": 507, "epoch": 0.09521126760563381, "eps": 0, "loss/policy_avg": -1.2964572306373157e-05, "loss/value_avg": 0.1347162276506424, "lr": 1.482e-06, "objective/entropy": 96.83988952636719, "objective/kl": 15.570594787597656, "objective/non_score_reward": -0.7785297632217407, "objective/rlhf_reward": -11.915651321411133, "objective/scores": -11.137121200561523, "policy/approxkl_avg": 9.147771606876631e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.682928204536438, "step": 507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999839067459106, "val/ratio_var": NaN }, { "episode": 508, "epoch": 0.09539906103286384, "eps": 0, "loss/policy_avg": 3.0782986868871376e-05, "loss/value_avg": 0.13617512583732605, "lr": 1.479e-06, "objective/entropy": 119.45001983642578, "objective/kl": 21.147838592529297, "objective/non_score_reward": -1.057391881942749, "objective/rlhf_reward": -10.759563446044922, "objective/scores": -9.702171325683594, "policy/approxkl_avg": 8.586974331592501e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.184466600418091, "step": 508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999862909317017, "val/ratio_var": NaN }, { "episode": 509, "epoch": 0.09558685446009389, "eps": 0, "loss/policy_avg": -3.362601637491025e-05, "loss/value_avg": 0.07302041351795197, "lr": 1.476e-06, "objective/entropy": 103.81346893310547, "objective/kl": 17.17847442626953, "objective/non_score_reward": -0.8589237332344055, "objective/rlhf_reward": -11.327434539794922, "objective/scores": -10.468510627746582, "policy/approxkl_avg": 8.447894828123026e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.960398554801941, "step": 509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999417662620544, "val/ratio_var": NaN }, { "episode": 510, "epoch": 0.09577464788732394, "eps": 0, "loss/policy_avg": 0.0001094476247089915, "loss/value_avg": 0.2607609033584595, "lr": 1.473e-06, "objective/entropy": 101.07476806640625, "objective/kl": 22.452938079833984, "objective/non_score_reward": -1.1226468086242676, "objective/rlhf_reward": -9.586967468261719, "objective/scores": -8.46432113647461, "policy/approxkl_avg": 1.9944984330777515e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.640017032623291, "step": 510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000124216079712, "val/ratio_var": NaN }, { "episode": 511, "epoch": 0.09596244131455399, "eps": 0, "loss/policy_avg": 4.996443749405444e-05, "loss/value_avg": 0.24135269224643707, "lr": 1.4700000000000001e-06, "objective/entropy": 75.67183685302734, "objective/kl": 18.957738876342773, "objective/non_score_reward": -0.9478868842124939, "objective/rlhf_reward": -10.539834976196289, "objective/scores": -9.591948509216309, "policy/approxkl_avg": 6.728853918502864e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7303190231323242, "step": 511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999659061431885, "val/ratio_var": NaN }, { "episode": 512, "epoch": 0.09615023474178404, "eps": 0, "loss/policy_avg": -4.9055746785597876e-05, "loss/value_avg": 0.29601821303367615, "lr": 1.467e-06, "objective/entropy": 109.93282318115234, "objective/kl": 18.21490478515625, "objective/non_score_reward": -0.9107453227043152, "objective/rlhf_reward": -12.78204345703125, "objective/scores": -11.871297836303711, "policy/approxkl_avg": 6.41921644728427e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2407479286193848, "step": 512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000041723251343, "val/ratio_var": NaN }, { "episode": 513, "epoch": 0.09633802816901409, "eps": 0, "loss/policy_avg": 6.052682874724269e-05, "loss/value_avg": 0.06561542302370071, "lr": 1.464e-06, "objective/entropy": 101.01287841796875, "objective/kl": 9.71306037902832, "objective/non_score_reward": -0.4856530427932739, "objective/rlhf_reward": -10.557515144348145, "objective/scores": -10.07186222076416, "policy/approxkl_avg": 1.0694029128899274e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9483850002288818, "step": 513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000157356262207, "val/ratio_var": NaN }, { "episode": 514, "epoch": 0.09652582159624414, "eps": 0, "loss/policy_avg": -1.2705911103694234e-05, "loss/value_avg": 0.1592652052640915, "lr": 1.461e-06, "objective/entropy": 81.37966918945312, "objective/kl": 24.8903751373291, "objective/non_score_reward": -1.244518756866455, "objective/rlhf_reward": -12.694019317626953, "objective/scores": -11.44950008392334, "policy/approxkl_avg": 6.137814523299312e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4859273433685303, "step": 514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999901652336121, "val/ratio_var": NaN }, { "episode": 515, "epoch": 0.09671361502347418, "eps": 0, "loss/policy_avg": -2.0611960280803032e-05, "loss/value_avg": 0.09607363492250443, "lr": 1.458e-06, "objective/entropy": 86.64463806152344, "objective/kl": 15.6563081741333, "objective/non_score_reward": -0.7828153967857361, "objective/rlhf_reward": -11.340073585510254, "objective/scores": -10.557258605957031, "policy/approxkl_avg": 9.746821660883143e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.042984962463379, "step": 515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999886155128479, "val/ratio_var": NaN }, { "episode": 516, "epoch": 0.09690140845070423, "eps": 0, "loss/policy_avg": 6.041436790837906e-05, "loss/value_avg": 0.14302361011505127, "lr": 1.455e-06, "objective/entropy": 75.64823913574219, "objective/kl": 19.600433349609375, "objective/non_score_reward": -0.9800218343734741, "objective/rlhf_reward": -10.766225814819336, "objective/scores": -9.78620433807373, "policy/approxkl_avg": 6.979640687632127e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.728386640548706, "step": 516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999775290489197, "val/ratio_var": NaN }, { "episode": 517, "epoch": 0.09708920187793427, "eps": 0, "loss/policy_avg": 8.5380845121108e-05, "loss/value_avg": 0.1005096435546875, "lr": 1.452e-06, "objective/entropy": 108.2346420288086, "objective/kl": 16.0548095703125, "objective/non_score_reward": -0.8027405738830566, "objective/rlhf_reward": -11.85751724243164, "objective/scores": -11.054777145385742, "policy/approxkl_avg": 1.3891097694340715e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0766186714172363, "step": 517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001190900802612, "val/ratio_var": NaN }, { "episode": 518, "epoch": 0.09727699530516432, "eps": 0, "loss/policy_avg": -2.57851934293285e-05, "loss/value_avg": 0.08503925800323486, "lr": 1.449e-06, "objective/entropy": 106.13502502441406, "objective/kl": 11.717206954956055, "objective/non_score_reward": -0.5858603715896606, "objective/rlhf_reward": -11.361739158630371, "objective/scores": -10.77587890625, "policy/approxkl_avg": 6.457118217895186e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1344478130340576, "step": 518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999451041221619, "val/ratio_var": NaN }, { "episode": 519, "epoch": 0.09746478873239436, "eps": 0, "loss/policy_avg": -2.4219729311880656e-05, "loss/value_avg": 0.11787485331296921, "lr": 1.446e-06, "objective/entropy": 103.38117980957031, "objective/kl": 22.755802154541016, "objective/non_score_reward": -1.1377900838851929, "objective/rlhf_reward": -12.34152889251709, "objective/scores": -11.203739166259766, "policy/approxkl_avg": 7.759850007005298e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9159120321273804, "step": 519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000334978103638, "val/ratio_var": NaN }, { "episode": 520, "epoch": 0.09765258215962441, "eps": 0, "loss/policy_avg": -8.440017700195312e-05, "loss/value_avg": 2.1647255420684814, "lr": 1.443e-06, "objective/entropy": 86.23223876953125, "objective/kl": 20.014057159423828, "objective/non_score_reward": -1.0007028579711914, "objective/rlhf_reward": -10.824135780334473, "objective/scores": -9.823432922363281, "policy/approxkl_avg": 6.890677184401284e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8245043754577637, "step": 520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999527335166931, "val/ratio_var": NaN }, { "episode": 521, "epoch": 0.09784037558685446, "eps": 0, "loss/policy_avg": 8.008164877537638e-05, "loss/value_avg": 0.20307418704032898, "lr": 1.44e-06, "objective/entropy": 94.36631774902344, "objective/kl": 10.032766342163086, "objective/non_score_reward": -0.5016383528709412, "objective/rlhf_reward": -10.642638206481934, "objective/scores": -10.140999794006348, "policy/approxkl_avg": 1.1392800303156037e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.093026876449585, "step": 521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999598264694214, "val/ratio_var": NaN }, { "episode": 522, "epoch": 0.09802816901408451, "eps": 0, "loss/policy_avg": 0.00011134597298223525, "loss/value_avg": 0.12453865259885788, "lr": 1.437e-06, "objective/entropy": 105.41987609863281, "objective/kl": 15.233728408813477, "objective/non_score_reward": -0.761686384677887, "objective/rlhf_reward": -12.069845199584961, "objective/scores": -11.308158874511719, "policy/approxkl_avg": 1.1167326618988227e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9362033605575562, "step": 522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000271797180176, "val/ratio_var": NaN }, { "episode": 523, "epoch": 0.09821596244131456, "eps": 0, "loss/policy_avg": -4.249248740961775e-05, "loss/value_avg": 0.13133065402507782, "lr": 1.434e-06, "objective/entropy": 50.86054992675781, "objective/kl": 26.311765670776367, "objective/non_score_reward": -1.3155882358551025, "objective/rlhf_reward": -12.163162231445312, "objective/scores": -10.847574234008789, "policy/approxkl_avg": 2.852095271066446e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5508733987808228, "step": 523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000194311141968, "val/ratio_var": NaN }, { "episode": 524, "epoch": 0.0984037558685446, "eps": 0, "loss/policy_avg": 4.156580689596012e-05, "loss/value_avg": 0.09399835020303726, "lr": 1.431e-06, "objective/entropy": 82.41399383544922, "objective/kl": 18.2725830078125, "objective/non_score_reward": -0.9136292338371277, "objective/rlhf_reward": -11.496065139770508, "objective/scores": -10.582435607910156, "policy/approxkl_avg": 7.873784113598958e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8005625009536743, "step": 524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999922513961792, "val/ratio_var": NaN }, { "episode": 525, "epoch": 0.09859154929577464, "eps": 0, "loss/policy_avg": -4.280737994122319e-05, "loss/value_avg": 0.07927043735980988, "lr": 1.428e-06, "objective/entropy": 41.517757415771484, "objective/kl": 11.965869903564453, "objective/non_score_reward": -0.5982934832572937, "objective/rlhf_reward": -11.014379501342773, "objective/scores": -10.416086196899414, "policy/approxkl_avg": 4.311128520839702e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1555613279342651, "step": 525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000276565551758, "val/ratio_var": NaN }, { "episode": 526, "epoch": 0.09877934272300469, "eps": 0, "loss/policy_avg": -7.9352903412655e-06, "loss/value_avg": 0.24933919310569763, "lr": 1.425e-06, "objective/entropy": 80.27218627929688, "objective/kl": 31.4727725982666, "objective/non_score_reward": -1.573638677597046, "objective/rlhf_reward": -12.663581848144531, "objective/scores": -11.089942932128906, "policy/approxkl_avg": 8.893250225128213e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5925453901290894, "step": 526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999551177024841, "val/ratio_var": NaN }, { "episode": 527, "epoch": 0.09896713615023474, "eps": 0, "loss/policy_avg": 4.7611742047593e-05, "loss/value_avg": 0.32789674401283264, "lr": 1.422e-06, "objective/entropy": 130.66769409179688, "objective/kl": 27.161479949951172, "objective/non_score_reward": -1.3580739498138428, "objective/rlhf_reward": -13.387893676757812, "objective/scores": -12.02981948852539, "policy/approxkl_avg": 9.81212480155591e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.411264181137085, "step": 527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999759793281555, "val/ratio_var": NaN }, { "episode": 528, "epoch": 0.09915492957746479, "eps": 0, "loss/policy_avg": -4.279838321963325e-05, "loss/value_avg": 0.13903763890266418, "lr": 1.4189999999999999e-06, "objective/entropy": 117.62532043457031, "objective/kl": 23.646146774291992, "objective/non_score_reward": -1.182307481765747, "objective/rlhf_reward": -11.630778312683105, "objective/scores": -10.448471069335938, "policy/approxkl_avg": 7.524138823100657e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2901220321655273, "step": 528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999750852584839, "val/ratio_var": NaN }, { "episode": 529, "epoch": 0.09934272300469484, "eps": 0, "loss/policy_avg": -3.2267482310999185e-05, "loss/value_avg": 0.15419122576713562, "lr": 1.4159999999999999e-06, "objective/entropy": 87.92061614990234, "objective/kl": 15.916977882385254, "objective/non_score_reward": -0.7958488464355469, "objective/rlhf_reward": -12.030367851257324, "objective/scores": -11.234519004821777, "policy/approxkl_avg": 5.582483808552752e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5167973041534424, "step": 529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999701976776123, "val/ratio_var": NaN }, { "episode": 530, "epoch": 0.09953051643192488, "eps": 0, "loss/policy_avg": 1.0301481779606547e-05, "loss/value_avg": 0.8539859652519226, "lr": 1.4129999999999999e-06, "objective/entropy": 107.2091293334961, "objective/kl": 14.351239204406738, "objective/non_score_reward": -0.7175619006156921, "objective/rlhf_reward": -10.418474197387695, "objective/scores": -9.700912475585938, "policy/approxkl_avg": 1.1372728891956285e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8744657039642334, "step": 530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999556541442871, "val/ratio_var": NaN }, { "episode": 531, "epoch": 0.09971830985915493, "eps": 0, "loss/policy_avg": -3.9748425479047e-05, "loss/value_avg": 0.14023016393184662, "lr": 1.41e-06, "objective/entropy": 82.30229187011719, "objective/kl": 16.559858322143555, "objective/non_score_reward": -0.8279929161071777, "objective/rlhf_reward": -11.37942123413086, "objective/scores": -10.55142879486084, "policy/approxkl_avg": 4.400866870923892e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5182576179504395, "step": 531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999441504478455, "val/ratio_var": NaN }, { "episode": 532, "epoch": 0.09990610328638498, "eps": 0, "loss/policy_avg": -3.5411907447269186e-05, "loss/value_avg": 0.4439960718154907, "lr": 1.407e-06, "objective/entropy": 102.79071807861328, "objective/kl": 26.929332733154297, "objective/non_score_reward": -1.3464666604995728, "objective/rlhf_reward": -10.01906681060791, "objective/scores": -8.672599792480469, "policy/approxkl_avg": 6.952573272656082e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1065926551818848, "step": 532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999454021453857, "val/ratio_var": NaN }, { "episode": 533, "epoch": 0.10009389671361503, "eps": 0, "loss/policy_avg": 3.8875725294929e-05, "loss/value_avg": 0.13096942007541656, "lr": 1.404e-06, "objective/entropy": 93.50123596191406, "objective/kl": 25.72262191772461, "objective/non_score_reward": -1.2861311435699463, "objective/rlhf_reward": -12.022703170776367, "objective/scores": -10.736572265625, "policy/approxkl_avg": 8.430961173644391e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8544899225234985, "step": 533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000014305114746, "val/ratio_var": NaN }, { "episode": 534, "epoch": 0.10028169014084506, "eps": 0, "loss/policy_avg": -8.00366688054055e-05, "loss/value_avg": 0.20319892466068268, "lr": 1.401e-06, "objective/entropy": 131.54049682617188, "objective/kl": 23.741226196289062, "objective/non_score_reward": -1.1870611906051636, "objective/rlhf_reward": -11.774001121520996, "objective/scores": -10.586939811706543, "policy/approxkl_avg": 1.3847098045971507e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.5452051162719727, "step": 534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999386668205261, "val/ratio_var": NaN }, { "episode": 535, "epoch": 0.10046948356807511, "eps": 0, "loss/policy_avg": 2.9271503080963157e-05, "loss/value_avg": 0.19158582389354706, "lr": 1.3980000000000002e-06, "objective/entropy": 129.72727966308594, "objective/kl": 15.215906143188477, "objective/non_score_reward": -0.7607953548431396, "objective/rlhf_reward": -11.199671745300293, "objective/scores": -10.438876152038574, "policy/approxkl_avg": 6.609170100091433e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.117657423019409, "step": 535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999701976776123, "val/ratio_var": NaN }, { "episode": 536, "epoch": 0.10065727699530516, "eps": 0, "loss/policy_avg": 0.000143267068779096, "loss/value_avg": 0.1817111223936081, "lr": 1.3950000000000002e-06, "objective/entropy": 120.94153594970703, "objective/kl": 13.132787704467773, "objective/non_score_reward": -0.6566393375396729, "objective/rlhf_reward": -10.715518951416016, "objective/scores": -10.058879852294922, "policy/approxkl_avg": 1.6786758294529136e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7306967973709106, "step": 536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000108480453491, "val/ratio_var": NaN }, { "episode": 537, "epoch": 0.10084507042253521, "eps": 0, "loss/policy_avg": -3.1448758818442e-05, "loss/value_avg": 0.12285052239894867, "lr": 1.3920000000000002e-06, "objective/entropy": 102.39344787597656, "objective/kl": 12.914897918701172, "objective/non_score_reward": -0.6457449197769165, "objective/rlhf_reward": -12.36696720123291, "objective/scores": -11.721221923828125, "policy/approxkl_avg": 8.486040314892307e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8693249225616455, "step": 537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999923586845398, "val/ratio_var": NaN }, { "episode": 538, "epoch": 0.10103286384976526, "eps": 0, "loss/policy_avg": -5.713948485208675e-05, "loss/value_avg": 0.1363973468542099, "lr": 1.3890000000000002e-06, "objective/entropy": 118.07118225097656, "objective/kl": 10.795971870422363, "objective/non_score_reward": -0.5397986173629761, "objective/rlhf_reward": -11.804311752319336, "objective/scores": -11.26451301574707, "policy/approxkl_avg": 1.5161856481427094e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7805083990097046, "step": 538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000094175338745, "val/ratio_var": NaN }, { "episode": 539, "epoch": 0.10122065727699531, "eps": 0, "loss/policy_avg": 6.77468642606982e-06, "loss/value_avg": 0.7254307866096497, "lr": 1.3860000000000002e-06, "objective/entropy": 131.8334197998047, "objective/kl": 20.175315856933594, "objective/non_score_reward": -1.0087658166885376, "objective/rlhf_reward": -9.887797355651855, "objective/scores": -8.87903118133545, "policy/approxkl_avg": 1.2026188755953626e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2924575805664062, "step": 539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000284910202026, "val/ratio_var": NaN }, { "episode": 540, "epoch": 0.10140845070422536, "eps": 0, "loss/policy_avg": 3.481810927041806e-05, "loss/value_avg": 1.6269960403442383, "lr": 1.3830000000000001e-06, "objective/entropy": 127.52076721191406, "objective/kl": 15.444443702697754, "objective/non_score_reward": -0.7722222208976746, "objective/rlhf_reward": -10.380410194396973, "objective/scores": -9.608187675476074, "policy/approxkl_avg": 1.0216670176532716e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2494864463806152, "step": 540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001311302185059, "val/ratio_var": NaN }, { "episode": 541, "epoch": 0.1015962441314554, "eps": 0, "loss/policy_avg": -1.129114389186725e-05, "loss/value_avg": 0.09749162197113037, "lr": 1.3800000000000001e-06, "objective/entropy": 86.97340393066406, "objective/kl": 17.7469482421875, "objective/non_score_reward": -0.8873474597930908, "objective/rlhf_reward": -11.489445686340332, "objective/scores": -10.60209846496582, "policy/approxkl_avg": 1.0764473756808002e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8102797269821167, "step": 541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000344514846802, "val/ratio_var": NaN }, { "episode": 542, "epoch": 0.10178403755868545, "eps": 0, "loss/policy_avg": 1.3819280866300687e-05, "loss/value_avg": 1.797200322151184, "lr": 1.3770000000000001e-06, "objective/entropy": 76.25851440429688, "objective/kl": 12.68696117401123, "objective/non_score_reward": -0.6343480944633484, "objective/rlhf_reward": -8.299803733825684, "objective/scores": -7.665455341339111, "policy/approxkl_avg": 4.5989111185917864e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5723024606704712, "step": 542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999632835388184, "val/ratio_var": NaN }, { "episode": 543, "epoch": 0.10197183098591549, "eps": 0, "loss/policy_avg": 7.928992272354662e-05, "loss/value_avg": 1.3519549369812012, "lr": 1.374e-06, "objective/entropy": 58.4592170715332, "objective/kl": 15.143638610839844, "objective/non_score_reward": -0.7571819424629211, "objective/rlhf_reward": -6.810471534729004, "objective/scores": -6.053289413452148, "policy/approxkl_avg": 6.062220592184531e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4300445318222046, "step": 543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999024271965027, "val/ratio_var": NaN }, { "episode": 544, "epoch": 0.10215962441314554, "eps": 0, "loss/policy_avg": 3.132280471618287e-05, "loss/value_avg": 0.1580098420381546, "lr": 1.371e-06, "objective/entropy": 100.18933868408203, "objective/kl": 15.767082214355469, "objective/non_score_reward": -0.7883540391921997, "objective/rlhf_reward": -11.833037376403809, "objective/scores": -11.044683456420898, "policy/approxkl_avg": 1.398214379833007e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7623060941696167, "step": 544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999916136264801, "val/ratio_var": NaN }, { "episode": 545, "epoch": 0.10234741784037558, "eps": 0, "loss/policy_avg": -1.4620007277699187e-05, "loss/value_avg": 1.1174957752227783, "lr": 1.368e-06, "objective/entropy": 79.13148498535156, "objective/kl": 26.050569534301758, "objective/non_score_reward": -1.3025283813476562, "objective/rlhf_reward": -6.897045135498047, "objective/scores": -5.594516754150391, "policy/approxkl_avg": 6.330240154284184e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.453908085823059, "step": 545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999944567680359, "val/ratio_var": NaN }, { "episode": 546, "epoch": 0.10253521126760563, "eps": 0, "loss/policy_avg": 1.0681602361728437e-05, "loss/value_avg": 0.38082587718963623, "lr": 1.365e-06, "objective/entropy": 108.76702117919922, "objective/kl": 29.808197021484375, "objective/non_score_reward": -1.4904098510742188, "objective/rlhf_reward": -10.654052734375, "objective/scores": -9.163642883300781, "policy/approxkl_avg": 1.0156056617915965e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.035907030105591, "step": 546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999415874481201, "val/ratio_var": NaN }, { "episode": 547, "epoch": 0.10272300469483568, "eps": 0, "loss/policy_avg": -1.6833251720527187e-05, "loss/value_avg": 0.07994166016578674, "lr": 1.362e-06, "objective/entropy": 52.19115447998047, "objective/kl": 13.776519775390625, "objective/non_score_reward": -0.6888260245323181, "objective/rlhf_reward": -11.535087585449219, "objective/scores": -10.846261978149414, "policy/approxkl_avg": 5.85633443961342e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5049450397491455, "step": 547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000413656234741, "val/ratio_var": NaN }, { "episode": 548, "epoch": 0.10291079812206573, "eps": 0, "loss/policy_avg": -5.465633512358181e-05, "loss/value_avg": 0.15196876227855682, "lr": 1.359e-06, "objective/entropy": 68.89495849609375, "objective/kl": 16.073650360107422, "objective/non_score_reward": -0.8036825656890869, "objective/rlhf_reward": -12.57775592803955, "objective/scores": -11.774073600769043, "policy/approxkl_avg": 9.968177749897222e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3742905855178833, "step": 548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.99994957447052, "val/ratio_var": NaN }, { "episode": 549, "epoch": 0.10309859154929578, "eps": 0, "loss/policy_avg": -2.2741984139429405e-05, "loss/value_avg": 0.168319433927536, "lr": 1.356e-06, "objective/entropy": 66.44024658203125, "objective/kl": 14.664892196655273, "objective/non_score_reward": -0.7332445979118347, "objective/rlhf_reward": -12.248862266540527, "objective/scores": -11.515617370605469, "policy/approxkl_avg": 1.000853089294651e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1768476963043213, "step": 549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000771284103394, "val/ratio_var": NaN }, { "episode": 550, "epoch": 0.10328638497652583, "eps": 0, "loss/policy_avg": 1.0467924766999204e-05, "loss/value_avg": 0.0853177011013031, "lr": 1.353e-06, "objective/entropy": 109.40479278564453, "objective/kl": 26.528594970703125, "objective/non_score_reward": -1.326429843902588, "objective/rlhf_reward": -11.823736190795898, "objective/scores": -10.497305870056152, "policy/approxkl_avg": 1.1875266636707238e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.229107618331909, "step": 550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999791383743286, "val/ratio_var": NaN }, { "episode": 551, "epoch": 0.10347417840375586, "eps": 0, "loss/policy_avg": -9.720730304252356e-05, "loss/value_avg": 0.11075206845998764, "lr": 1.35e-06, "objective/entropy": 97.41773223876953, "objective/kl": 23.081867218017578, "objective/non_score_reward": -1.1540933847427368, "objective/rlhf_reward": -12.473712921142578, "objective/scores": -11.319619178771973, "policy/approxkl_avg": 1.5177678847067e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2699227333068848, "step": 551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001074075698853, "val/ratio_var": NaN }, { "episode": 552, "epoch": 0.10366197183098591, "eps": 0, "loss/policy_avg": -3.523196937749162e-05, "loss/value_avg": 0.15344873070716858, "lr": 1.347e-06, "objective/entropy": 57.82018280029297, "objective/kl": 20.626697540283203, "objective/non_score_reward": -1.0313348770141602, "objective/rlhf_reward": -12.513197898864746, "objective/scores": -11.481863021850586, "policy/approxkl_avg": 5.95585802898313e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2880651950836182, "step": 552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999997019767761, "val/ratio_var": NaN }, { "episode": 553, "epoch": 0.10384976525821596, "eps": 0, "loss/policy_avg": -3.2393436413258314e-05, "loss/value_avg": 0.10735967755317688, "lr": 1.344e-06, "objective/entropy": 81.76519012451172, "objective/kl": 15.046640396118164, "objective/non_score_reward": -0.7523319721221924, "objective/rlhf_reward": -12.149967193603516, "objective/scores": -11.397635459899902, "policy/approxkl_avg": 9.581295756788677e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2707197666168213, "step": 553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000009536743164, "val/ratio_var": NaN }, { "episode": 554, "epoch": 0.10403755868544601, "eps": 0, "loss/policy_avg": 1.3945237924417597e-06, "loss/value_avg": 0.10829874128103256, "lr": 1.3410000000000002e-06, "objective/entropy": 73.16210174560547, "objective/kl": 10.44714641571045, "objective/non_score_reward": -0.5223572850227356, "objective/rlhf_reward": -11.095977783203125, "objective/scores": -10.573620796203613, "policy/approxkl_avg": 6.329180735065165e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4861301183700562, "step": 554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000245571136475, "val/ratio_var": NaN }, { "episode": 555, "epoch": 0.10422535211267606, "eps": 0, "loss/policy_avg": 5.1138536946382374e-05, "loss/value_avg": 0.5579447746276855, "lr": 1.3380000000000001e-06, "objective/entropy": 129.35574340820312, "objective/kl": 10.126901626586914, "objective/non_score_reward": -0.5063451528549194, "objective/rlhf_reward": -9.441937446594238, "objective/scores": -8.935592651367188, "policy/approxkl_avg": 9.914164422752947e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.5499889850616455, "step": 555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000126361846924, "val/ratio_var": NaN }, { "episode": 556, "epoch": 0.1044131455399061, "eps": 0, "loss/policy_avg": 2.233037412224803e-05, "loss/value_avg": 0.23257611691951752, "lr": 1.3350000000000001e-06, "objective/entropy": 83.51187133789062, "objective/kl": 20.973854064941406, "objective/non_score_reward": -1.0486927032470703, "objective/rlhf_reward": -12.939436912536621, "objective/scores": -11.89074420928955, "policy/approxkl_avg": 7.0403679330866e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7265256643295288, "step": 556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000072717666626, "val/ratio_var": NaN }, { "episode": 557, "epoch": 0.10460093896713615, "eps": 0, "loss/policy_avg": 9.377497190143913e-05, "loss/value_avg": 0.2380802184343338, "lr": 1.3320000000000001e-06, "objective/entropy": 107.23419189453125, "objective/kl": 18.002805709838867, "objective/non_score_reward": -0.9001402854919434, "objective/rlhf_reward": -12.61275863647461, "objective/scores": -11.712617874145508, "policy/approxkl_avg": 2.2428028501053632e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.030782461166382, "step": 557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.000217080116272, "val/ratio_var": NaN }, { "episode": 558, "epoch": 0.1047887323943662, "eps": 0, "loss/policy_avg": 2.5024954084074125e-05, "loss/value_avg": 0.898361325263977, "lr": 1.3290000000000001e-06, "objective/entropy": 73.61134338378906, "objective/kl": 20.963552474975586, "objective/non_score_reward": -1.0481775999069214, "objective/rlhf_reward": -10.507295608520508, "objective/scores": -9.459117889404297, "policy/approxkl_avg": 6.06519208190548e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3218533992767334, "step": 558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999973475933075, "val/ratio_var": NaN }, { "episode": 559, "epoch": 0.10497652582159625, "eps": 0, "loss/policy_avg": 5.0265833124285564e-05, "loss/value_avg": 0.055334221571683884, "lr": 1.326e-06, "objective/entropy": 74.64337158203125, "objective/kl": 14.103096008300781, "objective/non_score_reward": -0.7051547765731812, "objective/rlhf_reward": -11.058335304260254, "objective/scores": -10.353180885314941, "policy/approxkl_avg": 9.664422861987987e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7022175788879395, "step": 559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998776316642761, "val/ratio_var": NaN }, { "episode": 560, "epoch": 0.10516431924882629, "eps": 0, "loss/policy_avg": -1.7184131138492376e-05, "loss/value_avg": 1.1872165203094482, "lr": 1.323e-06, "objective/entropy": 103.6999740600586, "objective/kl": 34.55077362060547, "objective/non_score_reward": -1.727538824081421, "objective/rlhf_reward": -11.881194114685059, "objective/scores": -10.153655052185059, "policy/approxkl_avg": 7.20427095757259e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9477170705795288, "step": 560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001111030578613, "val/ratio_var": NaN }, { "episode": 561, "epoch": 0.10535211267605633, "eps": 0, "loss/policy_avg": -1.1462085240054876e-05, "loss/value_avg": 0.1990371197462082, "lr": 1.32e-06, "objective/entropy": 82.73759460449219, "objective/kl": 12.077787399291992, "objective/non_score_reward": -0.6038893461227417, "objective/rlhf_reward": -11.410911560058594, "objective/scores": -10.807022094726562, "policy/approxkl_avg": 1.193136540678097e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5029044151306152, "step": 561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000122785568237, "val/ratio_var": NaN }, { "episode": 562, "epoch": 0.10553990610328638, "eps": 0, "loss/policy_avg": 4.4546039134729654e-05, "loss/value_avg": 4.6616034507751465, "lr": 1.317e-06, "objective/entropy": 35.40876770019531, "objective/kl": 25.041648864746094, "objective/non_score_reward": -1.2520825862884521, "objective/rlhf_reward": -4.948711395263672, "objective/scores": -3.6966285705566406, "policy/approxkl_avg": 8.136983353779215e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5447733402252197, "step": 562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.99993896484375, "val/ratio_var": NaN }, { "episode": 563, "epoch": 0.10572769953051643, "eps": 0, "loss/policy_avg": 3.2352952985093e-05, "loss/value_avg": 0.09764180332422256, "lr": 1.314e-06, "objective/entropy": 66.52935791015625, "objective/kl": 19.12055206298828, "objective/non_score_reward": -0.956027626991272, "objective/rlhf_reward": -12.049025535583496, "objective/scores": -11.092997550964355, "policy/approxkl_avg": 5.679325454366335e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.573414921760559, "step": 563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000251531600952, "val/ratio_var": NaN }, { "episode": 564, "epoch": 0.10591549295774648, "eps": 0, "loss/policy_avg": 3.3198662094946485e-06, "loss/value_avg": 0.18153122067451477, "lr": 1.311e-06, "objective/entropy": 86.74788665771484, "objective/kl": 18.185230255126953, "objective/non_score_reward": -0.9092614650726318, "objective/rlhf_reward": -11.32772445678711, "objective/scores": -10.418462753295898, "policy/approxkl_avg": 6.185384648915715e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8719459772109985, "step": 564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00006103515625, "val/ratio_var": NaN }, { "episode": 565, "epoch": 0.10610328638497653, "eps": 0, "loss/policy_avg": -6.217776535777375e-05, "loss/value_avg": 0.22426439821720123, "lr": 1.308e-06, "objective/entropy": 106.234130859375, "objective/kl": 14.434924125671387, "objective/non_score_reward": -0.7217462658882141, "objective/rlhf_reward": -10.551556587219238, "objective/scores": -9.82981014251709, "policy/approxkl_avg": 9.05394443861951e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.003913164138794, "step": 565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999119639396667, "val/ratio_var": NaN }, { "episode": 566, "epoch": 0.10629107981220658, "eps": 0, "loss/policy_avg": 1.3765298945145332e-06, "loss/value_avg": 0.2732374370098114, "lr": 1.305e-06, "objective/entropy": 129.91943359375, "objective/kl": 23.946014404296875, "objective/non_score_reward": -1.197300672531128, "objective/rlhf_reward": -12.275349617004395, "objective/scores": -11.078048706054688, "policy/approxkl_avg": 1.637395286024912e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2322123050689697, "step": 566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999211430549622, "val/ratio_var": NaN }, { "episode": 567, "epoch": 0.10647887323943662, "eps": 0, "loss/policy_avg": -5.963163494016044e-05, "loss/value_avg": 0.18505217134952545, "lr": 1.302e-06, "objective/entropy": 96.85523986816406, "objective/kl": 19.55370330810547, "objective/non_score_reward": -0.9776851534843445, "objective/rlhf_reward": -11.57748794555664, "objective/scores": -10.59980297088623, "policy/approxkl_avg": 1.3141814747541503e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6107592582702637, "step": 567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001475811004639, "val/ratio_var": NaN }, { "episode": 568, "epoch": 0.10666666666666667, "eps": 0, "loss/policy_avg": 5.715298175346106e-05, "loss/value_avg": 1.3419132232666016, "lr": 1.299e-06, "objective/entropy": 140.5446319580078, "objective/kl": 28.85759925842285, "objective/non_score_reward": -1.4428799152374268, "objective/rlhf_reward": -11.114129066467285, "objective/scores": -9.671249389648438, "policy/approxkl_avg": 8.666984996352767e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.249875545501709, "step": 568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999790191650391, "val/ratio_var": NaN }, { "episode": 569, "epoch": 0.10685446009389671, "eps": 0, "loss/policy_avg": -5.1055314543191344e-05, "loss/value_avg": 0.578345537185669, "lr": 1.296e-06, "objective/entropy": 106.51056671142578, "objective/kl": 14.432626724243164, "objective/non_score_reward": -0.7216314077377319, "objective/rlhf_reward": -9.108094215393066, "objective/scores": -8.386463165283203, "policy/approxkl_avg": 1.3504484286386287e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.144669532775879, "step": 569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999620318412781, "val/ratio_var": NaN }, { "episode": 570, "epoch": 0.10704225352112676, "eps": 0, "loss/policy_avg": -1.1974910194112454e-05, "loss/value_avg": 0.2492661327123642, "lr": 1.293e-06, "objective/entropy": 77.47781372070312, "objective/kl": 18.654376983642578, "objective/non_score_reward": -0.932718813419342, "objective/rlhf_reward": -10.621548652648926, "objective/scores": -9.68882942199707, "policy/approxkl_avg": 8.706187770712859e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.514650821685791, "step": 570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999580979347229, "val/ratio_var": NaN }, { "episode": 571, "epoch": 0.1072300469483568, "eps": 0, "loss/policy_avg": 5.402655006037094e-05, "loss/value_avg": 0.37324848771095276, "lr": 1.29e-06, "objective/entropy": 43.76105499267578, "objective/kl": 23.892578125, "objective/non_score_reward": -1.1946289539337158, "objective/rlhf_reward": -11.559858322143555, "objective/scores": -10.365229606628418, "policy/approxkl_avg": 4.2770782471279745e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9408235549926758, "step": 571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000882148742676, "val/ratio_var": NaN }, { "episode": 572, "epoch": 0.10741784037558685, "eps": 0, "loss/policy_avg": -1.5177816749201156e-05, "loss/value_avg": 0.10228291153907776, "lr": 1.287e-06, "objective/entropy": 89.10235595703125, "objective/kl": 21.23446273803711, "objective/non_score_reward": -1.061723232269287, "objective/rlhf_reward": -12.431123733520508, "objective/scores": -11.369400024414062, "policy/approxkl_avg": 4.19129904116744e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8734614849090576, "step": 572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000492334365845, "val/ratio_var": NaN }, { "episode": 573, "epoch": 0.1076056338028169, "eps": 0, "loss/policy_avg": -5.480928302858956e-05, "loss/value_avg": 1.7282055616378784, "lr": 1.284e-06, "objective/entropy": 94.68343353271484, "objective/kl": 15.986305236816406, "objective/non_score_reward": -0.7993152737617493, "objective/rlhf_reward": -10.998335838317871, "objective/scores": -10.199020385742188, "policy/approxkl_avg": 1.5968369382335368e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0071053504943848, "step": 573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999410510063171, "val/ratio_var": NaN }, { "episode": 574, "epoch": 0.10779342723004695, "eps": 0, "loss/policy_avg": 3.53084433299955e-05, "loss/value_avg": 0.6231512427330017, "lr": 1.281e-06, "objective/entropy": 52.635887145996094, "objective/kl": 23.12152862548828, "objective/non_score_reward": -1.156076431274414, "objective/rlhf_reward": -11.370244979858398, "objective/scores": -10.214168548583984, "policy/approxkl_avg": 7.872765905858614e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2279720306396484, "step": 574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000012993812561, "val/ratio_var": NaN }, { "episode": 575, "epoch": 0.107981220657277, "eps": 0, "loss/policy_avg": -4.281637666281313e-05, "loss/value_avg": 1.4902806282043457, "lr": 1.278e-06, "objective/entropy": 73.43919372558594, "objective/kl": 32.26364517211914, "objective/non_score_reward": -1.6131823062896729, "objective/rlhf_reward": -9.778216361999512, "objective/scores": -8.165034294128418, "policy/approxkl_avg": 5.752738019282333e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2292065620422363, "step": 575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000035762786865, "val/ratio_var": NaN }, { "episode": 576, "epoch": 0.10816901408450705, "eps": 0, "loss/policy_avg": -4.264543804310961e-06, "loss/value_avg": 1.015428066253662, "lr": 1.275e-06, "objective/entropy": 59.01552200317383, "objective/kl": 43.666160583496094, "objective/non_score_reward": -2.1833081245422363, "objective/rlhf_reward": -10.030560493469238, "objective/scores": -7.847252368927002, "policy/approxkl_avg": 8.045840615977795e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0922068357467651, "step": 576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999663829803467, "val/ratio_var": NaN }, { "episode": 577, "epoch": 0.10835680751173708, "eps": 0, "loss/policy_avg": -0.00011908333544852212, "loss/value_avg": 0.08941687643527985, "lr": 1.272e-06, "objective/entropy": 136.62681579589844, "objective/kl": 25.73004722595215, "objective/non_score_reward": -1.2865023612976074, "objective/rlhf_reward": -12.141712188720703, "objective/scores": -10.855209350585938, "policy/approxkl_avg": 1.8473944862762437e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0919580459594727, "step": 577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999953031539917, "val/ratio_var": NaN }, { "episode": 578, "epoch": 0.10854460093896713, "eps": 0, "loss/policy_avg": 7.989271398400888e-05, "loss/value_avg": 0.05692410096526146, "lr": 1.269e-06, "objective/entropy": 84.63134765625, "objective/kl": 21.897052764892578, "objective/non_score_reward": -1.0948525667190552, "objective/rlhf_reward": -11.80732536315918, "objective/scores": -10.712472915649414, "policy/approxkl_avg": 7.310377725389117e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7021046876907349, "step": 578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999762773513794, "val/ratio_var": NaN }, { "episode": 579, "epoch": 0.10873239436619718, "eps": 0, "loss/policy_avg": 1.189393788081361e-05, "loss/value_avg": 0.1829027533531189, "lr": 1.266e-06, "objective/entropy": 118.58077239990234, "objective/kl": 20.336811065673828, "objective/non_score_reward": -1.0168406963348389, "objective/rlhf_reward": -10.865767478942871, "objective/scores": -9.848926544189453, "policy/approxkl_avg": 9.647976639826084e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.076280117034912, "step": 579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0001308917999268, "val/ratio_var": NaN }, { "episode": 580, "epoch": 0.10892018779342723, "eps": 0, "loss/policy_avg": 1.2460744983400218e-05, "loss/value_avg": 0.06373844295740128, "lr": 1.263e-06, "objective/entropy": 64.80027770996094, "objective/kl": 4.077250957489014, "objective/non_score_reward": -0.2038625180721283, "objective/rlhf_reward": -11.11760139465332, "objective/scores": -10.913739204406738, "policy/approxkl_avg": 4.352036953036986e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.405890703201294, "step": 580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000874996185303, "val/ratio_var": NaN }, { "episode": 581, "epoch": 0.10910798122065728, "eps": 0, "loss/policy_avg": 2.901509105868172e-05, "loss/value_avg": 1.2080702781677246, "lr": 1.26e-06, "objective/entropy": 76.90887451171875, "objective/kl": 29.805206298828125, "objective/non_score_reward": -1.4902604818344116, "objective/rlhf_reward": -11.224152565002441, "objective/scores": -9.733892440795898, "policy/approxkl_avg": 6.136101404763394e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5316715240478516, "step": 581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999606013298035, "val/ratio_var": NaN }, { "episode": 582, "epoch": 0.10929577464788733, "eps": 0, "loss/policy_avg": -5.067519396106945e-06, "loss/value_avg": 0.335862398147583, "lr": 1.257e-06, "objective/entropy": 128.10198974609375, "objective/kl": 25.620407104492188, "objective/non_score_reward": -1.2810204029083252, "objective/rlhf_reward": -13.422629356384277, "objective/scores": -12.141609191894531, "policy/approxkl_avg": 1.312865975933164e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8934602737426758, "step": 582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9998888373374939, "val/ratio_var": NaN }, { "episode": 583, "epoch": 0.10948356807511737, "eps": 0, "loss/policy_avg": 7.683375770284329e-06, "loss/value_avg": 0.9366015791893005, "lr": 1.254e-06, "objective/entropy": 88.51202392578125, "objective/kl": 33.93738555908203, "objective/non_score_reward": -1.6968693733215332, "objective/rlhf_reward": -12.438383102416992, "objective/scores": -10.741514205932617, "policy/approxkl_avg": 9.21395937325542e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.729441523551941, "step": 583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000202655792236, "val/ratio_var": NaN }, { "episode": 584, "epoch": 0.10967136150234742, "eps": 0, "loss/policy_avg": -7.548421854153275e-05, "loss/value_avg": 1.2808749675750732, "lr": 1.251e-06, "objective/entropy": 88.24446105957031, "objective/kl": 16.773902893066406, "objective/non_score_reward": -0.8386951684951782, "objective/rlhf_reward": -10.608843803405762, "objective/scores": -9.770148277282715, "policy/approxkl_avg": 8.546665952735566e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7298976182937622, "step": 584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999833703041077, "val/ratio_var": NaN }, { "episode": 585, "epoch": 0.10985915492957747, "eps": 0, "loss/policy_avg": -4.7557758080074564e-05, "loss/value_avg": 0.09512297064065933, "lr": 1.248e-06, "objective/entropy": 71.7193603515625, "objective/kl": 14.984000205993652, "objective/non_score_reward": -0.7492000460624695, "objective/rlhf_reward": -11.565408706665039, "objective/scores": -10.816208839416504, "policy/approxkl_avg": 9.495313690877083e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4242035150527954, "step": 585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999650120735168, "val/ratio_var": NaN }, { "episode": 586, "epoch": 0.1100469483568075, "eps": 0, "loss/policy_avg": -1.661732494540047e-05, "loss/value_avg": 0.1452462524175644, "lr": 1.245e-06, "objective/entropy": 99.74517059326172, "objective/kl": 9.083871841430664, "objective/non_score_reward": -0.4541935920715332, "objective/rlhf_reward": -10.944723129272461, "objective/scores": -10.490530014038086, "policy/approxkl_avg": 4.6482817595006054e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6673814058303833, "step": 586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999928891658783, "val/ratio_var": NaN }, { "episode": 587, "epoch": 0.11023474178403755, "eps": 0, "loss/policy_avg": 5.62442910450045e-05, "loss/value_avg": 1.740000605583191, "lr": 1.242e-06, "objective/entropy": 107.32508850097656, "objective/kl": 19.59298324584961, "objective/non_score_reward": -0.9796491861343384, "objective/rlhf_reward": -11.460145950317383, "objective/scores": -10.480496406555176, "policy/approxkl_avg": 1.0184855625539058e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.147336483001709, "step": 587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999571442604065, "val/ratio_var": NaN }, { "episode": 588, "epoch": 0.1104225352112676, "eps": 0, "loss/policy_avg": -6.416833639377728e-05, "loss/value_avg": 0.10407262295484543, "lr": 1.239e-06, "objective/entropy": 121.1346664428711, "objective/kl": 7.980755805969238, "objective/non_score_reward": -0.39903783798217773, "objective/rlhf_reward": -11.241436004638672, "objective/scores": -10.842398643493652, "policy/approxkl_avg": 1.2373821789424255e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9945759773254395, "step": 588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999493360519409, "val/ratio_var": NaN }, { "episode": 589, "epoch": 0.11061032863849765, "eps": 0, "loss/policy_avg": -5.396357300924137e-05, "loss/value_avg": 0.6106168627738953, "lr": 1.236e-06, "objective/entropy": 116.49996948242188, "objective/kl": 16.547195434570312, "objective/non_score_reward": -0.8273598551750183, "objective/rlhf_reward": -8.885750770568848, "objective/scores": -8.058390617370605, "policy/approxkl_avg": 6.134651897582444e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9065240621566772, "step": 589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999092817306519, "val/ratio_var": NaN }, { "episode": 590, "epoch": 0.1107981220657277, "eps": 0, "loss/policy_avg": -0.000122772078611888, "loss/value_avg": 3.829298973083496, "lr": 1.2329999999999999e-06, "objective/entropy": 112.59180450439453, "objective/kl": 56.99951171875, "objective/non_score_reward": -2.8499755859375, "objective/rlhf_reward": -5.983833312988281, "objective/scores": -3.1338577270507812, "policy/approxkl_avg": 1.3314213731518976e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3494181632995605, "step": 590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000598430633545, "val/ratio_var": NaN }, { "episode": 591, "epoch": 0.11098591549295775, "eps": 0, "loss/policy_avg": -3.58617544407025e-05, "loss/value_avg": 0.845973014831543, "lr": 1.2299999999999999e-06, "objective/entropy": 94.89356994628906, "objective/kl": 22.252241134643555, "objective/non_score_reward": -1.112612009048462, "objective/rlhf_reward": -10.579236030578613, "objective/scores": -9.46662425994873, "policy/approxkl_avg": 1.2148645112119993e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8760825395584106, "step": 591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998682141304016, "val/ratio_var": NaN }, { "episode": 592, "epoch": 0.1111737089201878, "eps": 0, "loss/policy_avg": -1.2019894711556844e-05, "loss/value_avg": 1.0060526132583618, "lr": 1.2269999999999999e-06, "objective/entropy": 62.852821350097656, "objective/kl": 15.283124923706055, "objective/non_score_reward": -0.7641563415527344, "objective/rlhf_reward": -11.040963172912598, "objective/scores": -10.276806831359863, "policy/approxkl_avg": 9.375926168786464e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2691673040390015, "step": 592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999628067016602, "val/ratio_var": NaN }, { "episode": 593, "epoch": 0.11136150234741785, "eps": 0, "loss/policy_avg": 4.97799992444925e-05, "loss/value_avg": 0.7814868688583374, "lr": 1.224e-06, "objective/entropy": 121.14610290527344, "objective/kl": 21.403776168823242, "objective/non_score_reward": -1.0701887607574463, "objective/rlhf_reward": -10.663854598999023, "objective/scores": -9.593666076660156, "policy/approxkl_avg": 8.31698017123017e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.279937267303467, "step": 593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999575614929199, "val/ratio_var": NaN }, { "episode": 594, "epoch": 0.1115492957746479, "eps": 0, "loss/policy_avg": -8.002766662684735e-06, "loss/value_avg": 0.20997440814971924, "lr": 1.221e-06, "objective/entropy": 91.71583557128906, "objective/kl": 18.56287384033203, "objective/non_score_reward": -0.9281437397003174, "objective/rlhf_reward": -10.264030456542969, "objective/scores": -9.33588695526123, "policy/approxkl_avg": 6.39908961375113e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.53899347782135, "step": 594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000042915344238, "val/ratio_var": NaN }, { "episode": 595, "epoch": 0.11173708920187793, "eps": 0, "loss/policy_avg": -0.00013280364510137588, "loss/value_avg": 1.7505067586898804, "lr": 1.218e-06, "objective/entropy": 123.4354019165039, "objective/kl": 34.510459899902344, "objective/non_score_reward": -1.7255232334136963, "objective/rlhf_reward": -12.507040023803711, "objective/scores": -10.781517028808594, "policy/approxkl_avg": 2.0388361576806346e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9443705081939697, "step": 595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000193119049072, "val/ratio_var": NaN }, { "episode": 596, "epoch": 0.11192488262910798, "eps": 0, "loss/policy_avg": 2.2960159185458906e-05, "loss/value_avg": 1.1628048419952393, "lr": 1.215e-06, "objective/entropy": 107.21800231933594, "objective/kl": 12.325201034545898, "objective/non_score_reward": -0.6162601113319397, "objective/rlhf_reward": -11.493751525878906, "objective/scores": -10.877490997314453, "policy/approxkl_avg": 1.7702841148548032e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.059298515319824, "step": 596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000897645950317, "val/ratio_var": NaN }, { "episode": 597, "epoch": 0.11211267605633803, "eps": 0, "loss/policy_avg": 1.9658286873891484e-06, "loss/value_avg": 1.908244252204895, "lr": 1.2120000000000002e-06, "objective/entropy": 50.52460861206055, "objective/kl": 18.052474975585938, "objective/non_score_reward": -0.9026238322257996, "objective/rlhf_reward": -11.740630149841309, "objective/scores": -10.838006019592285, "policy/approxkl_avg": 9.120188337874424e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0673781633377075, "step": 597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000462532043457, "val/ratio_var": NaN }, { "episode": 598, "epoch": 0.11230046948356807, "eps": 0, "loss/policy_avg": -1.1174183782713953e-05, "loss/value_avg": 0.11824294179677963, "lr": 1.2090000000000002e-06, "objective/entropy": 98.7296371459961, "objective/kl": 17.251724243164062, "objective/non_score_reward": -0.8625862002372742, "objective/rlhf_reward": -12.432256698608398, "objective/scores": -11.569670677185059, "policy/approxkl_avg": 7.505473575974975e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.122088670730591, "step": 598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999496340751648, "val/ratio_var": NaN }, { "episode": 599, "epoch": 0.11248826291079812, "eps": 0, "loss/policy_avg": 1.114719361794414e-05, "loss/value_avg": 1.4963290691375732, "lr": 1.2060000000000002e-06, "objective/entropy": 116.14815521240234, "objective/kl": 33.772857666015625, "objective/non_score_reward": -1.6886430978775024, "objective/rlhf_reward": -11.847848892211914, "objective/scores": -10.159205436706543, "policy/approxkl_avg": 8.617880808969858e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.037984609603882, "step": 599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999610185623169, "val/ratio_var": NaN }, { "episode": 600, "epoch": 0.11267605633802817, "eps": 0, "loss/policy_avg": 7.069785351632163e-05, "loss/value_avg": 1.650356411933899, "lr": 1.2030000000000002e-06, "objective/entropy": 73.7842025756836, "objective/kl": 12.519055366516113, "objective/non_score_reward": -0.6259527802467346, "objective/rlhf_reward": -10.210476875305176, "objective/scores": -9.584524154663086, "policy/approxkl_avg": 7.282959302301606e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4966487884521484, "step": 600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999970555305481, "val/ratio_var": NaN }, { "episode": 601, "epoch": 0.11286384976525822, "eps": 0, "loss/policy_avg": 1.8236771211377345e-05, "loss/value_avg": 1.2351338863372803, "lr": 1.2000000000000002e-06, "objective/entropy": 102.0458755493164, "objective/kl": 37.86832809448242, "objective/non_score_reward": -1.8934166431427002, "objective/rlhf_reward": -11.157910346984863, "objective/scores": -9.264493942260742, "policy/approxkl_avg": 9.094209474369563e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8656398057937622, "step": 601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998806715011597, "val/ratio_var": NaN }, { "episode": 602, "epoch": 0.11305164319248827, "eps": 0, "loss/policy_avg": -3.289951564511284e-05, "loss/value_avg": 0.578690230846405, "lr": 1.1970000000000001e-06, "objective/entropy": 115.1370849609375, "objective/kl": 28.076948165893555, "objective/non_score_reward": -1.4038474559783936, "objective/rlhf_reward": -11.867419242858887, "objective/scores": -10.463571548461914, "policy/approxkl_avg": 8.212927582462726e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.078498125076294, "step": 602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000289678573608, "val/ratio_var": NaN }, { "episode": 603, "epoch": 0.1132394366197183, "eps": 0, "loss/policy_avg": -4.9125472287414595e-05, "loss/value_avg": 2.011223316192627, "lr": 1.1940000000000001e-06, "objective/entropy": 81.45354461669922, "objective/kl": 27.819557189941406, "objective/non_score_reward": -1.3909778594970703, "objective/rlhf_reward": -6.333910942077637, "objective/scores": -4.942933082580566, "policy/approxkl_avg": 1.1022096657598013e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6406711339950562, "step": 603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000290870666504, "val/ratio_var": NaN }, { "episode": 604, "epoch": 0.11342723004694835, "eps": 0, "loss/policy_avg": -9.428780322195962e-05, "loss/value_avg": 0.4560309648513794, "lr": 1.1910000000000001e-06, "objective/entropy": 120.36141204833984, "objective/kl": 25.31302833557129, "objective/non_score_reward": -1.2656514644622803, "objective/rlhf_reward": -10.862044334411621, "objective/scores": -9.596392631530762, "policy/approxkl_avg": 1.5062859404224582e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9583280086517334, "step": 604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000392198562622, "val/ratio_var": NaN }, { "episode": 605, "epoch": 0.1136150234741784, "eps": 0, "loss/policy_avg": 2.15341460716445e-05, "loss/value_avg": 0.8600575923919678, "lr": 1.188e-06, "objective/entropy": 138.6592559814453, "objective/kl": 27.95821762084961, "objective/non_score_reward": -1.3979109525680542, "objective/rlhf_reward": -11.921324729919434, "objective/scores": -10.52341365814209, "policy/approxkl_avg": 7.600883122904634e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.842043399810791, "step": 605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999897480010986, "val/ratio_var": NaN }, { "episode": 606, "epoch": 0.11380281690140845, "eps": 0, "loss/policy_avg": -8.341950888279825e-05, "loss/value_avg": 0.922037661075592, "lr": 1.185e-06, "objective/entropy": 92.87898254394531, "objective/kl": 26.83063507080078, "objective/non_score_reward": -1.3415316343307495, "objective/rlhf_reward": -11.361597061157227, "objective/scores": -10.020065307617188, "policy/approxkl_avg": 6.247417871918515e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8176430463790894, "step": 606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000202655792236, "val/ratio_var": NaN }, { "episode": 607, "epoch": 0.1139906103286385, "eps": 0, "loss/policy_avg": -8.446765423286706e-05, "loss/value_avg": 0.6765355467796326, "lr": 1.182e-06, "objective/entropy": 138.92449951171875, "objective/kl": 20.945518493652344, "objective/non_score_reward": -1.0472759008407593, "objective/rlhf_reward": -10.490041732788086, "objective/scores": -9.442766189575195, "policy/approxkl_avg": 7.913904909173652e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.385882616043091, "step": 607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999513626098633, "val/ratio_var": NaN }, { "episode": 608, "epoch": 0.11417840375586855, "eps": 0, "loss/policy_avg": 1.0854792890313547e-05, "loss/value_avg": 0.7005990743637085, "lr": 1.179e-06, "objective/entropy": 104.17340087890625, "objective/kl": 36.488704681396484, "objective/non_score_reward": -1.8244352340698242, "objective/rlhf_reward": -10.338783264160156, "objective/scores": -8.514348030090332, "policy/approxkl_avg": 3.913143942213537e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8011152744293213, "step": 608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999591112136841, "val/ratio_var": NaN }, { "episode": 609, "epoch": 0.1143661971830986, "eps": 0, "loss/policy_avg": 9.374798537464812e-06, "loss/value_avg": 2.0272066593170166, "lr": 1.176e-06, "objective/entropy": 74.78363037109375, "objective/kl": 26.971664428710938, "objective/non_score_reward": -1.3485832214355469, "objective/rlhf_reward": -12.540300369262695, "objective/scores": -11.191717147827148, "policy/approxkl_avg": 7.643169652737924e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5997751951217651, "step": 609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999736547470093, "val/ratio_var": NaN }, { "episode": 610, "epoch": 0.11455399061032864, "eps": 0, "loss/policy_avg": 2.8344820748316124e-05, "loss/value_avg": 0.43741777539253235, "lr": 1.173e-06, "objective/entropy": 99.23926544189453, "objective/kl": 35.05697250366211, "objective/non_score_reward": -1.7528486251831055, "objective/rlhf_reward": -10.23187255859375, "objective/scores": -8.479023933410645, "policy/approxkl_avg": 1.0570307296120518e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0960693359375, "step": 610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000295639038086, "val/ratio_var": NaN }, { "episode": 611, "epoch": 0.11474178403755869, "eps": 0, "loss/policy_avg": 0.0001213865471072495, "loss/value_avg": 0.9677199125289917, "lr": 1.17e-06, "objective/entropy": 133.81507873535156, "objective/kl": 27.63768196105957, "objective/non_score_reward": -1.3818840980529785, "objective/rlhf_reward": -11.748819351196289, "objective/scores": -10.366935729980469, "policy/approxkl_avg": 1.0519256932184362e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.985609531402588, "step": 611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999110698699951, "val/ratio_var": NaN }, { "episode": 612, "epoch": 0.11492957746478873, "eps": 0, "loss/policy_avg": -5.0421029300196096e-05, "loss/value_avg": 0.4162563681602478, "lr": 1.167e-06, "objective/entropy": 125.15415954589844, "objective/kl": 32.03522491455078, "objective/non_score_reward": -1.6017613410949707, "objective/rlhf_reward": -11.643844604492188, "objective/scores": -10.042082786560059, "policy/approxkl_avg": 2.700203936001344e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1958813667297363, "step": 612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999555945396423, "val/ratio_var": NaN }, { "episode": 613, "epoch": 0.11511737089201877, "eps": 0, "loss/policy_avg": -1.8389719116385095e-05, "loss/value_avg": 0.7007461190223694, "lr": 1.164e-06, "objective/entropy": 95.66778564453125, "objective/kl": 17.954204559326172, "objective/non_score_reward": -0.8977103233337402, "objective/rlhf_reward": -10.731267929077148, "objective/scores": -9.83355712890625, "policy/approxkl_avg": 8.483547020432525e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.638248085975647, "step": 613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999884366989136, "val/ratio_var": NaN }, { "episode": 614, "epoch": 0.11530516431924882, "eps": 0, "loss/policy_avg": 2.8448284865589812e-05, "loss/value_avg": 0.12666356563568115, "lr": 1.161e-06, "objective/entropy": 127.95281982421875, "objective/kl": 16.824581146240234, "objective/non_score_reward": -0.8412290215492249, "objective/rlhf_reward": -11.877168655395508, "objective/scores": -11.03593921661377, "policy/approxkl_avg": 1.0402968086964393e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0934574604034424, "step": 614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999482035636902, "val/ratio_var": NaN }, { "episode": 615, "epoch": 0.11549295774647887, "eps": 0, "loss/policy_avg": -9.347807645099238e-05, "loss/value_avg": 0.6511586308479309, "lr": 1.158e-06, "objective/entropy": 81.805419921875, "objective/kl": 27.347139358520508, "objective/non_score_reward": -1.3673570156097412, "objective/rlhf_reward": -11.894957542419434, "objective/scores": -10.527600288391113, "policy/approxkl_avg": 8.040355226057727e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.803955078125, "step": 615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000083565711975, "val/ratio_var": NaN }, { "episode": 616, "epoch": 0.11568075117370892, "eps": 0, "loss/policy_avg": 6.9658713073295075e-06, "loss/value_avg": 1.0428014993667603, "lr": 1.155e-06, "objective/entropy": 70.13774871826172, "objective/kl": 38.19187927246094, "objective/non_score_reward": -1.9095940589904785, "objective/rlhf_reward": -10.900224685668945, "objective/scores": -8.990631103515625, "policy/approxkl_avg": 1.096866952821074e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7208137512207031, "step": 616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000289678573608, "val/ratio_var": NaN }, { "episode": 617, "epoch": 0.11586854460093897, "eps": 0, "loss/policy_avg": 5.435943603515625e-05, "loss/value_avg": 0.8192945718765259, "lr": 1.1520000000000002e-06, "objective/entropy": 104.65206909179688, "objective/kl": 32.42689895629883, "objective/non_score_reward": -1.6213449239730835, "objective/rlhf_reward": -11.871294021606445, "objective/scores": -10.24994945526123, "policy/approxkl_avg": 6.379372763376523e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7748908996582031, "step": 617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999468922615051, "val/ratio_var": NaN }, { "episode": 618, "epoch": 0.11605633802816902, "eps": 0, "loss/policy_avg": -2.2348367565427907e-05, "loss/value_avg": 1.4020696878433228, "lr": 1.1490000000000001e-06, "objective/entropy": 67.62974548339844, "objective/kl": 25.11774253845215, "objective/non_score_reward": -1.2558872699737549, "objective/rlhf_reward": -9.139139175415039, "objective/scores": -7.883252143859863, "policy/approxkl_avg": 4.596734370920785e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2213020324707031, "step": 618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999163746833801, "val/ratio_var": NaN }, { "episode": 619, "epoch": 0.11624413145539907, "eps": 0, "loss/policy_avg": -0.00011292943236185238, "loss/value_avg": 0.9130905270576477, "lr": 1.1460000000000001e-06, "objective/entropy": 78.07560729980469, "objective/kl": 26.949508666992188, "objective/non_score_reward": -1.3474754095077515, "objective/rlhf_reward": -9.077205657958984, "objective/scores": -7.729730606079102, "policy/approxkl_avg": 7.991868500312194e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7433621883392334, "step": 619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000156164169312, "val/ratio_var": NaN }, { "episode": 620, "epoch": 0.11643192488262911, "eps": 0, "loss/policy_avg": 6.956199649721384e-05, "loss/value_avg": 0.5198440551757812, "lr": 1.1430000000000001e-06, "objective/entropy": 76.08948516845703, "objective/kl": 37.11058807373047, "objective/non_score_reward": -1.8555293083190918, "objective/rlhf_reward": -10.819272994995117, "objective/scores": -8.963743209838867, "policy/approxkl_avg": 7.423750503221527e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.858872890472412, "step": 620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999975562095642, "val/ratio_var": NaN }, { "episode": 621, "epoch": 0.11661971830985915, "eps": 0, "loss/policy_avg": -1.4352348443935625e-05, "loss/value_avg": 0.9663742184638977, "lr": 1.14e-06, "objective/entropy": 71.14654541015625, "objective/kl": 26.131053924560547, "objective/non_score_reward": -1.3065526485443115, "objective/rlhf_reward": -11.478742599487305, "objective/scores": -10.172189712524414, "policy/approxkl_avg": 5.123061797007722e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3747282028198242, "step": 621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0001040697097778, "val/ratio_var": NaN }, { "episode": 622, "epoch": 0.1168075117370892, "eps": 0, "loss/policy_avg": 7.51423358451575e-05, "loss/value_avg": 0.969784140586853, "lr": 1.137e-06, "objective/entropy": 116.23619079589844, "objective/kl": 51.30542755126953, "objective/non_score_reward": -2.5652711391448975, "objective/rlhf_reward": -12.00180721282959, "objective/scores": -9.436535835266113, "policy/approxkl_avg": 1.6926428259012027e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.150766134262085, "step": 622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999911189079285, "val/ratio_var": NaN }, { "episode": 623, "epoch": 0.11699530516431925, "eps": 0, "loss/policy_avg": 7.190344331320375e-05, "loss/value_avg": 0.5514601469039917, "lr": 1.134e-06, "objective/entropy": 68.7391128540039, "objective/kl": 20.529993057250977, "objective/non_score_reward": -1.0264997482299805, "objective/rlhf_reward": -10.07262897491455, "objective/scores": -9.04612922668457, "policy/approxkl_avg": 8.452934707747772e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5621522665023804, "step": 623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999138116836548, "val/ratio_var": NaN }, { "episode": 624, "epoch": 0.1171830985915493, "eps": 0, "loss/policy_avg": -3.936155917472206e-05, "loss/value_avg": 0.8891146183013916, "lr": 1.131e-06, "objective/entropy": 108.68719482421875, "objective/kl": 21.22992706298828, "objective/non_score_reward": -1.0614964962005615, "objective/rlhf_reward": -12.619478225708008, "objective/scores": -11.557981491088867, "policy/approxkl_avg": 1.0669266714558034e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.130718946456909, "step": 624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999022483825684, "val/ratio_var": NaN }, { "episode": 625, "epoch": 0.11737089201877934, "eps": 0, "loss/policy_avg": -5.9465193771757185e-05, "loss/value_avg": 0.5068313479423523, "lr": 1.128e-06, "objective/entropy": 98.85600280761719, "objective/kl": 29.4016056060791, "objective/non_score_reward": -1.4700802564620972, "objective/rlhf_reward": -11.02827262878418, "objective/scores": -9.558192253112793, "policy/approxkl_avg": 4.152946786462053e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7946984767913818, "step": 625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999772906303406, "val/ratio_var": NaN }, { "episode": 626, "epoch": 0.11755868544600939, "eps": 0, "loss/policy_avg": 1.0796312608363223e-06, "loss/value_avg": 0.6389533877372742, "lr": 1.125e-06, "objective/entropy": 109.19512939453125, "objective/kl": 39.800411224365234, "objective/non_score_reward": -1.9900206327438354, "objective/rlhf_reward": -11.414934158325195, "objective/scores": -9.42491340637207, "policy/approxkl_avg": 5.593495799871562e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7643859386444092, "step": 626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000361204147339, "val/ratio_var": NaN }, { "episode": 627, "epoch": 0.11774647887323944, "eps": 0, "loss/policy_avg": 1.4413078133657109e-05, "loss/value_avg": 0.42930030822753906, "lr": 1.122e-06, "objective/entropy": 63.78059768676758, "objective/kl": 23.846229553222656, "objective/non_score_reward": -1.1923115253448486, "objective/rlhf_reward": -9.910884857177734, "objective/scores": -8.718573570251465, "policy/approxkl_avg": 6.691438869665944e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5004076957702637, "step": 627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999830722808838, "val/ratio_var": NaN }, { "episode": 628, "epoch": 0.11793427230046949, "eps": 0, "loss/policy_avg": -6.136354204500094e-05, "loss/value_avg": 0.7287080883979797, "lr": 1.119e-06, "objective/entropy": 73.69166564941406, "objective/kl": 26.095308303833008, "objective/non_score_reward": -1.3047654628753662, "objective/rlhf_reward": -9.612781524658203, "objective/scores": -8.308015823364258, "policy/approxkl_avg": 6.051916301430538e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5225921869277954, "step": 628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000088214874268, "val/ratio_var": NaN }, { "episode": 629, "epoch": 0.11812206572769952, "eps": 0, "loss/policy_avg": 3.1597210181644186e-05, "loss/value_avg": 0.391741007566452, "lr": 1.116e-06, "objective/entropy": 66.8505630493164, "objective/kl": 28.82988739013672, "objective/non_score_reward": -1.4414944648742676, "objective/rlhf_reward": -10.849529266357422, "objective/scores": -9.408035278320312, "policy/approxkl_avg": 6.68396538117122e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5115644931793213, "step": 629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000030279159546, "val/ratio_var": NaN }, { "episode": 630, "epoch": 0.11830985915492957, "eps": 0, "loss/policy_avg": -3.688740252982825e-05, "loss/value_avg": 0.4427209496498108, "lr": 1.113e-06, "objective/entropy": 99.06431579589844, "objective/kl": 37.79683303833008, "objective/non_score_reward": -1.8898415565490723, "objective/rlhf_reward": -11.671003341674805, "objective/scores": -9.78116226196289, "policy/approxkl_avg": 7.870269769227889e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.041289806365967, "step": 630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998456835746765, "val/ratio_var": NaN }, { "episode": 631, "epoch": 0.11849765258215962, "eps": 0, "loss/policy_avg": 8.949243783717975e-05, "loss/value_avg": 0.7287795543670654, "lr": 1.11e-06, "objective/entropy": 76.01678466796875, "objective/kl": 27.66084098815918, "objective/non_score_reward": -1.3830419778823853, "objective/rlhf_reward": -12.067522048950195, "objective/scores": -10.684479713439941, "policy/approxkl_avg": 7.969354243186899e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5633522272109985, "step": 631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000282526016235, "val/ratio_var": NaN }, { "episode": 632, "epoch": 0.11868544600938967, "eps": 0, "loss/policy_avg": -3.780508995987475e-05, "loss/value_avg": 0.6389189958572388, "lr": 1.107e-06, "objective/entropy": 85.04682159423828, "objective/kl": 24.2257080078125, "objective/non_score_reward": -1.2112854719161987, "objective/rlhf_reward": -10.334474563598633, "objective/scores": -9.123188972473145, "policy/approxkl_avg": 7.713362748518193e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4945851564407349, "step": 632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000754594802856, "val/ratio_var": NaN }, { "episode": 633, "epoch": 0.11887323943661972, "eps": 0, "loss/policy_avg": 1.99911737581715e-05, "loss/value_avg": 0.5986186861991882, "lr": 1.104e-06, "objective/entropy": 109.6605224609375, "objective/kl": 27.72635269165039, "objective/non_score_reward": -1.386317491531372, "objective/rlhf_reward": -11.137389183044434, "objective/scores": -9.75107192993164, "policy/approxkl_avg": 7.324167938804749e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0822086334228516, "step": 633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999651908874512, "val/ratio_var": NaN }, { "episode": 634, "epoch": 0.11906103286384977, "eps": 0, "loss/policy_avg": -6.974417919991538e-05, "loss/value_avg": 0.5946488976478577, "lr": 1.101e-06, "objective/entropy": 24.526823043823242, "objective/kl": 33.90353012084961, "objective/non_score_reward": -1.695176601409912, "objective/rlhf_reward": -10.777315139770508, "objective/scores": -9.082138061523438, "policy/approxkl_avg": 1.6813734404763636e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.47868606448173523, "step": 634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000004768371582, "val/ratio_var": NaN }, { "episode": 635, "epoch": 0.11924882629107982, "eps": 0, "loss/policy_avg": 5.9100817452417687e-05, "loss/value_avg": 0.7899169921875, "lr": 1.098e-06, "objective/entropy": 52.558624267578125, "objective/kl": 24.847576141357422, "objective/non_score_reward": -1.242378830909729, "objective/rlhf_reward": -8.643941879272461, "objective/scores": -7.4015631675720215, "policy/approxkl_avg": 8.746950186377944e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2094277143478394, "step": 635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000373125076294, "val/ratio_var": NaN }, { "episode": 636, "epoch": 0.11943661971830986, "eps": 0, "loss/policy_avg": -1.2933082871313673e-05, "loss/value_avg": 1.12809157371521, "lr": 1.095e-06, "objective/entropy": 71.12615203857422, "objective/kl": 26.294586181640625, "objective/non_score_reward": -1.3147294521331787, "objective/rlhf_reward": -10.902179718017578, "objective/scores": -9.58745002746582, "policy/approxkl_avg": 9.756524121939947e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0728967189788818, "step": 636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000529289245605, "val/ratio_var": NaN }, { "episode": 637, "epoch": 0.11962441314553991, "eps": 0, "loss/policy_avg": -5.9438199969008565e-05, "loss/value_avg": 0.3442104160785675, "lr": 1.092e-06, "objective/entropy": 82.84873962402344, "objective/kl": 21.384265899658203, "objective/non_score_reward": -1.0692132711410522, "objective/rlhf_reward": -10.56593132019043, "objective/scores": -9.496718406677246, "policy/approxkl_avg": 9.833616587684446e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5652799606323242, "step": 637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000933408737183, "val/ratio_var": NaN }, { "episode": 638, "epoch": 0.11981220657276995, "eps": 0, "loss/policy_avg": 0.0001134962440119125, "loss/value_avg": 1.7857812643051147, "lr": 1.089e-06, "objective/entropy": 66.18215942382812, "objective/kl": 48.5030403137207, "objective/non_score_reward": -2.425152063369751, "objective/rlhf_reward": -7.410731315612793, "objective/scores": -4.985579013824463, "policy/approxkl_avg": 1.1906335828371084e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1372300386428833, "step": 638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998435378074646, "val/ratio_var": NaN }, { "episode": 639, "epoch": 0.12, "eps": 0, "loss/policy_avg": 3.265884515712969e-05, "loss/value_avg": 1.9841599464416504, "lr": 1.086e-06, "objective/entropy": 47.873138427734375, "objective/kl": 33.82026290893555, "objective/non_score_reward": -1.6910133361816406, "objective/rlhf_reward": -7.654568195343018, "objective/scores": -5.963554859161377, "policy/approxkl_avg": 6.649592876328825e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.971377968788147, "step": 639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000018835067749, "val/ratio_var": NaN }, { "episode": 640, "epoch": 0.12018779342723004, "eps": 0, "loss/policy_avg": -9.889422653941438e-05, "loss/value_avg": 0.2642765939235687, "lr": 1.083e-06, "objective/entropy": 97.03423309326172, "objective/kl": 29.63457489013672, "objective/non_score_reward": -1.4817287921905518, "objective/rlhf_reward": -12.100995063781738, "objective/scores": -10.619266510009766, "policy/approxkl_avg": 1.2053082798502146e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9081616401672363, "step": 640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000023603439331, "val/ratio_var": NaN }, { "episode": 641, "epoch": 0.12037558685446009, "eps": 0, "loss/policy_avg": -9.343984129372984e-05, "loss/value_avg": 0.8179861307144165, "lr": 1.08e-06, "objective/entropy": 67.32034301757812, "objective/kl": 27.878873825073242, "objective/non_score_reward": -1.3939437866210938, "objective/rlhf_reward": -9.722447395324707, "objective/scores": -8.328503608703613, "policy/approxkl_avg": 5.626371901712446e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1557698249816895, "step": 641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999647736549377, "val/ratio_var": NaN }, { "episode": 642, "epoch": 0.12056338028169014, "eps": 0, "loss/policy_avg": 0.00015978542796801776, "loss/value_avg": 0.6914682388305664, "lr": 1.077e-06, "objective/entropy": 96.40831756591797, "objective/kl": 26.280982971191406, "objective/non_score_reward": -1.314049243927002, "objective/rlhf_reward": -9.924421310424805, "objective/scores": -8.610371589660645, "policy/approxkl_avg": 1.801911935217504e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.854552149772644, "step": 642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000041127204895, "val/ratio_var": NaN }, { "episode": 643, "epoch": 0.12075117370892019, "eps": 0, "loss/policy_avg": -8.066645386861637e-05, "loss/value_avg": 0.8608419299125671, "lr": 1.074e-06, "objective/entropy": 123.97695922851562, "objective/kl": 32.592987060546875, "objective/non_score_reward": -1.6296496391296387, "objective/rlhf_reward": -9.08236026763916, "objective/scores": -7.4527106285095215, "policy/approxkl_avg": 1.5861697022501176e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.139302968978882, "step": 643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000158548355103, "val/ratio_var": NaN }, { "episode": 644, "epoch": 0.12093896713615024, "eps": 0, "loss/policy_avg": 4.23845267505385e-05, "loss/value_avg": 0.4493165910243988, "lr": 1.071e-06, "objective/entropy": 113.72929382324219, "objective/kl": 31.27104377746582, "objective/non_score_reward": -1.5635522603988647, "objective/rlhf_reward": -10.860352516174316, "objective/scores": -9.29680061340332, "policy/approxkl_avg": 9.856599092472607e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8983131647109985, "step": 644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998891949653625, "val/ratio_var": NaN }, { "episode": 645, "epoch": 0.12112676056338029, "eps": 0, "loss/policy_avg": -4.0171280488721095e-06, "loss/value_avg": 0.5104355812072754, "lr": 1.068e-06, "objective/entropy": 31.028196334838867, "objective/kl": 14.728726387023926, "objective/non_score_reward": -0.7364362478256226, "objective/rlhf_reward": -9.856388092041016, "objective/scores": -9.119952201843262, "policy/approxkl_avg": 1.7386792450224675e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5906659960746765, "step": 645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999260306358337, "val/ratio_var": NaN }, { "episode": 646, "epoch": 0.12131455399061034, "eps": 0, "loss/policy_avg": 5.4989221098367125e-05, "loss/value_avg": 0.8700582981109619, "lr": 1.065e-06, "objective/entropy": 102.12580871582031, "objective/kl": 28.240219116210938, "objective/non_score_reward": -1.412010908126831, "objective/rlhf_reward": -10.944013595581055, "objective/scores": -9.532002449035645, "policy/approxkl_avg": 5.4725799003563225e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8568737506866455, "step": 646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000252723693848, "val/ratio_var": NaN }, { "episode": 647, "epoch": 0.12150234741784037, "eps": 0, "loss/policy_avg": -8.155714567692485e-06, "loss/value_avg": 0.46077489852905273, "lr": 1.062e-06, "objective/entropy": 91.95164489746094, "objective/kl": 25.54942512512207, "objective/non_score_reward": -1.2774713039398193, "objective/rlhf_reward": -10.413782119750977, "objective/scores": -9.136310577392578, "policy/approxkl_avg": 6.369966598640531e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8885613679885864, "step": 647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999326467514038, "val/ratio_var": NaN }, { "episode": 648, "epoch": 0.12169014084507042, "eps": 0, "loss/policy_avg": 1.2402264474076219e-05, "loss/value_avg": 0.39760854840278625, "lr": 1.059e-06, "objective/entropy": 49.03078079223633, "objective/kl": 28.8880615234375, "objective/non_score_reward": -1.444403052330017, "objective/rlhf_reward": -10.251646041870117, "objective/scores": -8.807243347167969, "policy/approxkl_avg": 4.56683650895684e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.27129328250885, "step": 648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000118017196655, "val/ratio_var": NaN }, { "episode": 649, "epoch": 0.12187793427230047, "eps": 0, "loss/policy_avg": 9.371199848828837e-05, "loss/value_avg": 0.8697411417961121, "lr": 1.056e-06, "objective/entropy": 138.870849609375, "objective/kl": 26.925222396850586, "objective/non_score_reward": -1.3462610244750977, "objective/rlhf_reward": -11.394280433654785, "objective/scores": -10.048019409179688, "policy/approxkl_avg": 7.980589344924738e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3415966033935547, "step": 649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000020265579224, "val/ratio_var": NaN }, { "episode": 650, "epoch": 0.12206572769953052, "eps": 0, "loss/policy_avg": 3.256887794123031e-06, "loss/value_avg": 0.31204378604888916, "lr": 1.053e-06, "objective/entropy": 76.27915954589844, "objective/kl": 32.739681243896484, "objective/non_score_reward": -1.63698410987854, "objective/rlhf_reward": -10.968317031860352, "objective/scores": -9.33133316040039, "policy/approxkl_avg": 1.869112935537487e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4565844535827637, "step": 650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000181198120117, "val/ratio_var": NaN }, { "episode": 651, "epoch": 0.12225352112676056, "eps": 0, "loss/policy_avg": -7.809333510522265e-06, "loss/value_avg": 0.6247000098228455, "lr": 1.05e-06, "objective/entropy": 112.54379272460938, "objective/kl": 26.853078842163086, "objective/non_score_reward": -1.3426539897918701, "objective/rlhf_reward": -11.515996932983398, "objective/scores": -10.17334270477295, "policy/approxkl_avg": 1.0349548773547212e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7856030464172363, "step": 651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000005960464478, "val/ratio_var": NaN }, { "episode": 652, "epoch": 0.12244131455399061, "eps": 0, "loss/policy_avg": 9.911465167533606e-05, "loss/value_avg": 0.6809861660003662, "lr": 1.0469999999999999e-06, "objective/entropy": 77.92534637451172, "objective/kl": 20.046802520751953, "objective/non_score_reward": -1.0023400783538818, "objective/rlhf_reward": -11.335124969482422, "objective/scores": -10.332784652709961, "policy/approxkl_avg": 1.141907191026803e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.439029335975647, "step": 652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000001072883606, "val/ratio_var": NaN }, { "episode": 653, "epoch": 0.12262910798122066, "eps": 0, "loss/policy_avg": -9.379297125633457e-07, "loss/value_avg": 0.5294830203056335, "lr": 1.0439999999999999e-06, "objective/entropy": 130.64266967773438, "objective/kl": 31.344200134277344, "objective/non_score_reward": -1.5672099590301514, "objective/rlhf_reward": -11.117877006530762, "objective/scores": -9.550666809082031, "policy/approxkl_avg": 8.982718213701446e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.319948673248291, "step": 653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999947190284729, "val/ratio_var": NaN }, { "episode": 654, "epoch": 0.12281690140845071, "eps": 0, "loss/policy_avg": 1.6221460100496188e-05, "loss/value_avg": 0.4403526484966278, "lr": 1.0409999999999999e-06, "objective/entropy": 123.6383285522461, "objective/kl": 52.059165954589844, "objective/non_score_reward": -2.6029582023620605, "objective/rlhf_reward": -12.132709503173828, "objective/scores": -9.529751777648926, "policy/approxkl_avg": 1.6025333593461255e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.114377498626709, "step": 654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0001088380813599, "val/ratio_var": NaN }, { "episode": 655, "epoch": 0.12300469483568074, "eps": 0, "loss/policy_avg": 3.514199852361344e-05, "loss/value_avg": 0.5124677419662476, "lr": 1.0379999999999998e-06, "objective/entropy": 76.13504791259766, "objective/kl": 15.033231735229492, "objective/non_score_reward": -0.7516615986824036, "objective/rlhf_reward": -9.190814018249512, "objective/scores": -8.439152717590332, "policy/approxkl_avg": 7.97235557570275e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4504510164260864, "step": 655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000333786010742, "val/ratio_var": NaN }, { "episode": 656, "epoch": 0.1231924882629108, "eps": 0, "loss/policy_avg": 3.333361746626906e-05, "loss/value_avg": 0.2502870559692383, "lr": 1.035e-06, "objective/entropy": 87.50132751464844, "objective/kl": 46.71669387817383, "objective/non_score_reward": -2.335834503173828, "objective/rlhf_reward": -11.847869873046875, "objective/scores": -9.512035369873047, "policy/approxkl_avg": 1.7098381022151443e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.638054609298706, "step": 656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000064492225647, "val/ratio_var": NaN }, { "episode": 657, "epoch": 0.12338028169014084, "eps": 0, "loss/policy_avg": 7.9991681559477e-05, "loss/value_avg": 0.2668488621711731, "lr": 1.032e-06, "objective/entropy": 90.07421112060547, "objective/kl": 27.223886489868164, "objective/non_score_reward": -1.3611942529678345, "objective/rlhf_reward": -10.523159980773926, "objective/scores": -9.161965370178223, "policy/approxkl_avg": 7.040363669830185e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.739501953125, "step": 657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000613927841187, "val/ratio_var": NaN }, { "episode": 658, "epoch": 0.12356807511737089, "eps": 0, "loss/policy_avg": -3.242042657802813e-05, "loss/value_avg": 0.3613717555999756, "lr": 1.029e-06, "objective/entropy": 138.079833984375, "objective/kl": 34.48492431640625, "objective/non_score_reward": -1.7242462635040283, "objective/rlhf_reward": -12.832146644592285, "objective/scores": -11.107900619506836, "policy/approxkl_avg": 7.905086363280134e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.284688711166382, "step": 658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999925434589386, "val/ratio_var": NaN }, { "episode": 659, "epoch": 0.12375586854460094, "eps": 0, "loss/policy_avg": -9.660900832386687e-05, "loss/value_avg": 0.38387933373451233, "lr": 1.026e-06, "objective/entropy": 88.59703063964844, "objective/kl": 40.447349548339844, "objective/non_score_reward": -2.022367477416992, "objective/rlhf_reward": -11.661483764648438, "objective/scores": -9.639116287231445, "policy/approxkl_avg": 6.546446229549474e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.035640001296997, "step": 659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000406503677368, "val/ratio_var": NaN }, { "episode": 660, "epoch": 0.12394366197183099, "eps": 0, "loss/policy_avg": 3.2653224479872733e-06, "loss/value_avg": 0.36419785022735596, "lr": 1.0230000000000002e-06, "objective/entropy": 63.23592758178711, "objective/kl": 34.19123077392578, "objective/non_score_reward": -1.7095615863800049, "objective/rlhf_reward": -11.584578514099121, "objective/scores": -9.875017166137695, "policy/approxkl_avg": 8.15374434637306e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3349655866622925, "step": 660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000874996185303, "val/ratio_var": NaN }, { "episode": 661, "epoch": 0.12413145539906104, "eps": 0, "loss/policy_avg": 6.670322181889787e-05, "loss/value_avg": 0.25589075684547424, "lr": 1.0200000000000002e-06, "objective/entropy": 99.17604064941406, "objective/kl": 26.910770416259766, "objective/non_score_reward": -1.3455384969711304, "objective/rlhf_reward": -11.272851943969727, "objective/scores": -9.927313804626465, "policy/approxkl_avg": 8.313563881756636e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7214747667312622, "step": 661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999586939811707, "val/ratio_var": NaN }, { "episode": 662, "epoch": 0.12431924882629108, "eps": 0, "loss/policy_avg": -3.879475116264075e-05, "loss/value_avg": 0.45580416917800903, "lr": 1.0170000000000002e-06, "objective/entropy": 90.38248443603516, "objective/kl": 24.94965362548828, "objective/non_score_reward": -1.2474827766418457, "objective/rlhf_reward": -10.42814826965332, "objective/scores": -9.180665016174316, "policy/approxkl_avg": 1.4337916809381568e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.872689962387085, "step": 662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999818801879883, "val/ratio_var": NaN }, { "episode": 663, "epoch": 0.12450704225352113, "eps": 0, "loss/policy_avg": -4.000933768111281e-05, "loss/value_avg": 0.12695464491844177, "lr": 1.0140000000000002e-06, "objective/entropy": 99.82009887695312, "objective/kl": 33.20677185058594, "objective/non_score_reward": -1.6603386402130127, "objective/rlhf_reward": -11.305058479309082, "objective/scores": -9.644720077514648, "policy/approxkl_avg": 9.781508936157479e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.136463165283203, "step": 663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000395774841309, "val/ratio_var": NaN }, { "episode": 664, "epoch": 0.12469483568075117, "eps": 0, "loss/policy_avg": 3.4764128940878436e-05, "loss/value_avg": 1.117089033126831, "lr": 1.0110000000000001e-06, "objective/entropy": 119.20668029785156, "objective/kl": 30.015846252441406, "objective/non_score_reward": -1.5007922649383545, "objective/rlhf_reward": -13.645145416259766, "objective/scores": -12.144352912902832, "policy/approxkl_avg": 2.1600132527055393e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8842589855194092, "step": 664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.00010347366333, "val/ratio_var": NaN }, { "episode": 665, "epoch": 0.12488262910798122, "eps": 0, "loss/policy_avg": 4.323023676988669e-05, "loss/value_avg": 0.3300560414791107, "lr": 1.0080000000000001e-06, "objective/entropy": 87.50535583496094, "objective/kl": 30.17668914794922, "objective/non_score_reward": -1.5088346004486084, "objective/rlhf_reward": -11.797667503356934, "objective/scores": -10.288832664489746, "policy/approxkl_avg": 8.577931964737218e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6965885162353516, "step": 665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999055862426758, "val/ratio_var": NaN }, { "episode": 666, "epoch": 0.12507042253521128, "eps": 0, "loss/policy_avg": -5.1714338042074814e-05, "loss/value_avg": 1.345415472984314, "lr": 1.0050000000000001e-06, "objective/entropy": 65.3601303100586, "objective/kl": 79.52116394042969, "objective/non_score_reward": -3.976058006286621, "objective/rlhf_reward": -13.893427848815918, "objective/scores": -9.917369842529297, "policy/approxkl_avg": 9.46338261087476e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1538108587265015, "step": 666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999052882194519, "val/ratio_var": NaN }, { "episode": 667, "epoch": 0.12525821596244133, "eps": 0, "loss/policy_avg": 7.20294046914205e-05, "loss/value_avg": 0.6259029507637024, "lr": 1.002e-06, "objective/entropy": 108.81614685058594, "objective/kl": 26.23343849182129, "objective/non_score_reward": -1.3116719722747803, "objective/rlhf_reward": -12.450176239013672, "objective/scores": -11.138504028320312, "policy/approxkl_avg": 1.325189344925093e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7143232822418213, "step": 667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999950110912323, "val/ratio_var": NaN }, { "episode": 668, "epoch": 0.12544600938967135, "eps": 0, "loss/policy_avg": -3.395215389900841e-05, "loss/value_avg": 0.6215286254882812, "lr": 9.99e-07, "objective/entropy": 125.37100219726562, "objective/kl": 28.191638946533203, "objective/non_score_reward": -1.4095818996429443, "objective/rlhf_reward": -9.69089412689209, "objective/scores": -8.281311988830566, "policy/approxkl_avg": 1.1387615472813195e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1255321502685547, "step": 668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999430179595947, "val/ratio_var": NaN }, { "episode": 669, "epoch": 0.1256338028169014, "eps": 0, "loss/policy_avg": 6.296049832599238e-05, "loss/value_avg": 0.7984474897384644, "lr": 9.96e-07, "objective/entropy": 85.79435729980469, "objective/kl": 39.25971984863281, "objective/non_score_reward": -1.9629862308502197, "objective/rlhf_reward": -9.434052467346191, "objective/scores": -7.471066474914551, "policy/approxkl_avg": 3.935497261409182e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5654550790786743, "step": 669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000687837600708, "val/ratio_var": NaN }, { "episode": 670, "epoch": 0.12582159624413145, "eps": 0, "loss/policy_avg": 8.149416680680588e-05, "loss/value_avg": 0.35018518567085266, "lr": 9.93e-07, "objective/entropy": 79.69126892089844, "objective/kl": 29.47400665283203, "objective/non_score_reward": -1.4737004041671753, "objective/rlhf_reward": -11.156303405761719, "objective/scores": -9.682602882385254, "policy/approxkl_avg": 8.46866470283203e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4965705871582031, "step": 670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000072717666626, "val/ratio_var": NaN }, { "episode": 671, "epoch": 0.1260093896713615, "eps": 0, "loss/policy_avg": -4.912322310701711e-06, "loss/value_avg": 4.178527355194092, "lr": 9.9e-07, "objective/entropy": 39.163692474365234, "objective/kl": 14.284529685974121, "objective/non_score_reward": -0.7142265439033508, "objective/rlhf_reward": -3.7591538429260254, "objective/scores": -3.0449273586273193, "policy/approxkl_avg": 5.255250457025795e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7843824028968811, "step": 671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000001072883606, "val/ratio_var": NaN }, { "episode": 672, "epoch": 0.12619718309859154, "eps": 0, "loss/policy_avg": 1.545672057545744e-05, "loss/value_avg": 0.610058069229126, "lr": 9.87e-07, "objective/entropy": 94.19535064697266, "objective/kl": 17.947185516357422, "objective/non_score_reward": -0.8973594307899475, "objective/rlhf_reward": -9.305487632751465, "objective/scores": -8.408127784729004, "policy/approxkl_avg": 8.758792091612122e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.979925274848938, "step": 672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000462532043457, "val/ratio_var": NaN }, { "episode": 673, "epoch": 0.1263849765258216, "eps": 0, "loss/policy_avg": 3.7769103073515e-05, "loss/value_avg": 0.20134104788303375, "lr": 9.84e-07, "objective/entropy": 86.34614562988281, "objective/kl": 16.930896759033203, "objective/non_score_reward": -0.8465448021888733, "objective/rlhf_reward": -10.624837875366211, "objective/scores": -9.778292655944824, "policy/approxkl_avg": 1.1208891947944721e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.680355429649353, "step": 673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998750686645508, "val/ratio_var": NaN }, { "episode": 674, "epoch": 0.12657276995305164, "eps": 0, "loss/policy_avg": -3.052657484658994e-05, "loss/value_avg": 6.159263610839844, "lr": 9.81e-07, "objective/entropy": 71.6851806640625, "objective/kl": 31.67455291748047, "objective/non_score_reward": -1.5837275981903076, "objective/rlhf_reward": -2.742649555206299, "objective/scores": -1.1589219570159912, "policy/approxkl_avg": 6.403440977464925e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3023244142532349, "step": 674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000170469284058, "val/ratio_var": NaN }, { "episode": 675, "epoch": 0.1267605633802817, "eps": 0, "loss/policy_avg": -2.9298495064722374e-05, "loss/value_avg": 0.5070905685424805, "lr": 9.78e-07, "objective/entropy": 130.37442016601562, "objective/kl": 39.265113830566406, "objective/non_score_reward": -1.9632556438446045, "objective/rlhf_reward": -12.333551406860352, "objective/scores": -10.370295524597168, "policy/approxkl_avg": 6.311410061243805e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.104360818862915, "step": 675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999897480010986, "val/ratio_var": NaN }, { "episode": 676, "epoch": 0.12694835680751174, "eps": 0, "loss/policy_avg": 7.670780178159475e-05, "loss/value_avg": 0.43919938802719116, "lr": 9.75e-07, "objective/entropy": 117.07693481445312, "objective/kl": 34.929779052734375, "objective/non_score_reward": -1.74648916721344, "objective/rlhf_reward": -12.127240180969238, "objective/scores": -10.38075065612793, "policy/approxkl_avg": 1.1869450133872306e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1393167972564697, "step": 676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000144243240356, "val/ratio_var": NaN }, { "episode": 677, "epoch": 0.12713615023474178, "eps": 0, "loss/policy_avg": -5.017136572860181e-05, "loss/value_avg": 0.31192079186439514, "lr": 9.72e-07, "objective/entropy": 64.76532745361328, "objective/kl": 22.230928421020508, "objective/non_score_reward": -1.1115463972091675, "objective/rlhf_reward": -10.892926216125488, "objective/scores": -9.781379699707031, "policy/approxkl_avg": 6.903794513846151e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5871374607086182, "step": 677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000007152557373, "val/ratio_var": NaN }, { "episode": 678, "epoch": 0.12732394366197183, "eps": 0, "loss/policy_avg": -2.348197995161172e-06, "loss/value_avg": 0.3270295262336731, "lr": 9.69e-07, "objective/entropy": 76.50373840332031, "objective/kl": 30.47376251220703, "objective/non_score_reward": -1.5236880779266357, "objective/rlhf_reward": -9.983142852783203, "objective/scores": -8.459454536437988, "policy/approxkl_avg": 5.388772450487522e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.638789415359497, "step": 678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000200271606445, "val/ratio_var": NaN }, { "episode": 679, "epoch": 0.12751173708920188, "eps": 0, "loss/policy_avg": -0.00011325332161504775, "loss/value_avg": 1.0452512502670288, "lr": 9.660000000000002e-07, "objective/entropy": 72.462890625, "objective/kl": 66.36325073242188, "objective/non_score_reward": -3.318162441253662, "objective/rlhf_reward": -13.586385726928711, "objective/scores": -10.26822280883789, "policy/approxkl_avg": 1.8836381343589892e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.184810996055603, "step": 679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000462532043457, "val/ratio_var": NaN }, { "episode": 680, "epoch": 0.12769953051643193, "eps": 0, "loss/policy_avg": 1.6581338059040718e-05, "loss/value_avg": 1.0170154571533203, "lr": 9.630000000000001e-07, "objective/entropy": 40.69889831542969, "objective/kl": 27.21140480041504, "objective/non_score_reward": -1.3605701923370361, "objective/rlhf_reward": -13.458516120910645, "objective/scores": -12.097946166992188, "policy/approxkl_avg": 4.8458328905098824e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1512635946273804, "step": 680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999645352363586, "val/ratio_var": NaN }, { "episode": 681, "epoch": 0.12788732394366198, "eps": 0, "loss/policy_avg": 6.337435479508713e-05, "loss/value_avg": 0.4042023718357086, "lr": 9.600000000000001e-07, "objective/entropy": 145.6177215576172, "objective/kl": 34.25023651123047, "objective/non_score_reward": -1.7125118970870972, "objective/rlhf_reward": -11.64734935760498, "objective/scores": -9.934837341308594, "policy/approxkl_avg": 1.406243228530002e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.186652421951294, "step": 681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000555515289307, "val/ratio_var": NaN }, { "episode": 682, "epoch": 0.12807511737089203, "eps": 0, "loss/policy_avg": 0.000131301159854047, "loss/value_avg": 3.090543270111084, "lr": 9.570000000000001e-07, "objective/entropy": 100.52531433105469, "objective/kl": 47.13237380981445, "objective/non_score_reward": -2.356618642807007, "objective/rlhf_reward": -5.69344425201416, "objective/scores": -3.336825370788574, "policy/approxkl_avg": 2.7199126861887635e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0515527725219727, "step": 682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999027848243713, "val/ratio_var": NaN }, { "episode": 683, "epoch": 0.12826291079812208, "eps": 0, "loss/policy_avg": 6.114762072684243e-05, "loss/value_avg": 0.851076066493988, "lr": 9.54e-07, "objective/entropy": 143.53762817382812, "objective/kl": 33.9895133972168, "objective/non_score_reward": -1.6994757652282715, "objective/rlhf_reward": -14.063257217407227, "objective/scores": -12.363780975341797, "policy/approxkl_avg": 9.474661766262216e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2986092567443848, "step": 683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999884366989136, "val/ratio_var": NaN }, { "episode": 684, "epoch": 0.12845070422535212, "eps": 0, "loss/policy_avg": -3.697737156471703e-06, "loss/value_avg": 0.8150542378425598, "lr": 9.510000000000001e-07, "objective/entropy": 96.95284271240234, "objective/kl": 29.26599884033203, "objective/non_score_reward": -1.4632998704910278, "objective/rlhf_reward": -8.371633529663086, "objective/scores": -6.9083333015441895, "policy/approxkl_avg": 1.2095495094399666e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9376682043075562, "step": 684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999828338623047, "val/ratio_var": NaN }, { "episode": 685, "epoch": 0.12863849765258217, "eps": 0, "loss/policy_avg": -1.11381959868595e-05, "loss/value_avg": 0.3700055778026581, "lr": 9.480000000000001e-07, "objective/entropy": 63.5592041015625, "objective/kl": 42.694190979003906, "objective/non_score_reward": -2.134709596633911, "objective/rlhf_reward": -12.44439697265625, "objective/scores": -10.309687614440918, "policy/approxkl_avg": 1.0921198878577343e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7059556245803833, "step": 685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0001013278961182, "val/ratio_var": NaN }, { "episode": 686, "epoch": 0.1288262910798122, "eps": 0, "loss/policy_avg": -5.141744168213336e-06, "loss/value_avg": 0.35212624073028564, "lr": 9.450000000000001e-07, "objective/entropy": 89.4378433227539, "objective/kl": 38.214683532714844, "objective/non_score_reward": -1.9107341766357422, "objective/rlhf_reward": -11.250372886657715, "objective/scores": -9.339638710021973, "policy/approxkl_avg": 1.0137479478089517e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.99883234500885, "step": 686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999969482421875, "val/ratio_var": NaN }, { "episode": 687, "epoch": 0.12901408450704224, "eps": 0, "loss/policy_avg": -9.349606989417225e-05, "loss/value_avg": 0.8120190501213074, "lr": 9.42e-07, "objective/entropy": 32.639923095703125, "objective/kl": 53.11697006225586, "objective/non_score_reward": -2.655848503112793, "objective/rlhf_reward": -12.726944923400879, "objective/scores": -10.071096420288086, "policy/approxkl_avg": 4.715940704613786e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8894273638725281, "step": 687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999417066574097, "val/ratio_var": NaN }, { "episode": 688, "epoch": 0.1292018779342723, "eps": 0, "loss/policy_avg": 8.301240450236946e-05, "loss/value_avg": 0.6000533699989319, "lr": 9.39e-07, "objective/entropy": 96.499267578125, "objective/kl": 47.31165313720703, "objective/non_score_reward": -2.3655827045440674, "objective/rlhf_reward": -12.789856910705566, "objective/scores": -10.424274444580078, "policy/approxkl_avg": 9.009987422814447e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8483610153198242, "step": 688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999359846115112, "val/ratio_var": NaN }, { "episode": 689, "epoch": 0.12938967136150234, "eps": 0, "loss/policy_avg": -2.6005618565250188e-05, "loss/value_avg": 1.1140880584716797, "lr": 9.36e-07, "objective/entropy": 68.55485534667969, "objective/kl": 10.033607482910156, "objective/non_score_reward": -0.5016804337501526, "objective/rlhf_reward": -8.456791877746582, "objective/scores": -7.955111503601074, "policy/approxkl_avg": 3.475912890849031e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.197624921798706, "step": 689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000037670135498, "val/ratio_var": NaN }, { "episode": 690, "epoch": 0.1295774647887324, "eps": 0, "loss/policy_avg": -9.843987936619669e-05, "loss/value_avg": 0.35176071524620056, "lr": 9.33e-07, "objective/entropy": 77.99143981933594, "objective/kl": 28.079818725585938, "objective/non_score_reward": -1.4039909839630127, "objective/rlhf_reward": -10.300599098205566, "objective/scores": -8.896608352661133, "policy/approxkl_avg": 1.3416180877356965e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6704953908920288, "step": 690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000228881835938, "val/ratio_var": NaN }, { "episode": 691, "epoch": 0.12976525821596244, "eps": 0, "loss/policy_avg": -5.930774568696506e-05, "loss/value_avg": 0.3037230670452118, "lr": 9.3e-07, "objective/entropy": 82.43186950683594, "objective/kl": 36.823360443115234, "objective/non_score_reward": -1.8411681652069092, "objective/rlhf_reward": -11.622908592224121, "objective/scores": -9.781740188598633, "policy/approxkl_avg": 6.192441759367284e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4717026948928833, "step": 691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999366998672485, "val/ratio_var": NaN }, { "episode": 692, "epoch": 0.12995305164319249, "eps": 0, "loss/policy_avg": 5.315384623827413e-05, "loss/value_avg": 0.5079905986785889, "lr": 9.27e-07, "objective/entropy": 116.00983428955078, "objective/kl": 55.83284378051758, "objective/non_score_reward": -2.791642189025879, "objective/rlhf_reward": -10.841941833496094, "objective/scores": -8.050299644470215, "policy/approxkl_avg": 8.37965359323789e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.207510232925415, "step": 692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000358819961548, "val/ratio_var": NaN }, { "episode": 693, "epoch": 0.13014084507042253, "eps": 0, "loss/policy_avg": -1.4813441339356359e-05, "loss/value_avg": 0.23592044413089752, "lr": 9.24e-07, "objective/entropy": 2.512791156768799, "objective/kl": 21.98484230041504, "objective/non_score_reward": -1.0992422103881836, "objective/rlhf_reward": -10.376078605651855, "objective/scores": -9.276836395263672, "policy/approxkl_avg": 3.5240352302956524e-10, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.10830861330032349, "step": 693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999961256980896, "val/ratio_var": NaN }, { "episode": 694, "epoch": 0.13032863849765258, "eps": 0, "loss/policy_avg": -9.218251943821087e-05, "loss/value_avg": 0.37636202573776245, "lr": 9.210000000000001e-07, "objective/entropy": 111.29798126220703, "objective/kl": 41.59011459350586, "objective/non_score_reward": -2.079505681991577, "objective/rlhf_reward": -12.375267028808594, "objective/scores": -10.295761108398438, "policy/approxkl_avg": 7.253618150571128e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.023914337158203, "step": 694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999985694885254, "val/ratio_var": NaN }, { "episode": 695, "epoch": 0.13051643192488263, "eps": 0, "loss/policy_avg": 4.715289833256975e-05, "loss/value_avg": 0.6006859540939331, "lr": 9.18e-07, "objective/entropy": 78.38548278808594, "objective/kl": 29.079795837402344, "objective/non_score_reward": -1.4539897441864014, "objective/rlhf_reward": -11.019211769104004, "objective/scores": -9.565221786499023, "policy/approxkl_avg": 6.889892745221005e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3155033588409424, "step": 695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999805092811584, "val/ratio_var": NaN }, { "episode": 696, "epoch": 0.13070422535211268, "eps": 0, "loss/policy_avg": -2.6091089466717676e-07, "loss/value_avg": 0.40639322996139526, "lr": 9.15e-07, "objective/entropy": 65.39900970458984, "objective/kl": 49.998287200927734, "objective/non_score_reward": -2.4999146461486816, "objective/rlhf_reward": -11.392606735229492, "objective/scores": -8.892692565917969, "policy/approxkl_avg": 7.650334765685329e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.533023476600647, "step": 696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999703168869019, "val/ratio_var": NaN }, { "episode": 697, "epoch": 0.13089201877934273, "eps": 0, "loss/policy_avg": -8.413026807829738e-05, "loss/value_avg": 0.37719547748565674, "lr": 9.12e-07, "objective/entropy": 90.79859924316406, "objective/kl": 40.03633499145508, "objective/non_score_reward": -2.001816749572754, "objective/rlhf_reward": -11.889986038208008, "objective/scores": -9.888169288635254, "policy/approxkl_avg": 4.331257130729682e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.671644687652588, "step": 697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999828338623047, "val/ratio_var": NaN }, { "episode": 698, "epoch": 0.13107981220657278, "eps": 0, "loss/policy_avg": 9.07520079636015e-05, "loss/value_avg": 3.015502691268921, "lr": 9.09e-07, "objective/entropy": 126.66168212890625, "objective/kl": 26.40863037109375, "objective/non_score_reward": -1.3204314708709717, "objective/rlhf_reward": -5.904123306274414, "objective/scores": -4.5836920738220215, "policy/approxkl_avg": 1.9776122428538656e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1857197284698486, "step": 698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000683069229126, "val/ratio_var": NaN }, { "episode": 699, "epoch": 0.13126760563380283, "eps": 0, "loss/policy_avg": 9.03291493159486e-06, "loss/value_avg": 0.6971096992492676, "lr": 9.06e-07, "objective/entropy": 132.93026733398438, "objective/kl": 27.53096580505371, "objective/non_score_reward": -1.3765482902526855, "objective/rlhf_reward": -13.523447036743164, "objective/scores": -12.14689826965332, "policy/approxkl_avg": 8.58988045138176e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0957930088043213, "step": 699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000238418579102, "val/ratio_var": NaN }, { "episode": 700, "epoch": 0.13145539906103287, "eps": 0, "loss/policy_avg": -4.255546627973672e-06, "loss/value_avg": 2.414501190185547, "lr": 9.03e-07, "objective/entropy": 78.75846862792969, "objective/kl": 44.127464294433594, "objective/non_score_reward": -2.2063732147216797, "objective/rlhf_reward": -14.967890739440918, "objective/scores": -12.761517524719238, "policy/approxkl_avg": 1.0966170549409071e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5022456645965576, "step": 700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000773668289185, "val/ratio_var": NaN }, { "episode": 701, "epoch": 0.13164319248826292, "eps": 0, "loss/policy_avg": -8.655044439365156e-06, "loss/value_avg": 0.37525343894958496, "lr": 9e-07, "objective/entropy": 41.22405242919922, "objective/kl": 27.588314056396484, "objective/non_score_reward": -1.379415512084961, "objective/rlhf_reward": -9.581021308898926, "objective/scores": -8.201605796813965, "policy/approxkl_avg": 5.707187966663696e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0501478910446167, "step": 701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001094341278076, "val/ratio_var": NaN }, { "episode": 702, "epoch": 0.13183098591549297, "eps": 0, "loss/policy_avg": -5.827310087624937e-05, "loss/value_avg": 0.112633615732193, "lr": 8.969999999999999e-07, "objective/entropy": 2.907475471496582, "objective/kl": 24.7932186126709, "objective/non_score_reward": -1.2396610975265503, "objective/rlhf_reward": -10.085597038269043, "objective/scores": -8.845935821533203, "policy/approxkl_avg": 5.443320461040457e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.1258729249238968, "step": 702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999851584434509, "val/ratio_var": NaN }, { "episode": 703, "epoch": 0.132018779342723, "eps": 0, "loss/policy_avg": -0.00016909725673031062, "loss/value_avg": 1.3484810590744019, "lr": 8.939999999999999e-07, "objective/entropy": 100.64347839355469, "objective/kl": 30.544702529907227, "objective/non_score_reward": -1.5272350311279297, "objective/rlhf_reward": -12.923040390014648, "objective/scores": -11.395805358886719, "policy/approxkl_avg": 9.453452065599777e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8868615627288818, "step": 703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999787211418152, "val/ratio_var": NaN }, { "episode": 704, "epoch": 0.13220657276995304, "eps": 0, "loss/policy_avg": -4.741380962514086e-06, "loss/value_avg": 0.6235293745994568, "lr": 8.91e-07, "objective/entropy": 89.78260803222656, "objective/kl": 37.88606643676758, "objective/non_score_reward": -1.8943032026290894, "objective/rlhf_reward": -11.971650123596191, "objective/scores": -10.077346801757812, "policy/approxkl_avg": 8.358723135870605e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5969399213790894, "step": 704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999928891658783, "val/ratio_var": NaN }, { "episode": 705, "epoch": 0.1323943661971831, "eps": 0, "loss/policy_avg": -1.989445536310086e-06, "loss/value_avg": 0.8036093711853027, "lr": 8.88e-07, "objective/entropy": 44.98533630371094, "objective/kl": 68.38500213623047, "objective/non_score_reward": -3.419250011444092, "objective/rlhf_reward": -11.532384872436523, "objective/scores": -8.113134384155273, "policy/approxkl_avg": 4.147914367536032e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0332238674163818, "step": 705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000132322311401, "val/ratio_var": NaN }, { "episode": 706, "epoch": 0.13258215962441314, "eps": 0, "loss/policy_avg": -0.0003076679422520101, "loss/value_avg": 0.15320651233196259, "lr": 8.85e-07, "objective/entropy": 114.68499755859375, "objective/kl": 28.553749084472656, "objective/non_score_reward": -1.427687406539917, "objective/rlhf_reward": -11.078180313110352, "objective/scores": -9.650492668151855, "policy/approxkl_avg": 1.0055664461106062e-06, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0006840229034424, "step": 706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998681545257568, "val/ratio_var": NaN }, { "episode": 707, "epoch": 0.13276995305164319, "eps": 0, "loss/policy_avg": -3.893420398526359e-06, "loss/value_avg": 0.8196942806243896, "lr": 8.82e-07, "objective/entropy": 134.7110137939453, "objective/kl": 35.655914306640625, "objective/non_score_reward": -1.7827956676483154, "objective/rlhf_reward": -11.147274017333984, "objective/scores": -9.36447811126709, "policy/approxkl_avg": 1.5181664991814614e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.225710391998291, "step": 707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001040697097778, "val/ratio_var": NaN }, { "episode": 708, "epoch": 0.13295774647887323, "eps": 0, "loss/policy_avg": -1.5384746802737936e-05, "loss/value_avg": 0.4459886848926544, "lr": 8.79e-07, "objective/entropy": 118.45359802246094, "objective/kl": 57.86504364013672, "objective/non_score_reward": -2.893252372741699, "objective/rlhf_reward": -11.83791732788086, "objective/scores": -8.94466495513916, "policy/approxkl_avg": 1.316905269277413e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.263934373855591, "step": 708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999493360519409, "val/ratio_var": NaN }, { "episode": 709, "epoch": 0.13314553990610328, "eps": 0, "loss/policy_avg": -5.515116754395422e-06, "loss/value_avg": 0.19180503487586975, "lr": 8.76e-07, "objective/entropy": 8.50570011138916, "objective/kl": 28.405298233032227, "objective/non_score_reward": -1.4202648401260376, "objective/rlhf_reward": -10.291680335998535, "objective/scores": -8.871415138244629, "policy/approxkl_avg": 2.4145245802742465e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.22565963864326477, "step": 709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000137090682983, "val/ratio_var": NaN }, { "episode": 710, "epoch": 0.13333333333333333, "eps": 0, "loss/policy_avg": -9.137279266724363e-05, "loss/value_avg": 0.39444711804389954, "lr": 8.729999999999999e-07, "objective/entropy": 90.79580688476562, "objective/kl": 40.800106048583984, "objective/non_score_reward": -2.0400054454803467, "objective/rlhf_reward": -11.866226196289062, "objective/scores": -9.826220512390137, "policy/approxkl_avg": 1.2460175469186652e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.585340976715088, "step": 710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000289678573608, "val/ratio_var": NaN }, { "episode": 711, "epoch": 0.13352112676056338, "eps": 0, "loss/policy_avg": 4.025225280201994e-05, "loss/value_avg": 0.1964290738105774, "lr": 8.699999999999999e-07, "objective/entropy": 94.52426147460938, "objective/kl": 25.038833618164062, "objective/non_score_reward": -1.2519418001174927, "objective/rlhf_reward": -10.326258659362793, "objective/scores": -9.07431697845459, "policy/approxkl_avg": 1.0503944736228732e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.942009687423706, "step": 711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000060796737671, "val/ratio_var": NaN }, { "episode": 712, "epoch": 0.13370892018779343, "eps": 0, "loss/policy_avg": 7.776944403303787e-05, "loss/value_avg": 0.6525751948356628, "lr": 8.669999999999999e-07, "objective/entropy": 77.65379333496094, "objective/kl": 32.982391357421875, "objective/non_score_reward": -1.6491196155548096, "objective/rlhf_reward": -9.51500129699707, "objective/scores": -7.86588191986084, "policy/approxkl_avg": 7.931048173759336e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4294065237045288, "step": 712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000556707382202, "val/ratio_var": NaN }, { "episode": 713, "epoch": 0.13389671361502348, "eps": 0, "loss/policy_avg": 2.172757922380697e-05, "loss/value_avg": 0.6786952018737793, "lr": 8.64e-07, "objective/entropy": 118.79020690917969, "objective/kl": 64.39616394042969, "objective/non_score_reward": -3.219808578491211, "objective/rlhf_reward": -11.972475051879883, "objective/scores": -8.752666473388672, "policy/approxkl_avg": 1.309205259758528e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.331073045730591, "step": 713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000756978988647, "val/ratio_var": NaN }, { "episode": 714, "epoch": 0.13408450704225353, "eps": 0, "loss/policy_avg": -3.6882905988022685e-05, "loss/value_avg": 1.3594732284545898, "lr": 8.61e-07, "objective/entropy": 31.39244270324707, "objective/kl": 38.91824722290039, "objective/non_score_reward": -1.9459123611450195, "objective/rlhf_reward": -7.960195541381836, "objective/scores": -6.014283180236816, "policy/approxkl_avg": 4.492724059446118e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6659660935401917, "step": 714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999944567680359, "val/ratio_var": NaN }, { "episode": 715, "epoch": 0.13427230046948357, "eps": 0, "loss/policy_avg": 0.00016236754890996963, "loss/value_avg": 0.16680379211902618, "lr": 8.58e-07, "objective/entropy": 91.09342956542969, "objective/kl": 27.56725311279297, "objective/non_score_reward": -1.3783626556396484, "objective/rlhf_reward": -11.66098690032959, "objective/scores": -10.282624244689941, "policy/approxkl_avg": 1.1643520991810874e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8487595319747925, "step": 715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999987483024597, "val/ratio_var": NaN }, { "episode": 716, "epoch": 0.13446009389671362, "eps": 0, "loss/policy_avg": -1.4161163562675938e-05, "loss/value_avg": 0.3532232642173767, "lr": 8.55e-07, "objective/entropy": 3.1049721240997314, "objective/kl": 22.543970108032227, "objective/non_score_reward": -1.127198576927185, "objective/rlhf_reward": -10.039225578308105, "objective/scores": -8.912027359008789, "policy/approxkl_avg": 1.3328635972698066e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.11553610116243362, "step": 716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999913573265076, "val/ratio_var": NaN }, { "episode": 717, "epoch": 0.13464788732394367, "eps": 0, "loss/policy_avg": -2.510142849132535e-06, "loss/value_avg": 0.24775010347366333, "lr": 8.52e-07, "objective/entropy": 79.98445892333984, "objective/kl": 21.734289169311523, "objective/non_score_reward": -1.0867143869400024, "objective/rlhf_reward": -11.528664588928223, "objective/scores": -10.441949844360352, "policy/approxkl_avg": 7.64327765523376e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5115714073181152, "step": 717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999132752418518, "val/ratio_var": NaN }, { "episode": 718, "epoch": 0.13483568075117372, "eps": 0, "loss/policy_avg": -0.00011424186232034117, "loss/value_avg": 0.9245889782905579, "lr": 8.489999999999999e-07, "objective/entropy": 71.11760711669922, "objective/kl": 50.896087646484375, "objective/non_score_reward": -2.544804573059082, "objective/rlhf_reward": -14.114310264587402, "objective/scores": -11.56950569152832, "policy/approxkl_avg": 7.265999357741748e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3028126955032349, "step": 718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001468658447266, "val/ratio_var": NaN }, { "episode": 719, "epoch": 0.13502347417840377, "eps": 0, "loss/policy_avg": -6.949227099539712e-05, "loss/value_avg": 0.31295788288116455, "lr": 8.459999999999999e-07, "objective/entropy": 156.58575439453125, "objective/kl": 43.091957092285156, "objective/non_score_reward": -2.154597759246826, "objective/rlhf_reward": -12.34724235534668, "objective/scores": -10.192644119262695, "policy/approxkl_avg": 1.4332808007111453e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6442317962646484, "step": 719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999163746833801, "val/ratio_var": NaN }, { "episode": 720, "epoch": 0.1352112676056338, "eps": 0, "loss/policy_avg": 5.356770634534769e-05, "loss/value_avg": 0.37309885025024414, "lr": 8.430000000000001e-07, "objective/entropy": 116.73043823242188, "objective/kl": 28.225360870361328, "objective/non_score_reward": -1.4112681150436401, "objective/rlhf_reward": -10.44571590423584, "objective/scores": -9.03444766998291, "policy/approxkl_avg": 4.335666758947809e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8516017198562622, "step": 720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000228881835938, "val/ratio_var": NaN }, { "episode": 721, "epoch": 0.13539906103286384, "eps": 0, "loss/policy_avg": 5.9960024373140186e-05, "loss/value_avg": 0.39373356103897095, "lr": 8.400000000000001e-07, "objective/entropy": 42.31426239013672, "objective/kl": 38.97489547729492, "objective/non_score_reward": -1.9487450122833252, "objective/rlhf_reward": -12.096786499023438, "objective/scores": -10.148041725158691, "policy/approxkl_avg": 1.2582489716805867e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0481740236282349, "step": 721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999368190765381, "val/ratio_var": NaN }, { "episode": 722, "epoch": 0.1355868544600939, "eps": 0, "loss/policy_avg": -8.152790542226285e-05, "loss/value_avg": 0.5023741722106934, "lr": 8.370000000000001e-07, "objective/entropy": 96.73545837402344, "objective/kl": 34.17193603515625, "objective/non_score_reward": -1.70859694480896, "objective/rlhf_reward": -9.406135559082031, "objective/scores": -7.697538375854492, "policy/approxkl_avg": 1.0589646848302436e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8657433986663818, "step": 722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000344514846802, "val/ratio_var": NaN }, { "episode": 723, "epoch": 0.13577464788732393, "eps": 0, "loss/policy_avg": -9.491983655607328e-05, "loss/value_avg": 0.49712252616882324, "lr": 8.340000000000001e-07, "objective/entropy": 50.415016174316406, "objective/kl": 28.37612533569336, "objective/non_score_reward": -1.4188063144683838, "objective/rlhf_reward": -11.225419998168945, "objective/scores": -9.80661392211914, "policy/approxkl_avg": 1.1951703982049366e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9372811913490295, "step": 723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000556707382202, "val/ratio_var": NaN }, { "episode": 724, "epoch": 0.13596244131455398, "eps": 0, "loss/policy_avg": 4.461576463654637e-05, "loss/value_avg": 0.5016934871673584, "lr": 8.310000000000001e-07, "objective/entropy": 144.0746307373047, "objective/kl": 42.42119598388672, "objective/non_score_reward": -2.1210598945617676, "objective/rlhf_reward": -11.507575988769531, "objective/scores": -9.386515617370605, "policy/approxkl_avg": 1.6374696087950724e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2585909366607666, "step": 724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999610185623169, "val/ratio_var": NaN }, { "episode": 725, "epoch": 0.13615023474178403, "eps": 0, "loss/policy_avg": 3.1489245884586126e-05, "loss/value_avg": 0.39028793573379517, "lr": 8.280000000000001e-07, "objective/entropy": 65.27084350585938, "objective/kl": 82.99026489257812, "objective/non_score_reward": -4.149513244628906, "objective/rlhf_reward": -13.519548416137695, "objective/scores": -9.370035171508789, "policy/approxkl_avg": 1.0812242834390418e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.384334921836853, "step": 725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000594854354858, "val/ratio_var": NaN }, { "episode": 726, "epoch": 0.13633802816901408, "eps": 0, "loss/policy_avg": -3.46966517099645e-05, "loss/value_avg": 0.1909586638212204, "lr": 8.25e-07, "objective/entropy": 97.16703796386719, "objective/kl": 29.060909271240234, "objective/non_score_reward": -1.4530456066131592, "objective/rlhf_reward": -10.665982246398926, "objective/scores": -9.212936401367188, "policy/approxkl_avg": 8.820358488037527e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6216257810592651, "step": 726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000340938568115, "val/ratio_var": NaN }, { "episode": 727, "epoch": 0.13652582159624413, "eps": 0, "loss/policy_avg": -4.1768234950723127e-05, "loss/value_avg": 1.6968450546264648, "lr": 8.220000000000001e-07, "objective/entropy": 44.376739501953125, "objective/kl": 24.32321548461914, "objective/non_score_reward": -1.216160774230957, "objective/rlhf_reward": -7.488217353820801, "objective/scores": -6.272056579589844, "policy/approxkl_avg": 7.53935154307328e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8047266602516174, "step": 727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000024437904358, "val/ratio_var": NaN }, { "episode": 728, "epoch": 0.13671361502347418, "eps": 0, "loss/policy_avg": 2.3032134777167812e-05, "loss/value_avg": 0.593880832195282, "lr": 8.190000000000001e-07, "objective/entropy": 104.27099609375, "objective/kl": 54.94978713989258, "objective/non_score_reward": -2.7474892139434814, "objective/rlhf_reward": -12.006926536560059, "objective/scores": -9.259437561035156, "policy/approxkl_avg": 5.361821564520142e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.694941759109497, "step": 728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999451041221619, "val/ratio_var": NaN }, { "episode": 729, "epoch": 0.13690140845070423, "eps": 0, "loss/policy_avg": 3.6923389416188e-05, "loss/value_avg": 0.34455606341362, "lr": 8.160000000000001e-07, "objective/entropy": 102.075439453125, "objective/kl": 42.11387634277344, "objective/non_score_reward": -2.105693817138672, "objective/rlhf_reward": -12.931970596313477, "objective/scores": -10.826276779174805, "policy/approxkl_avg": 1.2612203192929883e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1155314445495605, "step": 729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000337362289429, "val/ratio_var": NaN }, { "episode": 730, "epoch": 0.13708920187793427, "eps": 0, "loss/policy_avg": -1.2297675311856437e-05, "loss/value_avg": 0.3215942978858948, "lr": 8.130000000000001e-07, "objective/entropy": 64.47718048095703, "objective/kl": 38.332611083984375, "objective/non_score_reward": -1.916630506515503, "objective/rlhf_reward": -10.094399452209473, "objective/scores": -8.17776870727539, "policy/approxkl_avg": 4.180834523026533e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5500050783157349, "step": 730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999369382858276, "val/ratio_var": NaN }, { "episode": 731, "epoch": 0.13727699530516432, "eps": 0, "loss/policy_avg": 1.560067175887525e-05, "loss/value_avg": 0.13322214782238007, "lr": 8.100000000000001e-07, "objective/entropy": 120.211669921875, "objective/kl": 33.37413024902344, "objective/non_score_reward": -1.6687067747116089, "objective/rlhf_reward": -11.085958480834961, "objective/scores": -9.417251586914062, "policy/approxkl_avg": 1.0265519989616223e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.979565978050232, "step": 731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999516606330872, "val/ratio_var": NaN }, { "episode": 732, "epoch": 0.13746478873239437, "eps": 0, "loss/policy_avg": -0.00013050268171355128, "loss/value_avg": 0.5099589228630066, "lr": 8.070000000000001e-07, "objective/entropy": 104.87541198730469, "objective/kl": 49.55988311767578, "objective/non_score_reward": -2.477994203567505, "objective/rlhf_reward": -12.004695892333984, "objective/scores": -9.526701927185059, "policy/approxkl_avg": 1.597722274482294e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3078360557556152, "step": 732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999755620956421, "val/ratio_var": NaN }, { "episode": 733, "epoch": 0.13765258215962442, "eps": 0, "loss/policy_avg": -0.00014659143926110119, "loss/value_avg": 0.5540193319320679, "lr": 8.04e-07, "objective/entropy": 90.97347259521484, "objective/kl": 45.719093322753906, "objective/non_score_reward": -2.285954713821411, "objective/rlhf_reward": -11.644071578979492, "objective/scores": -9.35811710357666, "policy/approxkl_avg": 1.0765518254629569e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7408815622329712, "step": 733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999818801879883, "val/ratio_var": NaN }, { "episode": 734, "epoch": 0.13784037558685447, "eps": 0, "loss/policy_avg": 9.698687790660188e-05, "loss/value_avg": 0.3530009388923645, "lr": 8.01e-07, "objective/entropy": 26.046646118164062, "objective/kl": 25.392988204956055, "objective/non_score_reward": -1.2696495056152344, "objective/rlhf_reward": -10.482990264892578, "objective/scores": -9.213340759277344, "policy/approxkl_avg": 1.8897186038202562e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7526878714561462, "step": 734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000014305114746, "val/ratio_var": NaN }, { "episode": 735, "epoch": 0.13802816901408452, "eps": 0, "loss/policy_avg": 3.66174936061725e-05, "loss/value_avg": 0.22751052677631378, "lr": 7.98e-07, "objective/entropy": 80.12655639648438, "objective/kl": 32.06889343261719, "objective/non_score_reward": -1.6034448146820068, "objective/rlhf_reward": -11.933274269104004, "objective/scores": -10.329829216003418, "policy/approxkl_avg": 4.9774953936321253e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3812347650527954, "step": 735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999055862426758, "val/ratio_var": NaN }, { "episode": 736, "epoch": 0.13821596244131457, "eps": 0, "loss/policy_avg": 6.143101927591488e-05, "loss/value_avg": 1.2860099077224731, "lr": 7.95e-07, "objective/entropy": 89.85298156738281, "objective/kl": 52.52735137939453, "objective/non_score_reward": -2.6263675689697266, "objective/rlhf_reward": -14.7259521484375, "objective/scores": -12.099584579467773, "policy/approxkl_avg": 5.328900343215537e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7848337888717651, "step": 736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000606775283813, "val/ratio_var": NaN }, { "episode": 737, "epoch": 0.13840375586854461, "eps": 0, "loss/policy_avg": -8.81878804648295e-05, "loss/value_avg": 0.2050207257270813, "lr": 7.920000000000001e-07, "objective/entropy": 115.64457702636719, "objective/kl": 39.846961975097656, "objective/non_score_reward": -1.9923481941223145, "objective/rlhf_reward": -11.570535659790039, "objective/scores": -9.578187942504883, "policy/approxkl_avg": 6.696258481042605e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.181168556213379, "step": 737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000018835067749, "val/ratio_var": NaN }, { "episode": 738, "epoch": 0.13859154929577464, "eps": 0, "loss/policy_avg": 2.2969155907048844e-05, "loss/value_avg": 0.6959890723228455, "lr": 7.890000000000001e-07, "objective/entropy": 26.60940933227539, "objective/kl": 27.589550018310547, "objective/non_score_reward": -1.3794775009155273, "objective/rlhf_reward": -10.384703636169434, "objective/scores": -9.005226135253906, "policy/approxkl_avg": 3.1770895247973385e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6184934377670288, "step": 738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999788403511047, "val/ratio_var": NaN }, { "episode": 739, "epoch": 0.13877934272300468, "eps": 0, "loss/policy_avg": -6.153898539196234e-06, "loss/value_avg": 0.21672630310058594, "lr": 7.860000000000001e-07, "objective/entropy": 71.1323013305664, "objective/kl": 22.352333068847656, "objective/non_score_reward": -1.1176166534423828, "objective/rlhf_reward": -10.703028678894043, "objective/scores": -9.58541202545166, "policy/approxkl_avg": 4.701084321823146e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5022525787353516, "step": 739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000709295272827, "val/ratio_var": NaN }, { "episode": 740, "epoch": 0.13896713615023473, "eps": 0, "loss/policy_avg": -3.41433405992575e-06, "loss/value_avg": 0.3237118124961853, "lr": 7.830000000000001e-07, "objective/entropy": 90.4149398803711, "objective/kl": 43.79487609863281, "objective/non_score_reward": -2.1897435188293457, "objective/rlhf_reward": -12.148208618164062, "objective/scores": -9.958464622497559, "policy/approxkl_avg": 5.2462304722666886e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5771484375, "step": 740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999632835388184, "val/ratio_var": NaN }, { "episode": 741, "epoch": 0.13915492957746478, "eps": 0, "loss/policy_avg": 3.298273804830387e-05, "loss/value_avg": 0.45383477210998535, "lr": 7.8e-07, "objective/entropy": 135.1419677734375, "objective/kl": 46.095985412597656, "objective/non_score_reward": -2.3047993183135986, "objective/rlhf_reward": -10.703387260437012, "objective/scores": -8.398588180541992, "policy/approxkl_avg": 9.97155495952029e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3508899211883545, "step": 741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000417232513428, "val/ratio_var": NaN }, { "episode": 742, "epoch": 0.13934272300469483, "eps": 0, "loss/policy_avg": 7.449455733876675e-05, "loss/value_avg": 0.16665346920490265, "lr": 7.77e-07, "objective/entropy": 98.0444107055664, "objective/kl": 30.34490394592285, "objective/non_score_reward": -1.5172451734542847, "objective/rlhf_reward": -11.510672569274902, "objective/scores": -9.993427276611328, "policy/approxkl_avg": 1.2007041050310363e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1558263301849365, "step": 742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000413656234741, "val/ratio_var": NaN }, { "episode": 743, "epoch": 0.13953051643192488, "eps": 0, "loss/policy_avg": -7.327097409870476e-05, "loss/value_avg": 0.47678491473197937, "lr": 7.74e-07, "objective/entropy": 85.23297119140625, "objective/kl": 31.633291244506836, "objective/non_score_reward": -1.5816645622253418, "objective/rlhf_reward": -9.944940567016602, "objective/scores": -8.363275527954102, "policy/approxkl_avg": 1.0989845833364598e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.853644609451294, "step": 743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999842643737793, "val/ratio_var": NaN }, { "episode": 744, "epoch": 0.13971830985915493, "eps": 0, "loss/policy_avg": -9.391443018103018e-05, "loss/value_avg": 0.21012528240680695, "lr": 7.71e-07, "objective/entropy": 90.17440032958984, "objective/kl": 31.127273559570312, "objective/non_score_reward": -1.5563637018203735, "objective/rlhf_reward": -11.428441047668457, "objective/scores": -9.872076988220215, "policy/approxkl_avg": 2.655484934166452e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.995591640472412, "step": 744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000624656677246, "val/ratio_var": NaN }, { "episode": 745, "epoch": 0.13990610328638498, "eps": 0, "loss/policy_avg": 2.330204324607621e-06, "loss/value_avg": 0.11512067168951035, "lr": 7.68e-07, "objective/entropy": 79.36967468261719, "objective/kl": 37.45073699951172, "objective/non_score_reward": -1.8725371360778809, "objective/rlhf_reward": -11.499122619628906, "objective/scores": -9.626585006713867, "policy/approxkl_avg": 7.023380987902783e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6702673435211182, "step": 745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999809265136719, "val/ratio_var": NaN }, { "episode": 746, "epoch": 0.14009389671361502, "eps": 0, "loss/policy_avg": 3.087295772274956e-05, "loss/value_avg": 0.5791235566139221, "lr": 7.65e-07, "objective/entropy": 30.718902587890625, "objective/kl": 44.64681625366211, "objective/non_score_reward": -2.2323408126831055, "objective/rlhf_reward": -10.083292961120605, "objective/scores": -7.8509521484375, "policy/approxkl_avg": 8.649641358715598e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8241980075836182, "step": 746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999992847442627, "val/ratio_var": NaN }, { "episode": 747, "epoch": 0.14028169014084507, "eps": 0, "loss/policy_avg": -2.5056442609638907e-06, "loss/value_avg": 0.33431532979011536, "lr": 7.620000000000001e-07, "objective/entropy": 120.74581909179688, "objective/kl": 34.36690902709961, "objective/non_score_reward": -1.7183455228805542, "objective/rlhf_reward": -12.928116798400879, "objective/scores": -11.209771156311035, "policy/approxkl_avg": 6.424809839700174e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9449716806411743, "step": 747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999967098236084, "val/ratio_var": NaN }, { "episode": 748, "epoch": 0.14046948356807512, "eps": 0, "loss/policy_avg": 2.4260214559035376e-05, "loss/value_avg": 4.735385417938232, "lr": 7.590000000000001e-07, "objective/entropy": 30.677396774291992, "objective/kl": 19.67165756225586, "objective/non_score_reward": -0.9835829734802246, "objective/rlhf_reward": -4.298990726470947, "objective/scores": -3.3154077529907227, "policy/approxkl_avg": 3.381392232881808e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7694000005722046, "step": 748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999951124191284, "val/ratio_var": NaN }, { "episode": 749, "epoch": 0.14065727699530517, "eps": 0, "loss/policy_avg": 3.0634537324658595e-06, "loss/value_avg": 0.24370652437210083, "lr": 7.56e-07, "objective/entropy": 42.83723449707031, "objective/kl": 29.814529418945312, "objective/non_score_reward": -1.4907264709472656, "objective/rlhf_reward": -12.178169250488281, "objective/scores": -10.687442779541016, "policy/approxkl_avg": 7.729054374294719e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.147023320198059, "step": 749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999625086784363, "val/ratio_var": NaN }, { "episode": 750, "epoch": 0.14084507042253522, "eps": 0, "loss/policy_avg": 1.8461694708094e-05, "loss/value_avg": 0.12083552032709122, "lr": 7.53e-07, "objective/entropy": 29.21025276184082, "objective/kl": 62.56989288330078, "objective/non_score_reward": -3.1284945011138916, "objective/rlhf_reward": -12.047420501708984, "objective/scores": -8.918926239013672, "policy/approxkl_avg": 1.5354434168557418e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.570400059223175, "step": 750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000447034835815, "val/ratio_var": NaN }, { "episode": 751, "epoch": 0.14103286384976527, "eps": 0, "loss/policy_avg": 2.6702880859375e-05, "loss/value_avg": 0.8490910530090332, "lr": 7.5e-07, "objective/entropy": 62.18796157836914, "objective/kl": 78.29426574707031, "objective/non_score_reward": -3.914713144302368, "objective/rlhf_reward": -14.407407760620117, "objective/scores": -10.492694854736328, "policy/approxkl_avg": 7.021552761443672e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4004091024398804, "step": 751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999164938926697, "val/ratio_var": NaN }, { "episode": 752, "epoch": 0.14122065727699532, "eps": 0, "loss/policy_avg": -8.946769958129153e-05, "loss/value_avg": 0.07890211790800095, "lr": 7.47e-07, "objective/entropy": 3.4860308170318604, "objective/kl": 20.919727325439453, "objective/non_score_reward": -1.0459864139556885, "objective/rlhf_reward": -9.751520156860352, "objective/scores": -8.705533981323242, "policy/approxkl_avg": 8.120772498898532e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.121337890625, "step": 752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999828934669495, "val/ratio_var": NaN }, { "episode": 753, "epoch": 0.14140845070422536, "eps": 0, "loss/policy_avg": 4.21506047132425e-06, "loss/value_avg": 2.140195369720459, "lr": 7.44e-07, "objective/entropy": 87.67991638183594, "objective/kl": 25.707393646240234, "objective/non_score_reward": -1.285369634628296, "objective/rlhf_reward": -6.392263412475586, "objective/scores": -5.106894016265869, "policy/approxkl_avg": 8.026389508586362e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.599543809890747, "step": 753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000308752059937, "val/ratio_var": NaN }, { "episode": 754, "epoch": 0.1415962441314554, "eps": 0, "loss/policy_avg": -2.6397547117085196e-05, "loss/value_avg": 0.25621771812438965, "lr": 7.41e-07, "objective/entropy": 116.33639526367188, "objective/kl": 46.76475143432617, "objective/non_score_reward": -2.338237762451172, "objective/rlhf_reward": -12.226609230041504, "objective/scores": -9.888371467590332, "policy/approxkl_avg": 7.975884130928534e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.075103282928467, "step": 754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000399351119995, "val/ratio_var": NaN }, { "episode": 755, "epoch": 0.14178403755868543, "eps": 0, "loss/policy_avg": -5.5223139497684315e-05, "loss/value_avg": 0.46194514632225037, "lr": 7.38e-07, "objective/entropy": 99.99825286865234, "objective/kl": 46.01616287231445, "objective/non_score_reward": -2.3008081912994385, "objective/rlhf_reward": -12.73818588256836, "objective/scores": -10.4373779296875, "policy/approxkl_avg": 1.5901379413207906e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6662851572036743, "step": 755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999783039093018, "val/ratio_var": NaN }, { "episode": 756, "epoch": 0.14197183098591548, "eps": 0, "loss/policy_avg": 2.8241354812053032e-05, "loss/value_avg": 0.6147298216819763, "lr": 7.350000000000001e-07, "objective/entropy": 111.19140625, "objective/kl": 36.930702209472656, "objective/non_score_reward": -1.8465349674224854, "objective/rlhf_reward": -9.813077926635742, "objective/scores": -7.966543197631836, "policy/approxkl_avg": 7.130980606007142e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0040974617004395, "step": 756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.00005042552948, "val/ratio_var": NaN }, { "episode": 757, "epoch": 0.14215962441314553, "eps": 0, "loss/policy_avg": -5.332478758646175e-05, "loss/value_avg": 0.4016547203063965, "lr": 7.32e-07, "objective/entropy": 118.33969116210938, "objective/kl": 56.67889404296875, "objective/non_score_reward": -2.83394455909729, "objective/rlhf_reward": -12.949411392211914, "objective/scores": -10.115467071533203, "policy/approxkl_avg": 1.0546118289767037e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1332132816314697, "step": 757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000215768814087, "val/ratio_var": NaN }, { "episode": 758, "epoch": 0.14234741784037558, "eps": 0, "loss/policy_avg": 8.59746360220015e-05, "loss/value_avg": 0.29045143723487854, "lr": 7.29e-07, "objective/entropy": 120.89865112304688, "objective/kl": 38.91841125488281, "objective/non_score_reward": -1.945920705795288, "objective/rlhf_reward": -12.83228588104248, "objective/scores": -10.886364936828613, "policy/approxkl_avg": 1.5637071726359864e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1138203144073486, "step": 758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998883605003357, "val/ratio_var": NaN }, { "episode": 759, "epoch": 0.14253521126760563, "eps": 0, "loss/policy_avg": -8.768855332164094e-05, "loss/value_avg": 0.1500551402568817, "lr": 7.26e-07, "objective/entropy": 89.01573181152344, "objective/kl": 33.30186462402344, "objective/non_score_reward": -1.6650930643081665, "objective/rlhf_reward": -11.081914901733398, "objective/scores": -9.416821479797363, "policy/approxkl_avg": 1.2841456964451936e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7023166418075562, "step": 759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000351667404175, "val/ratio_var": NaN }, { "episode": 760, "epoch": 0.14272300469483568, "eps": 0, "loss/policy_avg": 6.243867574085016e-06, "loss/value_avg": 0.32986870408058167, "lr": 7.23e-07, "objective/entropy": 88.73063659667969, "objective/kl": 37.65497589111328, "objective/non_score_reward": -1.8827488422393799, "objective/rlhf_reward": -12.029535293579102, "objective/scores": -10.1467866897583, "policy/approxkl_avg": 5.236905664673941e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5424643754959106, "step": 760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999575614929199, "val/ratio_var": NaN }, { "episode": 761, "epoch": 0.14291079812206572, "eps": 0, "loss/policy_avg": -1.5114837879082188e-05, "loss/value_avg": 0.3595779836177826, "lr": 7.2e-07, "objective/entropy": 89.28742218017578, "objective/kl": 42.900665283203125, "objective/non_score_reward": -2.145033121109009, "objective/rlhf_reward": -10.929421424865723, "objective/scores": -8.784388542175293, "policy/approxkl_avg": 9.965332026240503e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.464606523513794, "step": 761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999961853027344, "val/ratio_var": NaN }, { "episode": 762, "epoch": 0.14309859154929577, "eps": 0, "loss/policy_avg": 1.17499876068905e-05, "loss/value_avg": 0.18205831944942474, "lr": 7.17e-07, "objective/entropy": 134.32666015625, "objective/kl": 49.890865325927734, "objective/non_score_reward": -2.4945430755615234, "objective/rlhf_reward": -12.740689277648926, "objective/scores": -10.246146202087402, "policy/approxkl_avg": 1.3069643500784878e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3959801197052, "step": 762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000108003616333, "val/ratio_var": NaN }, { "episode": 763, "epoch": 0.14328638497652582, "eps": 0, "loss/policy_avg": 2.0171111827949062e-05, "loss/value_avg": 0.9521726369857788, "lr": 7.14e-07, "objective/entropy": 106.7624282836914, "objective/kl": 46.36570739746094, "objective/non_score_reward": -2.3182854652404785, "objective/rlhf_reward": -14.105640411376953, "objective/scores": -11.787355422973633, "policy/approxkl_avg": 1.0873176847780996e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.057506561279297, "step": 763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000015377998352, "val/ratio_var": NaN }, { "episode": 764, "epoch": 0.14347417840375587, "eps": 0, "loss/policy_avg": 5.565949322772212e-05, "loss/value_avg": 0.41947829723358154, "lr": 7.11e-07, "objective/entropy": 84.57792663574219, "objective/kl": 75.56053924560547, "objective/non_score_reward": -3.778027057647705, "objective/rlhf_reward": -12.051580429077148, "objective/scores": -8.273553848266602, "policy/approxkl_avg": 6.115024575592543e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5703216791152954, "step": 764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999748468399048, "val/ratio_var": NaN }, { "episode": 765, "epoch": 0.14366197183098592, "eps": 0, "loss/policy_avg": -2.811539843605715e-06, "loss/value_avg": 0.37943339347839355, "lr": 7.079999999999999e-07, "objective/entropy": 99.2389144897461, "objective/kl": 50.59626007080078, "objective/non_score_reward": -2.529813051223755, "objective/rlhf_reward": -12.462225914001465, "objective/scores": -9.932413101196289, "policy/approxkl_avg": 6.982270406297175e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.068352460861206, "step": 765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000791549682617, "val/ratio_var": NaN }, { "episode": 766, "epoch": 0.14384976525821597, "eps": 0, "loss/policy_avg": 9.224550012731925e-05, "loss/value_avg": 0.7215749025344849, "lr": 7.05e-07, "objective/entropy": 138.49673461914062, "objective/kl": 64.27469635009766, "objective/non_score_reward": -3.2137351036071777, "objective/rlhf_reward": -14.240175247192383, "objective/scores": -11.026439666748047, "policy/approxkl_avg": 1.3873027171484864e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.192385196685791, "step": 766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999915361404419, "val/ratio_var": NaN }, { "episode": 767, "epoch": 0.14403755868544602, "eps": 0, "loss/policy_avg": -5.308187155605992e-06, "loss/value_avg": 0.06334613263607025, "lr": 7.02e-07, "objective/entropy": 3.7144460678100586, "objective/kl": 22.373180389404297, "objective/non_score_reward": -1.1186591386795044, "objective/rlhf_reward": -10.0079927444458, "objective/scores": -8.889333724975586, "policy/approxkl_avg": 1.8683619928960837e-10, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.12146687507629395, "step": 767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000002145767212, "val/ratio_var": NaN }, { "episode": 768, "epoch": 0.14422535211267606, "eps": 0, "loss/policy_avg": 9.476464038016275e-05, "loss/value_avg": 3.6471965312957764, "lr": 6.990000000000001e-07, "objective/entropy": 124.86442565917969, "objective/kl": 59.86051940917969, "objective/non_score_reward": -2.9930262565612793, "objective/rlhf_reward": -17.705442428588867, "objective/scores": -14.71241569519043, "policy/approxkl_avg": 9.33390325030814e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.382434844970703, "step": 768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999142289161682, "val/ratio_var": NaN }, { "episode": 769, "epoch": 0.1444131455399061, "eps": 0, "loss/policy_avg": -1.8344735508435406e-05, "loss/value_avg": 0.3103245198726654, "lr": 6.960000000000001e-07, "objective/entropy": 113.03182983398438, "objective/kl": 42.677425384521484, "objective/non_score_reward": -2.13387131690979, "objective/rlhf_reward": -10.646512985229492, "objective/scores": -8.512641906738281, "policy/approxkl_avg": 4.946237197600567e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9804341793060303, "step": 769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000002384185791, "val/ratio_var": NaN }, { "episode": 770, "epoch": 0.14460093896713616, "eps": 0, "loss/policy_avg": 8.961389539763331e-05, "loss/value_avg": 0.4555176794528961, "lr": 6.930000000000001e-07, "objective/entropy": 90.04756927490234, "objective/kl": 42.018821716308594, "objective/non_score_reward": -2.1009411811828613, "objective/rlhf_reward": -11.128652572631836, "objective/scores": -9.027710914611816, "policy/approxkl_avg": 8.590850342216072e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7106565237045288, "step": 770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000725984573364, "val/ratio_var": NaN }, { "episode": 771, "epoch": 0.1447887323943662, "eps": 0, "loss/policy_avg": -0.00010981649393215775, "loss/value_avg": 0.20057225227355957, "lr": 6.900000000000001e-07, "objective/entropy": 116.87068176269531, "objective/kl": 24.96898651123047, "objective/non_score_reward": -1.2484493255615234, "objective/rlhf_reward": -10.835865020751953, "objective/scores": -9.58741569519043, "policy/approxkl_avg": 1.3070690840777388e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8748526573181152, "step": 771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000098943710327, "val/ratio_var": NaN }, { "episode": 772, "epoch": 0.14497652582159623, "eps": 0, "loss/policy_avg": -2.6936801077681594e-05, "loss/value_avg": 0.9632473587989807, "lr": 6.87e-07, "objective/entropy": 93.40489959716797, "objective/kl": 19.877685546875, "objective/non_score_reward": -0.9938843846321106, "objective/rlhf_reward": -8.518884658813477, "objective/scores": -7.52500057220459, "policy/approxkl_avg": 4.9074234453883037e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.824946641921997, "step": 772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000624656677246, "val/ratio_var": NaN }, { "episode": 773, "epoch": 0.14516431924882628, "eps": 0, "loss/policy_avg": -0.0001033027219818905, "loss/value_avg": 0.5013088583946228, "lr": 6.84e-07, "objective/entropy": 104.27388000488281, "objective/kl": 42.81517028808594, "objective/non_score_reward": -2.140758752822876, "objective/rlhf_reward": -10.356557846069336, "objective/scores": -8.215799331665039, "policy/approxkl_avg": 8.410054874730122e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.995546817779541, "step": 773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000534057617188, "val/ratio_var": NaN }, { "episode": 774, "epoch": 0.14535211267605633, "eps": 0, "loss/policy_avg": -4.390500635054195e-06, "loss/value_avg": 0.3887186646461487, "lr": 6.81e-07, "objective/entropy": 132.73133850097656, "objective/kl": 50.010704040527344, "objective/non_score_reward": -2.500535249710083, "objective/rlhf_reward": -12.982542037963867, "objective/scores": -10.482007026672363, "policy/approxkl_avg": 1.2011564365366212e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1319832801818848, "step": 774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999690055847168, "val/ratio_var": NaN }, { "episode": 775, "epoch": 0.14553990610328638, "eps": 0, "loss/policy_avg": -6.6577263169165235e-06, "loss/value_avg": 0.6730038523674011, "lr": 6.78e-07, "objective/entropy": 40.186561584472656, "objective/kl": 48.793212890625, "objective/non_score_reward": -2.4396605491638184, "objective/rlhf_reward": -11.230661392211914, "objective/scores": -8.791000366210938, "policy/approxkl_avg": 5.393562929612017e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9173722267150879, "step": 775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999794363975525, "val/ratio_var": NaN }, { "episode": 776, "epoch": 0.14572769953051642, "eps": 0, "loss/policy_avg": -2.6034858819912188e-05, "loss/value_avg": 0.18365034461021423, "lr": 6.75e-07, "objective/entropy": 91.74562072753906, "objective/kl": 36.92285919189453, "objective/non_score_reward": -1.8461430072784424, "objective/rlhf_reward": -11.813918113708496, "objective/scores": -9.967775344848633, "policy/approxkl_avg": 6.356975035259893e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7960481643676758, "step": 776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999578595161438, "val/ratio_var": NaN }, { "episode": 777, "epoch": 0.14591549295774647, "eps": 0, "loss/policy_avg": 0.00012957824219483882, "loss/value_avg": 1.1318289041519165, "lr": 6.72e-07, "objective/entropy": 85.65856170654297, "objective/kl": 57.090396881103516, "objective/non_score_reward": -2.854519844055176, "objective/rlhf_reward": -11.597187995910645, "objective/scores": -8.742668151855469, "policy/approxkl_avg": 1.1008561529024519e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.911434531211853, "step": 777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999962449073792, "val/ratio_var": NaN }, { "episode": 778, "epoch": 0.14610328638497652, "eps": 0, "loss/policy_avg": -2.458860217302572e-05, "loss/value_avg": 0.33466461300849915, "lr": 6.690000000000001e-07, "objective/entropy": 119.31956481933594, "objective/kl": 71.27220916748047, "objective/non_score_reward": -3.563610553741455, "objective/rlhf_reward": -13.496423721313477, "objective/scores": -9.93281364440918, "policy/approxkl_avg": 8.964170206127164e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9503289461135864, "step": 778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000321865081787, "val/ratio_var": NaN }, { "episode": 779, "epoch": 0.14629107981220657, "eps": 0, "loss/policy_avg": -1.4539034964400344e-05, "loss/value_avg": 0.24456758797168732, "lr": 6.660000000000001e-07, "objective/entropy": 41.17055892944336, "objective/kl": 30.956579208374023, "objective/non_score_reward": -1.5478289127349854, "objective/rlhf_reward": -11.78028678894043, "objective/scores": -10.232458114624023, "policy/approxkl_avg": 3.651325286568863e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8359352350234985, "step": 779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999978542327881, "val/ratio_var": NaN }, { "episode": 780, "epoch": 0.14647887323943662, "eps": 0, "loss/policy_avg": -8.565974712837487e-05, "loss/value_avg": 0.42197078466415405, "lr": 6.63e-07, "objective/entropy": 113.03877258300781, "objective/kl": 31.109432220458984, "objective/non_score_reward": -1.555471658706665, "objective/rlhf_reward": -10.688169479370117, "objective/scores": -9.132698059082031, "policy/approxkl_avg": 3.2372616942666355e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.034769296646118, "step": 780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000922679901123, "val/ratio_var": NaN }, { "episode": 781, "epoch": 0.14666666666666667, "eps": 0, "loss/policy_avg": 3.107538941549137e-05, "loss/value_avg": 0.15217962861061096, "lr": 6.6e-07, "objective/entropy": 17.792041778564453, "objective/kl": 55.51509094238281, "objective/non_score_reward": -2.775754690170288, "objective/rlhf_reward": -12.594615936279297, "objective/scores": -9.81886100769043, "policy/approxkl_avg": 1.46556127234021e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.3449154198169708, "step": 781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000012993812561, "val/ratio_var": NaN }, { "episode": 782, "epoch": 0.14685446009389672, "eps": 0, "loss/policy_avg": -6.612966535612941e-05, "loss/value_avg": 0.37621796131134033, "lr": 6.57e-07, "objective/entropy": 89.160400390625, "objective/kl": 38.306087493896484, "objective/non_score_reward": -1.9153045415878296, "objective/rlhf_reward": -12.014934539794922, "objective/scores": -10.099630355834961, "policy/approxkl_avg": 8.93680933700125e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1894209384918213, "step": 782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999601244926453, "val/ratio_var": NaN }, { "episode": 783, "epoch": 0.14704225352112676, "eps": 0, "loss/policy_avg": 1.583459265930287e-06, "loss/value_avg": 0.4267531931400299, "lr": 6.54e-07, "objective/entropy": 86.30342102050781, "objective/kl": 37.46908950805664, "objective/non_score_reward": -1.8734545707702637, "objective/rlhf_reward": -10.628002166748047, "objective/scores": -8.754548072814941, "policy/approxkl_avg": 2.1406825112535444e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9220086336135864, "step": 783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998320937156677, "val/ratio_var": NaN }, { "episode": 784, "epoch": 0.1472300469483568, "eps": 0, "loss/policy_avg": 3.242042657802813e-05, "loss/value_avg": 1.5738028287887573, "lr": 6.51e-07, "objective/entropy": 126.34075164794922, "objective/kl": 52.485469818115234, "objective/non_score_reward": -2.6242737770080566, "objective/rlhf_reward": -12.331857681274414, "objective/scores": -9.707584381103516, "policy/approxkl_avg": 1.083784937350174e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.127383232116699, "step": 784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000035762786865, "val/ratio_var": NaN }, { "episode": 785, "epoch": 0.14741784037558686, "eps": 0, "loss/policy_avg": -1.6675805454724468e-05, "loss/value_avg": 0.24276115000247955, "lr": 6.48e-07, "objective/entropy": 85.75590515136719, "objective/kl": 38.25080871582031, "objective/non_score_reward": -1.9125405550003052, "objective/rlhf_reward": -12.726364135742188, "objective/scores": -10.813823699951172, "policy/approxkl_avg": 6.112671258051705e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6612365245819092, "step": 785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999975860118866, "val/ratio_var": NaN }, { "episode": 786, "epoch": 0.1476056338028169, "eps": 0, "loss/policy_avg": 5.0373797421343625e-05, "loss/value_avg": 0.6543755531311035, "lr": 6.45e-07, "objective/entropy": 76.14405822753906, "objective/kl": 43.815608978271484, "objective/non_score_reward": -2.1907806396484375, "objective/rlhf_reward": -10.945955276489258, "objective/scores": -8.75517463684082, "policy/approxkl_avg": 5.334957009495156e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3377363681793213, "step": 786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999582767486572, "val/ratio_var": NaN }, { "episode": 787, "epoch": 0.14779342723004696, "eps": 0, "loss/policy_avg": 1.300056010222761e-05, "loss/value_avg": 1.187119483947754, "lr": 6.42e-07, "objective/entropy": 143.48983764648438, "objective/kl": 42.403831481933594, "objective/non_score_reward": -2.120191812515259, "objective/rlhf_reward": -8.922174453735352, "objective/scores": -6.801982402801514, "policy/approxkl_avg": 8.714366828144193e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.724993944168091, "step": 787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000128746032715, "val/ratio_var": NaN }, { "episode": 788, "epoch": 0.147981220657277, "eps": 0, "loss/policy_avg": -1.0850294529518578e-05, "loss/value_avg": 0.11597289144992828, "lr": 6.39e-07, "objective/entropy": 76.52420043945312, "objective/kl": 40.67932891845703, "objective/non_score_reward": -2.033966541290283, "objective/rlhf_reward": -12.495460510253906, "objective/scores": -10.461494445800781, "policy/approxkl_avg": 6.375379513201551e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.486933946609497, "step": 788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999408721923828, "val/ratio_var": NaN }, { "episode": 789, "epoch": 0.14816901408450706, "eps": 0, "loss/policy_avg": -3.563682912499644e-05, "loss/value_avg": 1.080810785293579, "lr": 6.36e-07, "objective/entropy": 13.239225387573242, "objective/kl": 40.28741455078125, "objective/non_score_reward": -2.014370918273926, "objective/rlhf_reward": -9.739316940307617, "objective/scores": -7.724946022033691, "policy/approxkl_avg": 1.6594171370343247e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.35176748037338257, "step": 789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999843239784241, "val/ratio_var": NaN }, { "episode": 790, "epoch": 0.14835680751173708, "eps": 0, "loss/policy_avg": 4.6145240048645064e-05, "loss/value_avg": 1.5610172748565674, "lr": 6.33e-07, "objective/entropy": 60.38984680175781, "objective/kl": 44.539886474609375, "objective/non_score_reward": -2.226994037628174, "objective/rlhf_reward": -8.133455276489258, "objective/scores": -5.906460762023926, "policy/approxkl_avg": 4.081693916191398e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1088752746582031, "step": 790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000057578086853, "val/ratio_var": NaN }, { "episode": 791, "epoch": 0.14854460093896713, "eps": 0, "loss/policy_avg": -9.707684512250125e-06, "loss/value_avg": 0.6178264021873474, "lr": 6.3e-07, "objective/entropy": 93.86734008789062, "objective/kl": 26.00030517578125, "objective/non_score_reward": -1.3000153303146362, "objective/rlhf_reward": -11.808412551879883, "objective/scores": -10.508397102355957, "policy/approxkl_avg": 1.4757684141386562e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7737599611282349, "step": 791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000427961349487, "val/ratio_var": NaN }, { "episode": 792, "epoch": 0.14873239436619717, "eps": 0, "loss/policy_avg": 0.00011635726696113124, "loss/value_avg": 0.3508826196193695, "lr": 6.27e-07, "objective/entropy": 97.92556762695312, "objective/kl": 56.348514556884766, "objective/non_score_reward": -2.8174259662628174, "objective/rlhf_reward": -12.833760261535645, "objective/scores": -10.016334533691406, "policy/approxkl_avg": 9.186677374373176e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.034810781478882, "step": 792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000026822090149, "val/ratio_var": NaN }, { "episode": 793, "epoch": 0.14892018779342722, "eps": 0, "loss/policy_avg": -5.883990525035188e-06, "loss/value_avg": 0.23829910159111023, "lr": 6.24e-07, "objective/entropy": 74.49545288085938, "objective/kl": 32.45478057861328, "objective/non_score_reward": -1.6227390766143799, "objective/rlhf_reward": -10.018287658691406, "objective/scores": -8.395548820495605, "policy/approxkl_avg": 4.2979031888989994e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2446335554122925, "step": 793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999631643295288, "val/ratio_var": NaN }, { "episode": 794, "epoch": 0.14910798122065727, "eps": 0, "loss/policy_avg": 0.00011136847024317831, "loss/value_avg": 0.4461505711078644, "lr": 6.21e-07, "objective/entropy": 45.36363983154297, "objective/kl": 63.50576400756836, "objective/non_score_reward": -3.175288438796997, "objective/rlhf_reward": -14.070000648498535, "objective/scores": -10.894712448120117, "policy/approxkl_avg": 1.3071708337974997e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7845298051834106, "step": 794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999949932098389, "val/ratio_var": NaN }, { "episode": 795, "epoch": 0.14929577464788732, "eps": 0, "loss/policy_avg": -2.2402349713956937e-05, "loss/value_avg": 0.4449780583381653, "lr": 6.18e-07, "objective/entropy": 59.122310638427734, "objective/kl": 79.33708190917969, "objective/non_score_reward": -3.9668540954589844, "objective/rlhf_reward": -12.80784797668457, "objective/scores": -8.840993881225586, "policy/approxkl_avg": 5.1749658780408936e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.219001054763794, "step": 795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000008225440979, "val/ratio_var": NaN }, { "episode": 796, "epoch": 0.14948356807511737, "eps": 0, "loss/policy_avg": -1.0904275768552907e-05, "loss/value_avg": 0.35277581214904785, "lr": 6.149999999999999e-07, "objective/entropy": 19.18593978881836, "objective/kl": 24.196157455444336, "objective/non_score_reward": -1.2098077535629272, "objective/rlhf_reward": -9.963383674621582, "objective/scores": -8.753576278686523, "policy/approxkl_avg": 1.7219353054542808e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.39830392599105835, "step": 796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999580383300781, "val/ratio_var": NaN }, { "episode": 797, "epoch": 0.14967136150234742, "eps": 0, "loss/policy_avg": 8.595664257882163e-05, "loss/value_avg": 0.07847526669502258, "lr": 6.12e-07, "objective/entropy": 73.5054702758789, "objective/kl": 32.145877838134766, "objective/non_score_reward": -1.607293725013733, "objective/rlhf_reward": -11.09023666381836, "objective/scores": -9.482942581176758, "policy/approxkl_avg": 5.016828907855597e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1651196479797363, "step": 797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999310374259949, "val/ratio_var": NaN }, { "episode": 798, "epoch": 0.14985915492957746, "eps": 0, "loss/policy_avg": -1.4431071576836985e-05, "loss/value_avg": 0.17220324277877808, "lr": 6.09e-07, "objective/entropy": 134.1142578125, "objective/kl": 12.891138076782227, "objective/non_score_reward": -0.644556999206543, "objective/rlhf_reward": -11.885764122009277, "objective/scores": -11.241207122802734, "policy/approxkl_avg": 1.2658260573061852e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.202259063720703, "step": 798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999495148658752, "val/ratio_var": NaN }, { "episode": 799, "epoch": 0.1500469483568075, "eps": 0, "loss/policy_avg": -8.924952453526203e-06, "loss/value_avg": 0.21405529975891113, "lr": 6.060000000000001e-07, "objective/entropy": 116.20153045654297, "objective/kl": 33.66215515136719, "objective/non_score_reward": -1.683107614517212, "objective/rlhf_reward": -11.744784355163574, "objective/scores": -10.061676979064941, "policy/approxkl_avg": 1.6248590384293493e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9615801572799683, "step": 799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000473260879517, "val/ratio_var": NaN }, { "episode": 800, "epoch": 0.15023474178403756, "eps": 0, "loss/policy_avg": -7.62939453125e-06, "loss/value_avg": 0.41348591446876526, "lr": 6.030000000000001e-07, "objective/entropy": 82.37389373779297, "objective/kl": 36.7293586730957, "objective/non_score_reward": -1.836467981338501, "objective/rlhf_reward": -13.841939926147461, "objective/scores": -12.005472183227539, "policy/approxkl_avg": 7.161945347888832e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6939858198165894, "step": 800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999843835830688, "val/ratio_var": NaN }, { "episode": 801, "epoch": 0.1504225352112676, "eps": 0, "loss/policy_avg": -2.6705130949267186e-05, "loss/value_avg": 0.22695113718509674, "lr": 6.000000000000001e-07, "objective/entropy": 61.314857482910156, "objective/kl": 16.537792205810547, "objective/non_score_reward": -0.8268896341323853, "objective/rlhf_reward": -11.923489570617676, "objective/scores": -11.096599578857422, "policy/approxkl_avg": 2.7666271051884905e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3275768756866455, "step": 801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000014066696167, "val/ratio_var": NaN }, { "episode": 802, "epoch": 0.15061032863849766, "eps": 0, "loss/policy_avg": 3.18716156471055e-05, "loss/value_avg": 0.13507451117038727, "lr": 5.970000000000001e-07, "objective/entropy": 84.13212585449219, "objective/kl": 34.01031494140625, "objective/non_score_reward": -1.700515866279602, "objective/rlhf_reward": -11.696172714233398, "objective/scores": -9.995656967163086, "policy/approxkl_avg": 9.312535098615626e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9595601558685303, "step": 802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999776482582092, "val/ratio_var": NaN }, { "episode": 803, "epoch": 0.1507981220657277, "eps": 0, "loss/policy_avg": 4.448980689630844e-05, "loss/value_avg": 0.15407581627368927, "lr": 5.94e-07, "objective/entropy": 101.60400390625, "objective/kl": 16.382850646972656, "objective/non_score_reward": -0.8191425800323486, "objective/rlhf_reward": -10.620668411254883, "objective/scores": -9.801526069641113, "policy/approxkl_avg": 1.1544968003818212e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7815978527069092, "step": 803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999821186065674, "val/ratio_var": NaN }, { "episode": 804, "epoch": 0.15098591549295776, "eps": 0, "loss/policy_avg": -4.008131327282172e-06, "loss/value_avg": 0.20618344843387604, "lr": 5.91e-07, "objective/entropy": 91.71038818359375, "objective/kl": 24.92753028869629, "objective/non_score_reward": -1.2463765144348145, "objective/rlhf_reward": -10.539909362792969, "objective/scores": -9.293533325195312, "policy/approxkl_avg": 9.980139736853744e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5931569337844849, "step": 804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999871850013733, "val/ratio_var": NaN }, { "episode": 805, "epoch": 0.1511737089201878, "eps": 0, "loss/policy_avg": -1.4035206731932703e-05, "loss/value_avg": 0.09656815975904465, "lr": 5.88e-07, "objective/entropy": 87.24464416503906, "objective/kl": 36.10658645629883, "objective/non_score_reward": -1.8053293228149414, "objective/rlhf_reward": -11.134928703308105, "objective/scores": -9.329599380493164, "policy/approxkl_avg": 1.209640458910144e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.638731837272644, "step": 805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999327063560486, "val/ratio_var": NaN }, { "episode": 806, "epoch": 0.15136150234741785, "eps": 0, "loss/policy_avg": 6.703610415570438e-05, "loss/value_avg": 0.15354391932487488, "lr": 5.85e-07, "objective/entropy": 78.64720153808594, "objective/kl": 22.71089744567871, "objective/non_score_reward": -1.135545015335083, "objective/rlhf_reward": -11.250505447387695, "objective/scores": -10.114960670471191, "policy/approxkl_avg": 1.580485644581131e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.438893437385559, "step": 806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000251531600952, "val/ratio_var": NaN }, { "episode": 807, "epoch": 0.15154929577464787, "eps": 0, "loss/policy_avg": 8.750861888984218e-05, "loss/value_avg": 0.3446729779243469, "lr": 5.82e-07, "objective/entropy": 123.32347106933594, "objective/kl": 55.920352935791016, "objective/non_score_reward": -2.796017646789551, "objective/rlhf_reward": -12.837446212768555, "objective/scores": -10.041428565979004, "policy/approxkl_avg": 9.933183520161037e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4337780475616455, "step": 807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999496340751648, "val/ratio_var": NaN }, { "episode": 808, "epoch": 0.15173708920187792, "eps": 0, "loss/policy_avg": 1.2955575584783219e-05, "loss/value_avg": 0.18486958742141724, "lr": 5.79e-07, "objective/entropy": 76.68809509277344, "objective/kl": 54.92786407470703, "objective/non_score_reward": -2.7463934421539307, "objective/rlhf_reward": -11.866422653198242, "objective/scores": -9.12002944946289, "policy/approxkl_avg": 3.2509515079937046e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4272346496582031, "step": 808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000160932540894, "val/ratio_var": NaN }, { "episode": 809, "epoch": 0.15192488262910797, "eps": 0, "loss/policy_avg": -1.2055883189532324e-06, "loss/value_avg": 0.09783000499010086, "lr": 5.760000000000001e-07, "objective/entropy": 5.854029655456543, "objective/kl": 27.81761360168457, "objective/non_score_reward": -1.390880823135376, "objective/rlhf_reward": -10.918781280517578, "objective/scores": -9.527900695800781, "policy/approxkl_avg": 6.5218955236368e-11, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.13421055674552917, "step": 809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999987483024597, "val/ratio_var": NaN }, { "episode": 810, "epoch": 0.15211267605633802, "eps": 0, "loss/policy_avg": -5.398606299422681e-05, "loss/value_avg": 0.11519447714090347, "lr": 5.730000000000001e-07, "objective/entropy": 93.04791259765625, "objective/kl": 20.59920883178711, "objective/non_score_reward": -1.0299606323242188, "objective/rlhf_reward": -10.72116470336914, "objective/scores": -9.691204071044922, "policy/approxkl_avg": 1.3104603624469746e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.80998957157135, "step": 810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999516606330872, "val/ratio_var": NaN }, { "episode": 811, "epoch": 0.15230046948356807, "eps": 0, "loss/policy_avg": -3.6302601074567065e-05, "loss/value_avg": 0.49333712458610535, "lr": 5.7e-07, "objective/entropy": 102.4549560546875, "objective/kl": 54.06017303466797, "objective/non_score_reward": -2.7030086517333984, "objective/rlhf_reward": -12.807744979858398, "objective/scores": -10.104736328125, "policy/approxkl_avg": 8.004099782965568e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.887339472770691, "step": 811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.000008225440979, "val/ratio_var": NaN }, { "episode": 812, "epoch": 0.15248826291079812, "eps": 0, "loss/policy_avg": -2.1124786144355312e-05, "loss/value_avg": 0.19171912968158722, "lr": 5.67e-07, "objective/entropy": 77.68840789794922, "objective/kl": 32.940006256103516, "objective/non_score_reward": -1.6470005512237549, "objective/rlhf_reward": -11.318918228149414, "objective/scores": -9.671917915344238, "policy/approxkl_avg": 5.780698586477229e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6582837104797363, "step": 812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999970197677612, "val/ratio_var": NaN }, { "episode": 813, "epoch": 0.15267605633802817, "eps": 0, "loss/policy_avg": -4.858340980717912e-05, "loss/value_avg": 0.2087060809135437, "lr": 5.64e-07, "objective/entropy": 55.96753692626953, "objective/kl": 21.066368103027344, "objective/non_score_reward": -1.0533185005187988, "objective/rlhf_reward": -11.621288299560547, "objective/scores": -10.56796932220459, "policy/approxkl_avg": 9.519796151380433e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.255923867225647, "step": 813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999431371688843, "val/ratio_var": NaN }, { "episode": 814, "epoch": 0.15286384976525821, "eps": 0, "loss/policy_avg": -6.0945185396121815e-05, "loss/value_avg": 0.2648741602897644, "lr": 5.61e-07, "objective/entropy": 94.28382873535156, "objective/kl": 49.03349685668945, "objective/non_score_reward": -2.4516749382019043, "objective/rlhf_reward": -12.000818252563477, "objective/scores": -9.549142837524414, "policy/approxkl_avg": 7.005530733295018e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8827803134918213, "step": 814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000724792480469, "val/ratio_var": NaN }, { "episode": 815, "epoch": 0.15305164319248826, "eps": 0, "loss/policy_avg": 1.5474715837626718e-06, "loss/value_avg": 0.12059330195188522, "lr": 5.58e-07, "objective/entropy": 63.68206787109375, "objective/kl": 50.73091506958008, "objective/non_score_reward": -2.536545753479004, "objective/rlhf_reward": -12.493268013000488, "objective/scores": -9.956722259521484, "policy/approxkl_avg": 7.013138514366801e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6375294923782349, "step": 815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000059604644775, "val/ratio_var": NaN }, { "episode": 816, "epoch": 0.1532394366197183, "eps": 0, "loss/policy_avg": -5.909182073082775e-05, "loss/value_avg": 0.2284490466117859, "lr": 5.55e-07, "objective/entropy": 76.69854736328125, "objective/kl": 23.64881134033203, "objective/non_score_reward": -1.1824405193328857, "objective/rlhf_reward": -10.309880256652832, "objective/scores": -9.127439498901367, "policy/approxkl_avg": 7.501068921555998e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6196266412734985, "step": 816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000047206878662, "val/ratio_var": NaN }, { "episode": 817, "epoch": 0.15342723004694836, "eps": 0, "loss/policy_avg": -0.0001062717055901885, "loss/value_avg": 0.5130720138549805, "lr": 5.52e-07, "objective/entropy": 77.4860610961914, "objective/kl": 64.052978515625, "objective/non_score_reward": -3.2026491165161133, "objective/rlhf_reward": -12.483036041259766, "objective/scores": -9.280386924743652, "policy/approxkl_avg": 9.127493427740774e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.573378086090088, "step": 817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000296831130981, "val/ratio_var": NaN }, { "episode": 818, "epoch": 0.1536150234741784, "eps": 0, "loss/policy_avg": 6.859707355033606e-05, "loss/value_avg": 0.2675771117210388, "lr": 5.49e-07, "objective/entropy": 97.13026428222656, "objective/kl": 57.645751953125, "objective/non_score_reward": -2.8822875022888184, "objective/rlhf_reward": -12.464254379272461, "objective/scores": -9.581966400146484, "policy/approxkl_avg": 7.34666087964797e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.902661681175232, "step": 818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000382661819458, "val/ratio_var": NaN }, { "episode": 819, "epoch": 0.15380281690140846, "eps": 0, "loss/policy_avg": 1.8011849533650093e-05, "loss/value_avg": 0.40351688861846924, "lr": 5.46e-07, "objective/entropy": 96.59074401855469, "objective/kl": 26.833864212036133, "objective/non_score_reward": -1.34169340133667, "objective/rlhf_reward": -12.66834831237793, "objective/scores": -11.326655387878418, "policy/approxkl_avg": 8.078655611143404e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.910529375076294, "step": 819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998616576194763, "val/ratio_var": NaN }, { "episode": 820, "epoch": 0.1539906103286385, "eps": 0, "loss/policy_avg": 3.418832420720719e-06, "loss/value_avg": 0.2942241132259369, "lr": 5.43e-07, "objective/entropy": 102.87922668457031, "objective/kl": 17.520484924316406, "objective/non_score_reward": -0.8760243058204651, "objective/rlhf_reward": -9.971654891967773, "objective/scores": -9.095630645751953, "policy/approxkl_avg": 1.3201973558807367e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1942760944366455, "step": 820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.0000420808792114, "val/ratio_var": NaN }, { "episode": 821, "epoch": 0.15417840375586855, "eps": 0, "loss/policy_avg": 3.0256667741923593e-05, "loss/value_avg": 0.31073012948036194, "lr": 5.4e-07, "objective/entropy": 93.72694396972656, "objective/kl": 36.37100601196289, "objective/non_score_reward": -1.81855046749115, "objective/rlhf_reward": -12.34501838684082, "objective/scores": -10.526468276977539, "policy/approxkl_avg": 8.856837752091451e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6241570711135864, "step": 821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000024437904358, "val/ratio_var": NaN }, { "episode": 822, "epoch": 0.1543661971830986, "eps": 0, "loss/policy_avg": 8.155265095410869e-05, "loss/value_avg": 0.7824551463127136, "lr": 5.37e-07, "objective/entropy": 20.04275131225586, "objective/kl": 31.447847366333008, "objective/non_score_reward": -1.5723925828933716, "objective/rlhf_reward": -9.190762519836426, "objective/scores": -7.6183695793151855, "policy/approxkl_avg": 1.949406502887996e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.3088908791542053, "step": 822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000479221343994, "val/ratio_var": NaN }, { "episode": 823, "epoch": 0.15455399061032865, "eps": 0, "loss/policy_avg": -2.6477957362658344e-05, "loss/value_avg": 0.12654650211334229, "lr": 5.34e-07, "objective/entropy": 83.19505310058594, "objective/kl": 44.58616256713867, "objective/non_score_reward": -2.2293081283569336, "objective/rlhf_reward": -12.118938446044922, "objective/scores": -9.889630317687988, "policy/approxkl_avg": 8.950575391963866e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.718434453010559, "step": 823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999933660030365, "val/ratio_var": NaN }, { "episode": 824, "epoch": 0.15474178403755867, "eps": 0, "loss/policy_avg": 6.396815479092766e-06, "loss/value_avg": 0.22580017149448395, "lr": 5.31e-07, "objective/entropy": 84.95570373535156, "objective/kl": 23.81700897216797, "objective/non_score_reward": -1.1908504962921143, "objective/rlhf_reward": -12.528239250183105, "objective/scores": -11.33738899230957, "policy/approxkl_avg": 5.313104267656854e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6718980073928833, "step": 824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999558329582214, "val/ratio_var": NaN }, { "episode": 825, "epoch": 0.15492957746478872, "eps": 0, "loss/policy_avg": -0.00013767098425887525, "loss/value_avg": 0.16722305119037628, "lr": 5.28e-07, "objective/entropy": 110.17823028564453, "objective/kl": 38.27603530883789, "objective/non_score_reward": -1.913801908493042, "objective/rlhf_reward": -11.914217948913574, "objective/scores": -10.000415802001953, "policy/approxkl_avg": 7.663218326570131e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.835757851600647, "step": 825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999927878379822, "val/ratio_var": NaN }, { "episode": 826, "epoch": 0.15511737089201877, "eps": 0, "loss/policy_avg": 1.8137805454898626e-05, "loss/value_avg": 0.5407024025917053, "lr": 5.25e-07, "objective/entropy": 94.73126983642578, "objective/kl": 41.159759521484375, "objective/non_score_reward": -2.057988166809082, "objective/rlhf_reward": -14.05642318725586, "objective/scores": -11.998435020446777, "policy/approxkl_avg": 5.810321468402435e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.108428478240967, "step": 826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000029802322388, "val/ratio_var": NaN }, { "episode": 827, "epoch": 0.15530516431924882, "eps": 0, "loss/policy_avg": 1.9073486328125e-05, "loss/value_avg": 0.31649309396743774, "lr": 5.219999999999999e-07, "objective/entropy": 93.01176452636719, "objective/kl": 20.195329666137695, "objective/non_score_reward": -1.0097665786743164, "objective/rlhf_reward": -10.257902145385742, "objective/scores": -9.248135566711426, "policy/approxkl_avg": 8.893020719824563e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.85870361328125, "step": 827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000673532485962, "val/ratio_var": NaN }, { "episode": 828, "epoch": 0.15549295774647887, "eps": 0, "loss/policy_avg": -2.83403210232791e-07, "loss/value_avg": 0.3051440417766571, "lr": 5.189999999999999e-07, "objective/entropy": 121.00012969970703, "objective/kl": 33.99506378173828, "objective/non_score_reward": -1.6997534036636353, "objective/rlhf_reward": -10.842211723327637, "objective/scores": -9.142457962036133, "policy/approxkl_avg": 9.417351520824013e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1535139083862305, "step": 828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999000430107117, "val/ratio_var": NaN }, { "episode": 829, "epoch": 0.15568075117370891, "eps": 0, "loss/policy_avg": 2.5191397412527294e-07, "loss/value_avg": 0.04289963096380234, "lr": 5.16e-07, "objective/entropy": 4.104515075683594, "objective/kl": 22.066631317138672, "objective/non_score_reward": -1.1033316850662231, "objective/rlhf_reward": -9.675679206848145, "objective/scores": -8.572347640991211, "policy/approxkl_avg": 3.764017222906979e-11, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.1210891455411911, "step": 829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999998807907104, "val/ratio_var": NaN }, { "episode": 830, "epoch": 0.15586854460093896, "eps": 0, "loss/policy_avg": -7.773345714667812e-06, "loss/value_avg": 0.16601137816905975, "lr": 5.13e-07, "objective/entropy": 66.24897003173828, "objective/kl": 36.84999465942383, "objective/non_score_reward": -1.8424999713897705, "objective/rlhf_reward": -11.502880096435547, "objective/scores": -9.660380363464355, "policy/approxkl_avg": 9.816916701765876e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4962526559829712, "step": 830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000282526016235, "val/ratio_var": NaN }, { "episode": 831, "epoch": 0.156056338028169, "eps": 0, "loss/policy_avg": -2.8655214919126593e-05, "loss/value_avg": 1.20947265625, "lr": 5.100000000000001e-07, "objective/entropy": 18.051219940185547, "objective/kl": 37.415199279785156, "objective/non_score_reward": -1.8707600831985474, "objective/rlhf_reward": -7.835763931274414, "objective/scores": -5.965003967285156, "policy/approxkl_avg": 2.405118948445306e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5058708786964417, "step": 831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999575614929199, "val/ratio_var": NaN }, { "episode": 832, "epoch": 0.15624413145539906, "eps": 0, "loss/policy_avg": -1.1039230230380781e-05, "loss/value_avg": 0.18218763172626495, "lr": 5.070000000000001e-07, "objective/entropy": 54.86686706542969, "objective/kl": 29.53212547302246, "objective/non_score_reward": -1.4766063690185547, "objective/rlhf_reward": -10.099019050598145, "objective/scores": -8.62241268157959, "policy/approxkl_avg": 3.926429315015412e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2390252351760864, "step": 832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999644756317139, "val/ratio_var": NaN }, { "episode": 833, "epoch": 0.1564319248826291, "eps": 0, "loss/policy_avg": 6.2888525462767575e-06, "loss/value_avg": 0.8753882646560669, "lr": 5.040000000000001e-07, "objective/entropy": 21.959426879882812, "objective/kl": 44.629112243652344, "objective/non_score_reward": -2.2314555644989014, "objective/rlhf_reward": -9.46800708770752, "objective/scores": -7.236551761627197, "policy/approxkl_avg": 2.45218867434005e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4230772852897644, "step": 833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000015377998352, "val/ratio_var": NaN }, { "episode": 834, "epoch": 0.15661971830985916, "eps": 0, "loss/policy_avg": -2.286344169988297e-05, "loss/value_avg": 0.1324317306280136, "lr": 5.01e-07, "objective/entropy": 18.03795623779297, "objective/kl": 31.14957046508789, "objective/non_score_reward": -1.5574785470962524, "objective/rlhf_reward": -9.46423625946045, "objective/scores": -7.906757831573486, "policy/approxkl_avg": 1.3408390842073459e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.44444188475608826, "step": 834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000466108322144, "val/ratio_var": NaN }, { "episode": 835, "epoch": 0.1568075117370892, "eps": 0, "loss/policy_avg": 1.5087847714312375e-05, "loss/value_avg": 0.5281351804733276, "lr": 4.98e-07, "objective/entropy": 102.44727325439453, "objective/kl": 35.588279724121094, "objective/non_score_reward": -1.7794139385223389, "objective/rlhf_reward": -13.404740333557129, "objective/scores": -11.625326156616211, "policy/approxkl_avg": 7.359459885947217e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0369067192077637, "step": 835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000734329223633, "val/ratio_var": NaN }, { "episode": 836, "epoch": 0.15699530516431925, "eps": 0, "loss/policy_avg": 8.979833364719525e-05, "loss/value_avg": 0.09041687101125717, "lr": 4.95e-07, "objective/entropy": 25.333925247192383, "objective/kl": 43.12239456176758, "objective/non_score_reward": -2.1561198234558105, "objective/rlhf_reward": -11.430543899536133, "objective/scores": -9.27442455291748, "policy/approxkl_avg": 6.920495820850192e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5841548442840576, "step": 836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000275373458862, "val/ratio_var": NaN }, { "episode": 837, "epoch": 0.1571830985915493, "eps": 0, "loss/policy_avg": -7.291109795914963e-05, "loss/value_avg": 0.5098007321357727, "lr": 4.92e-07, "objective/entropy": 110.45637512207031, "objective/kl": 40.875144958496094, "objective/non_score_reward": -2.043757438659668, "objective/rlhf_reward": -10.332768440246582, "objective/scores": -8.289011001586914, "policy/approxkl_avg": 1.506152074171041e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9020397663116455, "step": 837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000152587890625, "val/ratio_var": NaN }, { "episode": 838, "epoch": 0.15737089201877935, "eps": 0, "loss/policy_avg": 9.354555368190631e-05, "loss/value_avg": 0.5006235837936401, "lr": 4.89e-07, "objective/entropy": 97.89346313476562, "objective/kl": 51.0350227355957, "objective/non_score_reward": -2.5517513751983643, "objective/rlhf_reward": -10.911578178405762, "objective/scores": -8.359827041625977, "policy/approxkl_avg": 6.00284550955621e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3734500408172607, "step": 838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999043345451355, "val/ratio_var": NaN }, { "episode": 839, "epoch": 0.1575586854460094, "eps": 0, "loss/policy_avg": 4.517357228905894e-05, "loss/value_avg": 0.19916102290153503, "lr": 4.86e-07, "objective/entropy": 95.7806625366211, "objective/kl": 29.21830940246582, "objective/non_score_reward": -1.460915446281433, "objective/rlhf_reward": -11.602867126464844, "objective/scores": -10.141951560974121, "policy/approxkl_avg": 6.259794105289984e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7573587894439697, "step": 839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000040531158447, "val/ratio_var": NaN }, { "episode": 840, "epoch": 0.15774647887323945, "eps": 0, "loss/policy_avg": 4.624420853360789e-06, "loss/value_avg": 0.37295404076576233, "lr": 4.830000000000001e-07, "objective/entropy": 73.06664276123047, "objective/kl": 57.56177520751953, "objective/non_score_reward": -2.8780884742736816, "objective/rlhf_reward": -12.940256118774414, "objective/scores": -10.06216812133789, "policy/approxkl_avg": 3.848765572911361e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4563748836517334, "step": 840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000020146369934, "val/ratio_var": NaN }, { "episode": 841, "epoch": 0.1579342723004695, "eps": 0, "loss/policy_avg": 2.3176085960585624e-05, "loss/value_avg": 0.5569528341293335, "lr": 4.800000000000001e-07, "objective/entropy": 41.77384948730469, "objective/kl": 68.27159118652344, "objective/non_score_reward": -3.4135797023773193, "objective/rlhf_reward": -11.709957122802734, "objective/scores": -8.296377182006836, "policy/approxkl_avg": 4.174988177396699e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9112779498100281, "step": 841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999800324440002, "val/ratio_var": NaN }, { "episode": 842, "epoch": 0.15812206572769952, "eps": 0, "loss/policy_avg": -1.8551663742982782e-05, "loss/value_avg": 0.2643900513648987, "lr": 4.77e-07, "objective/entropy": 87.1575927734375, "objective/kl": 31.72145652770996, "objective/non_score_reward": -1.5860729217529297, "objective/rlhf_reward": -12.249580383300781, "objective/scores": -10.663507461547852, "policy/approxkl_avg": 1.8051046879463684e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8545221090316772, "step": 842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999502301216125, "val/ratio_var": NaN }, { "episode": 843, "epoch": 0.15830985915492957, "eps": 0, "loss/policy_avg": -7.966778866830282e-06, "loss/value_avg": 0.14224423468112946, "lr": 4.7400000000000004e-07, "objective/entropy": 90.26080322265625, "objective/kl": 20.55486488342285, "objective/non_score_reward": -1.0277433395385742, "objective/rlhf_reward": -11.503951072692871, "objective/scores": -10.476207733154297, "policy/approxkl_avg": 9.1297749804653e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.872079610824585, "step": 843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000184774398804, "val/ratio_var": NaN }, { "episode": 844, "epoch": 0.15849765258215961, "eps": 0, "loss/policy_avg": -9.921811579260975e-05, "loss/value_avg": 0.17816989123821259, "lr": 4.71e-07, "objective/entropy": 89.8841323852539, "objective/kl": 28.50619125366211, "objective/non_score_reward": -1.4253095388412476, "objective/rlhf_reward": -10.688983917236328, "objective/scores": -9.26367473602295, "policy/approxkl_avg": 9.645248155720765e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0180296897888184, "step": 844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999053478240967, "val/ratio_var": NaN }, { "episode": 845, "epoch": 0.15868544600938966, "eps": 0, "loss/policy_avg": 1.1705003089446109e-05, "loss/value_avg": 0.14605700969696045, "lr": 4.68e-07, "objective/entropy": 57.65314865112305, "objective/kl": 24.883577346801758, "objective/non_score_reward": -1.2441790103912354, "objective/rlhf_reward": -11.619911193847656, "objective/scores": -10.375732421875, "policy/approxkl_avg": 6.54199752148088e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2556982040405273, "step": 845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999950110912323, "val/ratio_var": NaN }, { "episode": 846, "epoch": 0.1588732394366197, "eps": 0, "loss/policy_avg": 5.830008831253508e-06, "loss/value_avg": 0.06726200878620148, "lr": 4.65e-07, "objective/entropy": 4.193295001983643, "objective/kl": 38.93879699707031, "objective/non_score_reward": -1.9469397068023682, "objective/rlhf_reward": -11.616470336914062, "objective/scores": -9.669530868530273, "policy/approxkl_avg": 3.353053612542567e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.12341078370809555, "step": 846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999903440475464, "val/ratio_var": NaN }, { "episode": 847, "epoch": 0.15906103286384976, "eps": 0, "loss/policy_avg": -2.249231874884572e-05, "loss/value_avg": 0.608018696308136, "lr": 4.62e-07, "objective/entropy": 118.61072540283203, "objective/kl": 53.797569274902344, "objective/non_score_reward": -2.689878463745117, "objective/rlhf_reward": -10.980001449584961, "objective/scores": -8.290122985839844, "policy/approxkl_avg": 6.820791043082863e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0736429691314697, "step": 847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999988079071045, "val/ratio_var": NaN }, { "episode": 848, "epoch": 0.1592488262910798, "eps": 0, "loss/policy_avg": 4.783666372532025e-05, "loss/value_avg": 0.15660397708415985, "lr": 4.59e-07, "objective/entropy": 68.53981018066406, "objective/kl": 15.65165901184082, "objective/non_score_reward": -0.7825829386711121, "objective/rlhf_reward": -10.710723876953125, "objective/scores": -9.928140640258789, "policy/approxkl_avg": 8.587145572391819e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6442525386810303, "step": 848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000077724456787, "val/ratio_var": NaN }, { "episode": 849, "epoch": 0.15943661971830986, "eps": 0, "loss/policy_avg": -1.8715858459472656e-05, "loss/value_avg": 0.19810961186885834, "lr": 4.56e-07, "objective/entropy": 89.15933227539062, "objective/kl": 43.677513122558594, "objective/non_score_reward": -2.183875560760498, "objective/rlhf_reward": -12.757553100585938, "objective/scores": -10.573678016662598, "policy/approxkl_avg": 9.594393901579679e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.831782579421997, "step": 849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999954104423523, "val/ratio_var": NaN }, { "episode": 850, "epoch": 0.1596244131455399, "eps": 0, "loss/policy_avg": -3.167818067595363e-05, "loss/value_avg": 0.29174748063087463, "lr": 4.53e-07, "objective/entropy": 13.783346176147461, "objective/kl": 49.08049392700195, "objective/non_score_reward": -2.4540247917175293, "objective/rlhf_reward": -11.223026275634766, "objective/scores": -8.769001007080078, "policy/approxkl_avg": 3.902886902551472e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.31540897488594055, "step": 850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999736547470093, "val/ratio_var": NaN }, { "episode": 851, "epoch": 0.15981220657276995, "eps": 0, "loss/policy_avg": -6.783683056710288e-05, "loss/value_avg": 0.18941448628902435, "lr": 4.5e-07, "objective/entropy": 141.23406982421875, "objective/kl": 49.52582550048828, "objective/non_score_reward": -2.4762916564941406, "objective/rlhf_reward": -12.039616584777832, "objective/scores": -9.563324928283691, "policy/approxkl_avg": 9.590011984528246e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.155390977859497, "step": 851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000015377998352, "val/ratio_var": NaN }, { "episode": 852, "epoch": 0.16, "eps": 0, "loss/policy_avg": -5.546605825657025e-05, "loss/value_avg": 0.496733695268631, "lr": 4.4699999999999997e-07, "objective/entropy": 22.050418853759766, "objective/kl": 66.66474914550781, "objective/non_score_reward": -3.333237648010254, "objective/rlhf_reward": -12.387042045593262, "objective/scores": -9.053804397583008, "policy/approxkl_avg": 1.813935313066395e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4605344533920288, "step": 852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999716281890869, "val/ratio_var": NaN }, { "episode": 853, "epoch": 0.16018779342723005, "eps": 0, "loss/policy_avg": -4.8160552978515625e-05, "loss/value_avg": 21.35958480834961, "lr": 4.44e-07, "objective/entropy": 18.705547332763672, "objective/kl": 45.811676025390625, "objective/non_score_reward": -2.290583848953247, "objective/rlhf_reward": -5.266453266143799, "objective/scores": -2.9758694171905518, "policy/approxkl_avg": 8.270597540160907e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.38375285267829895, "step": 853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999630451202393, "val/ratio_var": NaN }, { "episode": 854, "epoch": 0.1603755868544601, "eps": 0, "loss/policy_avg": -3.2388938961958047e-06, "loss/value_avg": 0.4421752393245697, "lr": 4.41e-07, "objective/entropy": 130.33187866210938, "objective/kl": 43.71908187866211, "objective/non_score_reward": -2.1859543323516846, "objective/rlhf_reward": -10.750232696533203, "objective/scores": -8.564278602600098, "policy/approxkl_avg": 9.095467135011859e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.229747772216797, "step": 854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999697804450989, "val/ratio_var": NaN }, { "episode": 855, "epoch": 0.16056338028169015, "eps": 0, "loss/policy_avg": 0.0001319849252467975, "loss/value_avg": 0.5220931172370911, "lr": 4.38e-07, "objective/entropy": 101.1594467163086, "objective/kl": 19.342708587646484, "objective/non_score_reward": -0.967135488986969, "objective/rlhf_reward": -13.083853721618652, "objective/scores": -12.116718292236328, "policy/approxkl_avg": 2.2523487075432058e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6948496103286743, "step": 855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998087286949158, "val/ratio_var": NaN }, { "episode": 856, "epoch": 0.1607511737089202, "eps": 0, "loss/policy_avg": -2.7948955903411843e-05, "loss/value_avg": 0.33471623063087463, "lr": 4.3499999999999996e-07, "objective/entropy": 97.63127899169922, "objective/kl": 26.681198120117188, "objective/non_score_reward": -1.3340599536895752, "objective/rlhf_reward": -9.718422889709473, "objective/scores": -8.384363174438477, "policy/approxkl_avg": 1.0598341049217197e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7205027341842651, "step": 856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000001192092896, "val/ratio_var": NaN }, { "episode": 857, "epoch": 0.16093896713615025, "eps": 0, "loss/policy_avg": 7.13276385795325e-05, "loss/value_avg": 0.4680757224559784, "lr": 4.32e-07, "objective/entropy": 104.51844024658203, "objective/kl": 61.04383850097656, "objective/non_score_reward": -3.052191734313965, "objective/rlhf_reward": -12.598830223083496, "objective/scores": -9.546638488769531, "policy/approxkl_avg": 1.3968563905564224e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7485628128051758, "step": 857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000035047531128, "val/ratio_var": NaN }, { "episode": 858, "epoch": 0.1611267605633803, "eps": 0, "loss/policy_avg": 8.4301209426485e-06, "loss/value_avg": 0.31487104296684265, "lr": 4.29e-07, "objective/entropy": 137.2862548828125, "objective/kl": 35.17189025878906, "objective/non_score_reward": -1.7585947513580322, "objective/rlhf_reward": -11.358874320983887, "objective/scores": -9.600279808044434, "policy/approxkl_avg": 6.687010767336687e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.322896718978882, "step": 858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000144243240356, "val/ratio_var": NaN }, { "episode": 859, "epoch": 0.16131455399061032, "eps": 0, "loss/policy_avg": 7.744105460005812e-06, "loss/value_avg": 0.5400773882865906, "lr": 4.26e-07, "objective/entropy": 132.72299194335938, "objective/kl": 65.37242889404297, "objective/non_score_reward": -3.2686214447021484, "objective/rlhf_reward": -11.102624893188477, "objective/scores": -7.834003448486328, "policy/approxkl_avg": 8.24960793011087e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2513198852539062, "step": 859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999374151229858, "val/ratio_var": NaN }, { "episode": 860, "epoch": 0.16150234741784036, "eps": 0, "loss/policy_avg": 2.0602963559213094e-05, "loss/value_avg": 0.06921471655368805, "lr": 4.2299999999999996e-07, "objective/entropy": 95.91400146484375, "objective/kl": 43.338592529296875, "objective/non_score_reward": -2.1669297218322754, "objective/rlhf_reward": -11.913528442382812, "objective/scores": -9.746598243713379, "policy/approxkl_avg": 6.093993221156779e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.966566562652588, "step": 860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999510645866394, "val/ratio_var": NaN }, { "episode": 861, "epoch": 0.1616901408450704, "eps": 0, "loss/policy_avg": 1.596054971741978e-05, "loss/value_avg": 0.022318139672279358, "lr": 4.2000000000000006e-07, "objective/entropy": 4.446290016174316, "objective/kl": 24.92426109313965, "objective/non_score_reward": -1.2462129592895508, "objective/rlhf_reward": -10.11995792388916, "objective/scores": -8.87374496459961, "policy/approxkl_avg": 1.1174043912376419e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.12237434089183807, "step": 861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999981164932251, "val/ratio_var": NaN }, { "episode": 862, "epoch": 0.16187793427230046, "eps": 0, "loss/policy_avg": 1.9028500901185907e-05, "loss/value_avg": 2.5992836952209473, "lr": 4.1700000000000004e-07, "objective/entropy": 83.80549621582031, "objective/kl": 74.77653503417969, "objective/non_score_reward": -3.7388272285461426, "objective/rlhf_reward": -12.826574325561523, "objective/scores": -9.087747573852539, "policy/approxkl_avg": 1.0880440015625936e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4354552030563354, "step": 862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000264644622803, "val/ratio_var": NaN }, { "episode": 863, "epoch": 0.1620657276995305, "eps": 0, "loss/policy_avg": 8.077441452769563e-05, "loss/value_avg": 0.20193949341773987, "lr": 4.1400000000000003e-07, "objective/entropy": 111.96841430664062, "objective/kl": 30.658660888671875, "objective/non_score_reward": -1.532932996749878, "objective/rlhf_reward": -12.443358421325684, "objective/scores": -10.910425186157227, "policy/approxkl_avg": 1.2557110551369988e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0151851177215576, "step": 863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000319480895996, "val/ratio_var": NaN }, { "episode": 864, "epoch": 0.16225352112676056, "eps": 0, "loss/policy_avg": -2.986979961860925e-05, "loss/value_avg": 0.6715213656425476, "lr": 4.1100000000000007e-07, "objective/entropy": 14.532817840576172, "objective/kl": 57.9564094543457, "objective/non_score_reward": -2.8978207111358643, "objective/rlhf_reward": -9.832348823547363, "objective/scores": -6.934528350830078, "policy/approxkl_avg": 3.8332030882770596e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5617629885673523, "step": 864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999725818634033, "val/ratio_var": NaN }, { "episode": 865, "epoch": 0.1624413145539906, "eps": 0, "loss/policy_avg": 7.1984417445492e-05, "loss/value_avg": 1.2184311151504517, "lr": 4.0800000000000005e-07, "objective/entropy": 88.88031005859375, "objective/kl": 42.71998977661133, "objective/non_score_reward": -2.1359996795654297, "objective/rlhf_reward": -8.386085510253906, "objective/scores": -6.250086307525635, "policy/approxkl_avg": 1.1018319412414712e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9452975988388062, "step": 865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999424815177917, "val/ratio_var": NaN }, { "episode": 866, "epoch": 0.16262910798122066, "eps": 0, "loss/policy_avg": 1.2487735148170032e-05, "loss/value_avg": 1.2763738632202148, "lr": 4.0500000000000004e-07, "objective/entropy": 88.6906967163086, "objective/kl": 68.91214752197266, "objective/non_score_reward": -3.4456071853637695, "objective/rlhf_reward": -13.838654518127441, "objective/scores": -10.393047332763672, "policy/approxkl_avg": 7.955829062211706e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5988285541534424, "step": 866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999819993972778, "val/ratio_var": NaN }, { "episode": 867, "epoch": 0.1628169014084507, "eps": 0, "loss/policy_avg": 5.0666196329984814e-05, "loss/value_avg": 0.5785573124885559, "lr": 4.02e-07, "objective/entropy": 69.03265380859375, "objective/kl": 83.98564910888672, "objective/non_score_reward": -4.199282646179199, "objective/rlhf_reward": -13.350728034973145, "objective/scores": -9.151445388793945, "policy/approxkl_avg": 1.2052932163442165e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.527760624885559, "step": 867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999687075614929, "val/ratio_var": NaN }, { "episode": 868, "epoch": 0.16300469483568075, "eps": 0, "loss/policy_avg": -2.0998828404117376e-05, "loss/value_avg": 0.4801427125930786, "lr": 3.99e-07, "objective/entropy": 103.57394409179688, "objective/kl": 43.205142974853516, "objective/non_score_reward": -2.160257339477539, "objective/rlhf_reward": -10.552641868591309, "objective/scores": -8.39238452911377, "policy/approxkl_avg": 8.488809299933564e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9976023435592651, "step": 868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000039339065552, "val/ratio_var": NaN }, { "episode": 869, "epoch": 0.1631924882629108, "eps": 0, "loss/policy_avg": -2.4786536414467264e-06, "loss/value_avg": 0.09044190496206284, "lr": 3.9600000000000005e-07, "objective/entropy": 108.90570068359375, "objective/kl": 38.1839714050293, "objective/non_score_reward": -1.909198522567749, "objective/rlhf_reward": -12.543745994567871, "objective/scores": -10.634547233581543, "policy/approxkl_avg": 1.1441868963402158e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.197286367416382, "step": 869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999986469745636, "val/ratio_var": NaN }, { "episode": 870, "epoch": 0.16338028169014085, "eps": 0, "loss/policy_avg": -0.00012270009028725326, "loss/value_avg": 0.0812043845653534, "lr": 3.9300000000000004e-07, "objective/entropy": 53.25826644897461, "objective/kl": 38.93324279785156, "objective/non_score_reward": -1.946662187576294, "objective/rlhf_reward": -11.628344535827637, "objective/scores": -9.681682586669922, "policy/approxkl_avg": 4.517834284456512e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3635414838790894, "step": 870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999738931655884, "val/ratio_var": NaN }, { "episode": 871, "epoch": 0.1635680751173709, "eps": 0, "loss/policy_avg": 8.389634785999078e-06, "loss/value_avg": 0.40558674931526184, "lr": 3.9e-07, "objective/entropy": 6.954255104064941, "objective/kl": 18.79407501220703, "objective/non_score_reward": -0.9397038221359253, "objective/rlhf_reward": -10.392925262451172, "objective/scores": -9.453221321105957, "policy/approxkl_avg": 4.0187586591855506e-10, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.11934100836515427, "step": 871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999979734420776, "val/ratio_var": NaN }, { "episode": 872, "epoch": 0.16375586854460095, "eps": 0, "loss/policy_avg": 1.8691116565605626e-06, "loss/value_avg": 0.8821293711662292, "lr": 3.87e-07, "objective/entropy": 5.827352523803711, "objective/kl": 26.895523071289062, "objective/non_score_reward": -1.3447761535644531, "objective/rlhf_reward": -9.821122169494629, "objective/scores": -8.476346015930176, "policy/approxkl_avg": 3.799479897370617e-10, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.12955805659294128, "step": 872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000038146972656, "val/ratio_var": NaN }, { "episode": 873, "epoch": 0.163943661971831, "eps": 0, "loss/policy_avg": 6.547063821926713e-05, "loss/value_avg": 0.11789055913686752, "lr": 3.84e-07, "objective/entropy": 6.850018501281738, "objective/kl": 26.037118911743164, "objective/non_score_reward": -1.3018559217453003, "objective/rlhf_reward": -10.943857192993164, "objective/scores": -9.642001152038574, "policy/approxkl_avg": 7.150982472836631e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.19680267572402954, "step": 873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999743700027466, "val/ratio_var": NaN }, { "episode": 874, "epoch": 0.16413145539906104, "eps": 0, "loss/policy_avg": -4.0117298340192065e-05, "loss/value_avg": 0.578097403049469, "lr": 3.8100000000000004e-07, "objective/entropy": 51.675689697265625, "objective/kl": 17.19123649597168, "objective/non_score_reward": -0.8595616817474365, "objective/rlhf_reward": -9.152711868286133, "objective/scores": -8.293149948120117, "policy/approxkl_avg": 5.2525130911362794e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1980003118515015, "step": 874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000286102294922, "val/ratio_var": NaN }, { "episode": 875, "epoch": 0.1643192488262911, "eps": 0, "loss/policy_avg": 1.1635276678134687e-05, "loss/value_avg": 0.17561088502407074, "lr": 3.78e-07, "objective/entropy": 66.70155334472656, "objective/kl": 26.95919418334961, "objective/non_score_reward": -1.3479597568511963, "objective/rlhf_reward": -10.428476333618164, "objective/scores": -9.080516815185547, "policy/approxkl_avg": 3.236594992017672e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.563591718673706, "step": 875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000014305114746, "val/ratio_var": NaN }, { "episode": 876, "epoch": 0.1645070422535211, "eps": 0, "loss/policy_avg": -3.936155917472206e-05, "loss/value_avg": 0.13211266696453094, "lr": 3.75e-07, "objective/entropy": 119.83422088623047, "objective/kl": 33.75497817993164, "objective/non_score_reward": -1.687748908996582, "objective/rlhf_reward": -11.939952850341797, "objective/scores": -10.252203941345215, "policy/approxkl_avg": 7.645550681445457e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9625589847564697, "step": 876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999690651893616, "val/ratio_var": NaN }, { "episode": 877, "epoch": 0.16469483568075116, "eps": 0, "loss/policy_avg": 8.595664257882163e-05, "loss/value_avg": 0.25229892134666443, "lr": 3.72e-07, "objective/entropy": 48.30602264404297, "objective/kl": 57.173583984375, "objective/non_score_reward": -2.8586790561676025, "objective/rlhf_reward": -11.940308570861816, "objective/scores": -9.081629753112793, "policy/approxkl_avg": 7.666935886163628e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.8894549608230591, "step": 877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999470114707947, "val/ratio_var": NaN }, { "episode": 878, "epoch": 0.1648826291079812, "eps": 0, "loss/policy_avg": -8.106231689453125e-06, "loss/value_avg": 0.039769262075424194, "lr": 3.69e-07, "objective/entropy": 5.810091972351074, "objective/kl": 28.735576629638672, "objective/non_score_reward": -1.4367789030075073, "objective/rlhf_reward": -10.660958290100098, "objective/scores": -9.2241792678833, "policy/approxkl_avg": 3.592768971216742e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.13728764653205872, "step": 878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000088214874268, "val/ratio_var": NaN }, { "episode": 879, "epoch": 0.16507042253521126, "eps": 0, "loss/policy_avg": 4.9127724196296185e-05, "loss/value_avg": 0.1571117788553238, "lr": 3.66e-07, "objective/entropy": 91.49909210205078, "objective/kl": 39.37042999267578, "objective/non_score_reward": -1.9685214757919312, "objective/rlhf_reward": -10.994367599487305, "objective/scores": -9.025846481323242, "policy/approxkl_avg": 6.895892568081763e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.032707929611206, "step": 879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000295639038086, "val/ratio_var": NaN }, { "episode": 880, "epoch": 0.1652582159624413, "eps": 0, "loss/policy_avg": -2.704476355575025e-05, "loss/value_avg": 0.12826189398765564, "lr": 3.63e-07, "objective/entropy": 55.849578857421875, "objective/kl": 15.277727127075195, "objective/non_score_reward": -0.7638862133026123, "objective/rlhf_reward": -10.759000778198242, "objective/scores": -9.99511432647705, "policy/approxkl_avg": 7.964695925011256e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5999271869659424, "step": 880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000276565551758, "val/ratio_var": NaN }, { "episode": 881, "epoch": 0.16544600938967136, "eps": 0, "loss/policy_avg": 6.446298357332125e-06, "loss/value_avg": 0.08315499871969223, "lr": 3.6e-07, "objective/entropy": 9.545026779174805, "objective/kl": 27.617380142211914, "objective/non_score_reward": -1.3808691501617432, "objective/rlhf_reward": -10.023024559020996, "objective/scores": -8.642155647277832, "policy/approxkl_avg": 7.489891373779756e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.22759895026683807, "step": 881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000146627426147, "val/ratio_var": NaN }, { "episode": 882, "epoch": 0.1656338028169014, "eps": 0, "loss/policy_avg": 1.626644552743528e-05, "loss/value_avg": 0.48274755477905273, "lr": 3.57e-07, "objective/entropy": 73.891845703125, "objective/kl": 29.22901153564453, "objective/non_score_reward": -1.4614505767822266, "objective/rlhf_reward": -12.142923355102539, "objective/scores": -10.681472778320312, "policy/approxkl_avg": 9.786698740299471e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.692686915397644, "step": 882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998303651809692, "val/ratio_var": NaN }, { "episode": 883, "epoch": 0.16582159624413145, "eps": 0, "loss/policy_avg": -1.260019689652836e-05, "loss/value_avg": 0.16629980504512787, "lr": 3.5399999999999997e-07, "objective/entropy": 8.039379119873047, "objective/kl": 27.615732192993164, "objective/non_score_reward": -1.3807865381240845, "objective/rlhf_reward": -9.913211822509766, "objective/scores": -8.532424926757812, "policy/approxkl_avg": 1.6532460955787087e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.1480989307165146, "step": 883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999913573265076, "val/ratio_var": NaN }, { "episode": 884, "epoch": 0.1660093896713615, "eps": 0, "loss/policy_avg": 3.074249980272725e-05, "loss/value_avg": 0.270263671875, "lr": 3.51e-07, "objective/entropy": 99.16259765625, "objective/kl": 22.608644485473633, "objective/non_score_reward": -1.1304322481155396, "objective/rlhf_reward": -11.36941909790039, "objective/scores": -10.23898696899414, "policy/approxkl_avg": 1.0746143175310863e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.866939902305603, "step": 884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998992681503296, "val/ratio_var": NaN }, { "episode": 885, "epoch": 0.16619718309859155, "eps": 0, "loss/policy_avg": 2.334702730877325e-05, "loss/value_avg": 0.566254198551178, "lr": 3.4800000000000005e-07, "objective/entropy": 112.55889892578125, "objective/kl": 31.528118133544922, "objective/non_score_reward": -1.5764060020446777, "objective/rlhf_reward": -13.46121597290039, "objective/scores": -11.884809494018555, "policy/approxkl_avg": 6.607525193658148e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9210758209228516, "step": 885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999880790710449, "val/ratio_var": NaN }, { "episode": 886, "epoch": 0.1663849765258216, "eps": 0, "loss/policy_avg": -3.103940002802119e-07, "loss/value_avg": 0.24192281067371368, "lr": 3.4500000000000003e-07, "objective/entropy": 90.29652404785156, "objective/kl": 25.795597076416016, "objective/non_score_reward": -1.2897799015045166, "objective/rlhf_reward": -9.70318603515625, "objective/scores": -8.413406372070312, "policy/approxkl_avg": 8.244933269452304e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7989110946655273, "step": 886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999512434005737, "val/ratio_var": NaN }, { "episode": 887, "epoch": 0.16657276995305165, "eps": 0, "loss/policy_avg": 2.9051079764030874e-05, "loss/value_avg": 0.41109582781791687, "lr": 3.42e-07, "objective/entropy": 104.39923095703125, "objective/kl": 40.2841796875, "objective/non_score_reward": -2.0142087936401367, "objective/rlhf_reward": -13.129408836364746, "objective/scores": -11.11520004272461, "policy/approxkl_avg": 1.2777971392097243e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2140469551086426, "step": 887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000157356262207, "val/ratio_var": NaN }, { "episode": 888, "epoch": 0.1667605633802817, "eps": 0, "loss/policy_avg": -2.9527916922234e-05, "loss/value_avg": 0.22219374775886536, "lr": 3.39e-07, "objective/entropy": 54.46760559082031, "objective/kl": 52.841312408447266, "objective/non_score_reward": -2.6420652866363525, "objective/rlhf_reward": -10.049214363098145, "objective/scores": -7.407149314880371, "policy/approxkl_avg": 7.47382102872507e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4119527339935303, "step": 888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999986469745636, "val/ratio_var": NaN }, { "episode": 889, "epoch": 0.16694835680751174, "eps": 0, "loss/policy_avg": 1.1516067388583906e-05, "loss/value_avg": 0.234320729970932, "lr": 3.36e-07, "objective/entropy": 100.44703674316406, "objective/kl": 26.724557876586914, "objective/non_score_reward": -1.3362280130386353, "objective/rlhf_reward": -11.985616683959961, "objective/scores": -10.649388313293457, "policy/approxkl_avg": 1.612652056337538e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9848471879959106, "step": 889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999326467514038, "val/ratio_var": NaN }, { "episode": 890, "epoch": 0.1671361502347418, "eps": 0, "loss/policy_avg": 5.038279482505459e-07, "loss/value_avg": 0.06906334310770035, "lr": 3.3300000000000003e-07, "objective/entropy": 4.113088130950928, "objective/kl": 21.079818725585938, "objective/non_score_reward": -1.0539910793304443, "objective/rlhf_reward": -10.254669189453125, "objective/scores": -9.200677871704102, "policy/approxkl_avg": 5.464832475432502e-10, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.11958284676074982, "step": 890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999952912330627, "val/ratio_var": NaN }, { "episode": 891, "epoch": 0.16732394366197184, "eps": 0, "loss/policy_avg": 4.712590452982113e-05, "loss/value_avg": 0.3711980879306793, "lr": 3.3e-07, "objective/entropy": 119.65919494628906, "objective/kl": 64.18376922607422, "objective/non_score_reward": -3.209188938140869, "objective/rlhf_reward": -12.865226745605469, "objective/scores": -9.656037330627441, "policy/approxkl_avg": 6.876296509972235e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.234738826751709, "step": 891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999667406082153, "val/ratio_var": NaN }, { "episode": 892, "epoch": 0.1675117370892019, "eps": 0, "loss/policy_avg": 1.3873262105335016e-05, "loss/value_avg": 0.6189697980880737, "lr": 3.27e-07, "objective/entropy": 57.53052520751953, "objective/kl": 67.37770080566406, "objective/non_score_reward": -3.3688855171203613, "objective/rlhf_reward": -11.547542572021484, "objective/scores": -8.178656578063965, "policy/approxkl_avg": 4.845772494377343e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0056520700454712, "step": 892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000012755393982, "val/ratio_var": NaN }, { "episode": 893, "epoch": 0.16769953051643194, "eps": 0, "loss/policy_avg": 1.1880442798428703e-05, "loss/value_avg": 0.12525348365306854, "lr": 3.24e-07, "objective/entropy": 77.98744201660156, "objective/kl": 25.418481826782227, "objective/non_score_reward": -1.2709242105484009, "objective/rlhf_reward": -11.361616134643555, "objective/scores": -10.090691566467285, "policy/approxkl_avg": 4.1421753138592976e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6127492189407349, "step": 893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999884963035583, "val/ratio_var": NaN }, { "episode": 894, "epoch": 0.16788732394366196, "eps": 0, "loss/policy_avg": 2.5911151169566438e-05, "loss/value_avg": 0.2082015722990036, "lr": 3.21e-07, "objective/entropy": 114.84100341796875, "objective/kl": 22.446592330932617, "objective/non_score_reward": -1.1223297119140625, "objective/rlhf_reward": -11.88166332244873, "objective/scores": -10.759333610534668, "policy/approxkl_avg": 1.4665083369891363e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5279219150543213, "step": 894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999639391899109, "val/ratio_var": NaN }, { "episode": 895, "epoch": 0.168075117370892, "eps": 0, "loss/policy_avg": 5.536709068110213e-05, "loss/value_avg": 0.521860659122467, "lr": 3.18e-07, "objective/entropy": 110.87748718261719, "objective/kl": 66.10547637939453, "objective/non_score_reward": -3.30527400970459, "objective/rlhf_reward": -14.669912338256836, "objective/scores": -11.364638328552246, "policy/approxkl_avg": 7.969933335516544e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8117883205413818, "step": 895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000213384628296, "val/ratio_var": NaN }, { "episode": 896, "epoch": 0.16826291079812206, "eps": 0, "loss/policy_avg": -6.577653402928263e-05, "loss/value_avg": 0.6738947629928589, "lr": 3.15e-07, "objective/entropy": 74.09831237792969, "objective/kl": 74.36154174804688, "objective/non_score_reward": -3.7180771827697754, "objective/rlhf_reward": -13.16253662109375, "objective/scores": -9.444459915161133, "policy/approxkl_avg": 1.1760758411583083e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4062845706939697, "step": 896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0001226663589478, "val/ratio_var": NaN }, { "episode": 897, "epoch": 0.1684507042253521, "eps": 0, "loss/policy_avg": 2.323006629012525e-05, "loss/value_avg": 0.12312526255846024, "lr": 3.12e-07, "objective/entropy": 94.30950927734375, "objective/kl": 36.369510650634766, "objective/non_score_reward": -1.8184754848480225, "objective/rlhf_reward": -10.890999794006348, "objective/scores": -9.072524070739746, "policy/approxkl_avg": 9.52562189127093e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9709242582321167, "step": 897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999527931213379, "val/ratio_var": NaN }, { "episode": 898, "epoch": 0.16863849765258215, "eps": 0, "loss/policy_avg": -4.8021101974882185e-05, "loss/value_avg": 0.16710573434829712, "lr": 3.09e-07, "objective/entropy": 98.95502471923828, "objective/kl": 24.956241607666016, "objective/non_score_reward": -1.2478119134902954, "objective/rlhf_reward": -11.090417861938477, "objective/scores": -9.842605590820312, "policy/approxkl_avg": 7.356484132969854e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.74639093875885, "step": 898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000042915344238, "val/ratio_var": NaN }, { "episode": 899, "epoch": 0.1688262910798122, "eps": 0, "loss/policy_avg": -4.448980689630844e-06, "loss/value_avg": 0.2910092771053314, "lr": 3.06e-07, "objective/entropy": 113.56355285644531, "objective/kl": 47.272544860839844, "objective/non_score_reward": -2.3636271953582764, "objective/rlhf_reward": -12.354471206665039, "objective/scores": -9.990843772888184, "policy/approxkl_avg": 9.542054613120854e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1636409759521484, "step": 899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998300671577454, "val/ratio_var": NaN }, { "episode": 900, "epoch": 0.16901408450704225, "eps": 0, "loss/policy_avg": 3.1903105991659686e-05, "loss/value_avg": 0.471824586391449, "lr": 3.0300000000000005e-07, "objective/entropy": 3.7358810901641846, "objective/kl": 30.014820098876953, "objective/non_score_reward": -1.500740885734558, "objective/rlhf_reward": -11.291097640991211, "objective/scores": -9.790356636047363, "policy/approxkl_avg": 7.388994749391031e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.10442078858613968, "step": 900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999839067459106, "val/ratio_var": NaN }, { "episode": 901, "epoch": 0.1692018779342723, "eps": 0, "loss/policy_avg": 8.529087608621921e-06, "loss/value_avg": 0.14813607931137085, "lr": 3.0000000000000004e-07, "objective/entropy": 97.56294250488281, "objective/kl": 36.81226348876953, "objective/non_score_reward": -1.8406133651733398, "objective/rlhf_reward": -11.909440040588379, "objective/scores": -10.068826675415039, "policy/approxkl_avg": 6.004351149613285e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6259881258010864, "step": 901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999977707862854, "val/ratio_var": NaN }, { "episode": 902, "epoch": 0.16938967136150235, "eps": 0, "loss/policy_avg": -1.9104974853689782e-05, "loss/value_avg": 0.25688594579696655, "lr": 2.97e-07, "objective/entropy": 73.55294799804688, "objective/kl": 39.669822692871094, "objective/non_score_reward": -1.98349130153656, "objective/rlhf_reward": -10.92428207397461, "objective/scores": -8.940791130065918, "policy/approxkl_avg": 6.663178453436558e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5148742198944092, "step": 902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999787211418152, "val/ratio_var": NaN }, { "episode": 903, "epoch": 0.1695774647887324, "eps": 0, "loss/policy_avg": -4.131389141548425e-05, "loss/value_avg": 0.09615745395421982, "lr": 2.94e-07, "objective/entropy": 26.135555267333984, "objective/kl": 39.85887908935547, "objective/non_score_reward": -1.9929442405700684, "objective/rlhf_reward": -11.09640884399414, "objective/scores": -9.10346508026123, "policy/approxkl_avg": 4.418524213178898e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4358290135860443, "step": 903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000388622283936, "val/ratio_var": NaN }, { "episode": 904, "epoch": 0.16976525821596244, "eps": 0, "loss/policy_avg": -1.515982330602128e-05, "loss/value_avg": 0.29353195428848267, "lr": 2.91e-07, "objective/entropy": 100.11473083496094, "objective/kl": 50.27587890625, "objective/non_score_reward": -2.5137939453125, "objective/rlhf_reward": -11.437036514282227, "objective/scores": -8.923242568969727, "policy/approxkl_avg": 7.410775992866547e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1056969165802, "step": 904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999406933784485, "val/ratio_var": NaN }, { "episode": 905, "epoch": 0.1699530516431925, "eps": 0, "loss/policy_avg": -5.1804308895953e-05, "loss/value_avg": 0.1619727462530136, "lr": 2.8800000000000004e-07, "objective/entropy": 101.95097351074219, "objective/kl": 26.620508193969727, "objective/non_score_reward": -1.33102548122406, "objective/rlhf_reward": -11.339380264282227, "objective/scores": -10.008355140686035, "policy/approxkl_avg": 1.1014768830364119e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.619513750076294, "step": 905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999071955680847, "val/ratio_var": NaN }, { "episode": 906, "epoch": 0.17014084507042254, "eps": 0, "loss/policy_avg": 4.563241600408219e-05, "loss/value_avg": 0.20036855340003967, "lr": 2.85e-07, "objective/entropy": 78.20149230957031, "objective/kl": 33.519805908203125, "objective/non_score_reward": -1.675990343093872, "objective/rlhf_reward": -11.139666557312012, "objective/scores": -9.463676452636719, "policy/approxkl_avg": 5.418516835220544e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7833367586135864, "step": 906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999984502792358, "val/ratio_var": NaN }, { "episode": 907, "epoch": 0.1703286384976526, "eps": 0, "loss/policy_avg": 1.3495391470996765e-08, "loss/value_avg": 0.13937059044837952, "lr": 2.82e-07, "objective/entropy": 21.132232666015625, "objective/kl": 36.023841857910156, "objective/non_score_reward": -1.801192045211792, "objective/rlhf_reward": -11.84951400756836, "objective/scores": -10.048321723937988, "policy/approxkl_avg": 3.200227993715998e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5142914652824402, "step": 907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000171661376953, "val/ratio_var": NaN }, { "episode": 908, "epoch": 0.17051643192488264, "eps": 0, "loss/policy_avg": -3.733725009169575e-07, "loss/value_avg": 0.13364584743976593, "lr": 2.79e-07, "objective/entropy": 132.04473876953125, "objective/kl": 25.382648468017578, "objective/non_score_reward": -1.2691324949264526, "objective/rlhf_reward": -12.807353019714355, "objective/scores": -11.538220405578613, "policy/approxkl_avg": 7.904458954044458e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4368066787719727, "step": 908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.999869167804718, "val/ratio_var": NaN }, { "episode": 909, "epoch": 0.1707042253521127, "eps": 0, "loss/policy_avg": -6.684716936433688e-06, "loss/value_avg": 0.13645675778388977, "lr": 2.76e-07, "objective/entropy": 109.39717102050781, "objective/kl": 40.03447723388672, "objective/non_score_reward": -2.0017240047454834, "objective/rlhf_reward": -12.509763717651367, "objective/scores": -10.508039474487305, "policy/approxkl_avg": 1.0807126926692945e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7512184381484985, "step": 909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000216960906982, "val/ratio_var": NaN }, { "episode": 910, "epoch": 0.17089201877934274, "eps": 0, "loss/policy_avg": 3.5807770473184064e-06, "loss/value_avg": 0.2508305311203003, "lr": 2.73e-07, "objective/entropy": 111.9783935546875, "objective/kl": 53.81822967529297, "objective/non_score_reward": -2.690911293029785, "objective/rlhf_reward": -12.422808647155762, "objective/scores": -9.731897354125977, "policy/approxkl_avg": 8.706605569841486e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7253096103668213, "step": 910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000077486038208, "val/ratio_var": NaN }, { "episode": 911, "epoch": 0.17107981220657276, "eps": 0, "loss/policy_avg": 6.154798029456288e-05, "loss/value_avg": 0.23590973019599915, "lr": 2.7e-07, "objective/entropy": 126.1414566040039, "objective/kl": 49.89778137207031, "objective/non_score_reward": -2.494889259338379, "objective/rlhf_reward": -11.969632148742676, "objective/scores": -9.474742889404297, "policy/approxkl_avg": 1.1911544106624206e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0882015228271484, "step": 911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999524354934692, "val/ratio_var": NaN }, { "episode": 912, "epoch": 0.1712676056338028, "eps": 0, "loss/policy_avg": 3.1381281587528065e-05, "loss/value_avg": 0.1945875883102417, "lr": 2.67e-07, "objective/entropy": 95.54139709472656, "objective/kl": 20.995595932006836, "objective/non_score_reward": -1.0497797727584839, "objective/rlhf_reward": -11.479257583618164, "objective/scores": -10.42947769165039, "policy/approxkl_avg": 1.0355012847185208e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.766661524772644, "step": 912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999114871025085, "val/ratio_var": NaN }, { "episode": 913, "epoch": 0.17145539906103285, "eps": 0, "loss/policy_avg": 0.0001646257733227685, "loss/value_avg": 0.3490176200866699, "lr": 2.64e-07, "objective/entropy": 110.20828247070312, "objective/kl": 33.0790901184082, "objective/non_score_reward": -1.6539547443389893, "objective/rlhf_reward": -10.577651977539062, "objective/scores": -8.923697471618652, "policy/approxkl_avg": 2.2113897557574091e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.176626443862915, "step": 913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998992681503296, "val/ratio_var": NaN }, { "episode": 914, "epoch": 0.1716431924882629, "eps": 0, "loss/policy_avg": 9.161571506410837e-05, "loss/value_avg": 0.16726213693618774, "lr": 2.6099999999999997e-07, "objective/entropy": 74.37952423095703, "objective/kl": 42.042747497558594, "objective/non_score_reward": -2.102137327194214, "objective/rlhf_reward": -11.574546813964844, "objective/scores": -9.47240924835205, "policy/approxkl_avg": 5.7810986220374616e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5875060558319092, "step": 914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000041723251343, "val/ratio_var": NaN }, { "episode": 915, "epoch": 0.17183098591549295, "eps": 0, "loss/policy_avg": 1.194792002934264e-05, "loss/value_avg": 0.14791028201580048, "lr": 2.58e-07, "objective/entropy": 65.57174682617188, "objective/kl": 37.807491302490234, "objective/non_score_reward": -1.8903744220733643, "objective/rlhf_reward": -11.736533164978027, "objective/scores": -9.846158981323242, "policy/approxkl_avg": 6.612228276026144e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5965415239334106, "step": 915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000395774841309, "val/ratio_var": NaN }, { "episode": 916, "epoch": 0.172018779342723, "eps": 0, "loss/policy_avg": 2.323906301171519e-05, "loss/value_avg": 0.9218993186950684, "lr": 2.5500000000000005e-07, "objective/entropy": 21.655040740966797, "objective/kl": 45.80534362792969, "objective/non_score_reward": -2.290266990661621, "objective/rlhf_reward": -12.670103073120117, "objective/scores": -10.379836082458496, "policy/approxkl_avg": 2.0556564450657788e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5149778127670288, "step": 916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000072717666626, "val/ratio_var": NaN }, { "episode": 917, "epoch": 0.17220657276995305, "eps": 0, "loss/policy_avg": 4.290634751669131e-05, "loss/value_avg": 0.2492169737815857, "lr": 2.5200000000000003e-07, "objective/entropy": 102.55292510986328, "objective/kl": 34.2597770690918, "objective/non_score_reward": -1.7129887342453003, "objective/rlhf_reward": -11.895365715026855, "objective/scores": -10.182376861572266, "policy/approxkl_avg": 3.987588286236132e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5762180089950562, "step": 917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9998205304145813, "val/ratio_var": NaN }, { "episode": 918, "epoch": 0.1723943661971831, "eps": 0, "loss/policy_avg": 2.002716064453125e-05, "loss/value_avg": 0.3709317743778229, "lr": 2.49e-07, "objective/entropy": 87.57637023925781, "objective/kl": 17.89927101135254, "objective/non_score_reward": -0.8949635028839111, "objective/rlhf_reward": -12.995697021484375, "objective/scores": -12.100733757019043, "policy/approxkl_avg": 5.949882364575387e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4810491800308228, "step": 918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999110698699951, "val/ratio_var": NaN }, { "episode": 919, "epoch": 0.17258215962441315, "eps": 0, "loss/policy_avg": 5.215968849370256e-05, "loss/value_avg": 0.3811643421649933, "lr": 2.46e-07, "objective/entropy": 98.70429992675781, "objective/kl": 31.733245849609375, "objective/non_score_reward": -1.5866621732711792, "objective/rlhf_reward": -10.926448822021484, "objective/scores": -9.339786529541016, "policy/approxkl_avg": 1.0739744027432607e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2051610946655273, "step": 919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999952495098114, "val/ratio_var": NaN }, { "episode": 920, "epoch": 0.1727699530516432, "eps": 0, "loss/policy_avg": 1.6050518752308562e-05, "loss/value_avg": 0.5239989161491394, "lr": 2.43e-07, "objective/entropy": 39.86094665527344, "objective/kl": 62.7515869140625, "objective/non_score_reward": -3.1375792026519775, "objective/rlhf_reward": -12.494796752929688, "objective/scores": -9.357217788696289, "policy/approxkl_avg": 5.4904514712461605e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6028546094894409, "step": 920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999527335166931, "val/ratio_var": NaN }, { "episode": 921, "epoch": 0.17295774647887324, "eps": 0, "loss/policy_avg": 2.8790168471459765e-06, "loss/value_avg": 0.034022752195596695, "lr": 2.4000000000000003e-07, "objective/entropy": 4.984613418579102, "objective/kl": 28.548276901245117, "objective/non_score_reward": -1.4274139404296875, "objective/rlhf_reward": -10.758380889892578, "objective/scores": -9.33096694946289, "policy/approxkl_avg": 3.4835296869317744e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.17879386246204376, "step": 921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000149011611938, "val/ratio_var": NaN }, { "episode": 922, "epoch": 0.1731455399061033, "eps": 0, "loss/policy_avg": 6.366676097968593e-05, "loss/value_avg": 0.2378654181957245, "lr": 2.3700000000000002e-07, "objective/entropy": 86.14927673339844, "objective/kl": 30.692447662353516, "objective/non_score_reward": -1.5346224308013916, "objective/rlhf_reward": -10.932662010192871, "objective/scores": -9.398039817810059, "policy/approxkl_avg": 5.408728043221345e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5358333587646484, "step": 922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000907182693481, "val/ratio_var": NaN }, { "episode": 923, "epoch": 0.17333333333333334, "eps": 0, "loss/policy_avg": -6.4355022914242e-05, "loss/value_avg": 0.20199470221996307, "lr": 2.34e-07, "objective/entropy": 52.38569641113281, "objective/kl": 36.31978225708008, "objective/non_score_reward": -1.8159892559051514, "objective/rlhf_reward": -10.913402557373047, "objective/scores": -9.097413063049316, "policy/approxkl_avg": 5.9495235404938285e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.1045151948928833, "step": 923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999947190284729, "val/ratio_var": NaN }, { "episode": 924, "epoch": 0.1735211267605634, "eps": 0, "loss/policy_avg": 2.4687569748493843e-05, "loss/value_avg": 0.4468284249305725, "lr": 2.31e-07, "objective/entropy": 116.23290252685547, "objective/kl": 58.04553985595703, "objective/non_score_reward": -2.9022772312164307, "objective/rlhf_reward": -13.049997329711914, "objective/scores": -10.147720336914062, "policy/approxkl_avg": 7.228545939597097e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1039717197418213, "step": 924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000488758087158, "val/ratio_var": NaN }, { "episode": 925, "epoch": 0.17370892018779344, "eps": 0, "loss/policy_avg": -2.881266118492931e-05, "loss/value_avg": 0.26663315296173096, "lr": 2.28e-07, "objective/entropy": 13.551980972290039, "objective/kl": 51.49692916870117, "objective/non_score_reward": -2.5748465061187744, "objective/rlhf_reward": -12.784059524536133, "objective/scores": -10.209213256835938, "policy/approxkl_avg": 6.0822817893324554e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.25092360377311707, "step": 925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999801516532898, "val/ratio_var": NaN }, { "episode": 926, "epoch": 0.17389671361502348, "eps": 0, "loss/policy_avg": -8.097234967863187e-06, "loss/value_avg": 0.4962392747402191, "lr": 2.25e-07, "objective/entropy": 65.09076690673828, "objective/kl": 56.76751708984375, "objective/non_score_reward": -2.838376045227051, "objective/rlhf_reward": -10.494649887084961, "objective/scores": -7.656274318695068, "policy/approxkl_avg": 4.970275924165435e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.216681718826294, "step": 926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999847412109375, "val/ratio_var": NaN }, { "episode": 927, "epoch": 0.17408450704225353, "eps": 0, "loss/policy_avg": 4.255546627973672e-06, "loss/value_avg": 0.19138209521770477, "lr": 2.22e-07, "objective/entropy": 97.8451156616211, "objective/kl": 39.609867095947266, "objective/non_score_reward": -1.9804933071136475, "objective/rlhf_reward": -12.84495735168457, "objective/scores": -10.864463806152344, "policy/approxkl_avg": 9.275133550090686e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7505688667297363, "step": 927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000219345092773, "val/ratio_var": NaN }, { "episode": 928, "epoch": 0.17427230046948355, "eps": 0, "loss/policy_avg": 2.554227648943197e-05, "loss/value_avg": 0.13769620656967163, "lr": 2.19e-07, "objective/entropy": 93.52681732177734, "objective/kl": 35.006614685058594, "objective/non_score_reward": -1.7503310441970825, "objective/rlhf_reward": -11.972159385681152, "objective/scores": -10.22182846069336, "policy/approxkl_avg": 8.123132033688307e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6611328125, "step": 928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999459385871887, "val/ratio_var": NaN }, { "episode": 929, "epoch": 0.1744600938967136, "eps": 0, "loss/policy_avg": 3.406236646696925e-05, "loss/value_avg": 0.19197942316532135, "lr": 2.16e-07, "objective/entropy": 97.41806030273438, "objective/kl": 34.60809326171875, "objective/non_score_reward": -1.7304046154022217, "objective/rlhf_reward": -11.365640640258789, "objective/scores": -9.635235786437988, "policy/approxkl_avg": 7.132894097594544e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8916292190551758, "step": 929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.99992835521698, "val/ratio_var": NaN }, { "episode": 930, "epoch": 0.17464788732394365, "eps": 0, "loss/policy_avg": -1.8083824215864297e-06, "loss/value_avg": 0.8359845280647278, "lr": 2.13e-07, "objective/entropy": 47.34485626220703, "objective/kl": 63.05455780029297, "objective/non_score_reward": -3.152728319168091, "objective/rlhf_reward": -14.33662223815918, "objective/scores": -11.183894157409668, "policy/approxkl_avg": 8.878350143959324e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9629113674163818, "step": 930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000102519989014, "val/ratio_var": NaN }, { "episode": 931, "epoch": 0.1748356807511737, "eps": 0, "loss/policy_avg": 3.352255225763656e-05, "loss/value_avg": 0.2681039273738861, "lr": 2.1000000000000003e-07, "objective/entropy": 122.6598129272461, "objective/kl": 31.121015548706055, "objective/non_score_reward": -1.5560507774353027, "objective/rlhf_reward": -10.732685089111328, "objective/scores": -9.176634788513184, "policy/approxkl_avg": 9.831408220861704e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6990299224853516, "step": 931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000237226486206, "val/ratio_var": NaN }, { "episode": 932, "epoch": 0.17502347417840375, "eps": 0, "loss/policy_avg": -1.7737442249199376e-05, "loss/value_avg": 0.1798112392425537, "lr": 2.0700000000000001e-07, "objective/entropy": 102.11337280273438, "objective/kl": 38.87554168701172, "objective/non_score_reward": -1.943777084350586, "objective/rlhf_reward": -11.310952186584473, "objective/scores": -9.367175102233887, "policy/approxkl_avg": 8.877319146449736e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.069119453430176, "step": 932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999739527702332, "val/ratio_var": NaN }, { "episode": 933, "epoch": 0.1752112676056338, "eps": 0, "loss/policy_avg": -1.1673513654386625e-05, "loss/value_avg": 0.056674856692552567, "lr": 2.0400000000000003e-07, "objective/entropy": 4.105778694152832, "objective/kl": 20.847875595092773, "objective/non_score_reward": -1.042393684387207, "objective/rlhf_reward": -9.759858131408691, "objective/scores": -8.717464447021484, "policy/approxkl_avg": 8.834771136889685e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.11728884279727936, "step": 933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000190734863281, "val/ratio_var": NaN }, { "episode": 934, "epoch": 0.17539906103286385, "eps": 0, "loss/policy_avg": 3.815596937783994e-05, "loss/value_avg": 0.38080841302871704, "lr": 2.01e-07, "objective/entropy": 108.325439453125, "objective/kl": 21.987300872802734, "objective/non_score_reward": -1.0993651151657104, "objective/rlhf_reward": -10.07892894744873, "objective/scores": -8.97956371307373, "policy/approxkl_avg": 1.0919805504272517e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8837568759918213, "step": 934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000237226486206, "val/ratio_var": NaN }, { "episode": 935, "epoch": 0.1755868544600939, "eps": 0, "loss/policy_avg": -2.9024087780271657e-05, "loss/value_avg": 0.054385192692279816, "lr": 1.9800000000000003e-07, "objective/entropy": 25.45954132080078, "objective/kl": 38.87242126464844, "objective/non_score_reward": -1.9436209201812744, "objective/rlhf_reward": -11.504350662231445, "objective/scores": -9.56072998046875, "policy/approxkl_avg": 3.138217152809375e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.6443308591842651, "step": 935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999774098396301, "val/ratio_var": NaN }, { "episode": 936, "epoch": 0.17577464788732394, "eps": 0, "loss/policy_avg": -3.124745489913039e-05, "loss/value_avg": 0.5212452411651611, "lr": 1.95e-07, "objective/entropy": 133.07818603515625, "objective/kl": 36.92317581176758, "objective/non_score_reward": -1.8461589813232422, "objective/rlhf_reward": -11.903215408325195, "objective/scores": -10.057056427001953, "policy/approxkl_avg": 1.442736987655735e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3492777347564697, "step": 936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999477863311768, "val/ratio_var": NaN }, { "episode": 937, "epoch": 0.175962441314554, "eps": 0, "loss/policy_avg": 9.549338574288413e-05, "loss/value_avg": 0.48164990544319153, "lr": 1.92e-07, "objective/entropy": 85.08589935302734, "objective/kl": 23.181564331054688, "objective/non_score_reward": -1.1590782403945923, "objective/rlhf_reward": -9.212910652160645, "objective/scores": -8.053832054138184, "policy/approxkl_avg": 1.0864065558280345e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.967524766921997, "step": 937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999797344207764, "val/ratio_var": NaN }, { "episode": 938, "epoch": 0.17615023474178404, "eps": 0, "loss/policy_avg": -4.044118759338744e-05, "loss/value_avg": 0.1984962373971939, "lr": 1.89e-07, "objective/entropy": 111.6712417602539, "objective/kl": 20.238433837890625, "objective/non_score_reward": -1.0119216442108154, "objective/rlhf_reward": -11.094429969787598, "objective/scores": -10.082508087158203, "policy/approxkl_avg": 3.8270465552159294e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8711997270584106, "step": 938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00016450881958, "val/ratio_var": NaN }, { "episode": 939, "epoch": 0.1763380281690141, "eps": 0, "loss/policy_avg": 0.00011238511797273532, "loss/value_avg": 0.2259601503610611, "lr": 1.86e-07, "objective/entropy": 117.85758972167969, "objective/kl": 36.64963912963867, "objective/non_score_reward": -1.832481861114502, "objective/rlhf_reward": -12.010435104370117, "objective/scores": -10.177953720092773, "policy/approxkl_avg": 1.0501137381879744e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.20703125, "step": 939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999611377716064, "val/ratio_var": NaN }, { "episode": 940, "epoch": 0.17652582159624414, "eps": 0, "loss/policy_avg": -4.76837158203125e-06, "loss/value_avg": 1.1018317937850952, "lr": 1.83e-07, "objective/entropy": 110.2158203125, "objective/kl": 96.80085754394531, "objective/non_score_reward": -4.840043067932129, "objective/rlhf_reward": -15.024828910827637, "objective/scores": -10.184785842895508, "policy/approxkl_avg": 7.218141462317362e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6404430866241455, "step": 940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.00003182888031, "val/ratio_var": NaN }, { "episode": 941, "epoch": 0.17671361502347419, "eps": 0, "loss/policy_avg": -7.746354822302237e-05, "loss/value_avg": 0.16770002245903015, "lr": 1.8e-07, "objective/entropy": 70.58377075195312, "objective/kl": 29.65259552001953, "objective/non_score_reward": -1.4826298952102661, "objective/rlhf_reward": -12.291107177734375, "objective/scores": -10.808477401733398, "policy/approxkl_avg": 9.527315114610246e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7128100395202637, "step": 941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000886917114258, "val/ratio_var": NaN }, { "episode": 942, "epoch": 0.17690140845070423, "eps": 0, "loss/policy_avg": -9.358603711007163e-05, "loss/value_avg": 0.3882143199443817, "lr": 1.7699999999999998e-07, "objective/entropy": 85.24858856201172, "objective/kl": 47.64545440673828, "objective/non_score_reward": -2.382272720336914, "objective/rlhf_reward": -14.120450019836426, "objective/scores": -11.738177299499512, "policy/approxkl_avg": 1.122661430486005e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6357122659683228, "step": 942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999544024467468, "val/ratio_var": NaN }, { "episode": 943, "epoch": 0.17708920187793428, "eps": 0, "loss/policy_avg": -1.54477238538675e-05, "loss/value_avg": 0.11859798431396484, "lr": 1.7400000000000002e-07, "objective/entropy": 146.80343627929688, "objective/kl": 16.530088424682617, "objective/non_score_reward": -0.8265044689178467, "objective/rlhf_reward": -10.581040382385254, "objective/scores": -9.754535675048828, "policy/approxkl_avg": 1.1043764658325017e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.99309504032135, "step": 943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999647736549377, "val/ratio_var": NaN }, { "episode": 944, "epoch": 0.17727699530516433, "eps": 0, "loss/policy_avg": -7.525929959228961e-06, "loss/value_avg": 0.25311747193336487, "lr": 1.71e-07, "objective/entropy": 75.6336669921875, "objective/kl": 28.72989845275879, "objective/non_score_reward": -1.4364948272705078, "objective/rlhf_reward": -10.605718612670898, "objective/scores": -9.16922378540039, "policy/approxkl_avg": 5.584491447052642e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2962807416915894, "step": 944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000020980834961, "val/ratio_var": NaN }, { "episode": 945, "epoch": 0.17746478873239438, "eps": 0, "loss/policy_avg": -1.1426098353695124e-05, "loss/value_avg": 0.28750014305114746, "lr": 1.68e-07, "objective/entropy": 143.6572265625, "objective/kl": 38.500877380371094, "objective/non_score_reward": -1.9250437021255493, "objective/rlhf_reward": -10.147838592529297, "objective/scores": -8.222794532775879, "policy/approxkl_avg": 2.06222807719314e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.3166894912719727, "step": 945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999648928642273, "val/ratio_var": NaN }, { "episode": 946, "epoch": 0.1776525821596244, "eps": 0, "loss/policy_avg": 3.489008668111637e-05, "loss/value_avg": 0.6550703048706055, "lr": 1.65e-07, "objective/entropy": 101.2534408569336, "objective/kl": 78.27881622314453, "objective/non_score_reward": -3.9139413833618164, "objective/rlhf_reward": -12.743647575378418, "objective/scores": -8.829706192016602, "policy/approxkl_avg": 4.3253830739331534e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9970357418060303, "step": 946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.999996542930603, "val/ratio_var": NaN }, { "episode": 947, "epoch": 0.17784037558685445, "eps": 0, "loss/policy_avg": 5.695954678230919e-05, "loss/value_avg": 0.17584851384162903, "lr": 1.62e-07, "objective/entropy": 70.82171630859375, "objective/kl": 27.707984924316406, "objective/non_score_reward": -1.3853991031646729, "objective/rlhf_reward": -10.577509880065918, "objective/scores": -9.192111015319824, "policy/approxkl_avg": 9.357815144994674e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6277005672454834, "step": 947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000149011611938, "val/ratio_var": NaN }, { "episode": 948, "epoch": 0.1780281690140845, "eps": 0, "loss/policy_avg": -6.654127355432138e-05, "loss/value_avg": 0.19809460639953613, "lr": 1.59e-07, "objective/entropy": 191.44390869140625, "objective/kl": 60.460845947265625, "objective/non_score_reward": -3.0230422019958496, "objective/rlhf_reward": -13.018482208251953, "objective/scores": -9.995440483093262, "policy/approxkl_avg": 1.026182516739027e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.9801900386810303, "step": 948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000454187393188, "val/ratio_var": NaN }, { "episode": 949, "epoch": 0.17821596244131455, "eps": 0, "loss/policy_avg": -3.207404733984731e-05, "loss/value_avg": 0.0903264507651329, "lr": 1.56e-07, "objective/entropy": 128.59933471679688, "objective/kl": 33.29360580444336, "objective/non_score_reward": -1.6646803617477417, "objective/rlhf_reward": -12.593788146972656, "objective/scores": -10.929107666015625, "policy/approxkl_avg": 5.648255552159753e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2406327724456787, "step": 949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999578595161438, "val/ratio_var": NaN }, { "episode": 950, "epoch": 0.1784037558685446, "eps": 0, "loss/policy_avg": -1.2217827134008985e-05, "loss/value_avg": 1.973974585533142, "lr": 1.53e-07, "objective/entropy": 89.62750244140625, "objective/kl": 79.73388671875, "objective/non_score_reward": -3.986694097518921, "objective/rlhf_reward": -16.107900619506836, "objective/scores": -12.121207237243652, "policy/approxkl_avg": 9.827148517160822e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3919677734375, "step": 950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998998045921326, "val/ratio_var": NaN }, { "episode": 951, "epoch": 0.17859154929577464, "eps": 0, "loss/policy_avg": -3.243842365918681e-05, "loss/value_avg": 0.5327892899513245, "lr": 1.5000000000000002e-07, "objective/entropy": 85.49566650390625, "objective/kl": 40.19312286376953, "objective/non_score_reward": -2.0096561908721924, "objective/rlhf_reward": -10.225129127502441, "objective/scores": -8.215473175048828, "policy/approxkl_avg": 5.127613533773001e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8910925388336182, "step": 951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999962449073792, "val/ratio_var": NaN }, { "episode": 952, "epoch": 0.1787793427230047, "eps": 0, "loss/policy_avg": 3.0148703444865532e-05, "loss/value_avg": 0.3008478879928589, "lr": 1.47e-07, "objective/entropy": 92.78604888916016, "objective/kl": 39.36948776245117, "objective/non_score_reward": -1.9684743881225586, "objective/rlhf_reward": -10.756299018859863, "objective/scores": -8.787824630737305, "policy/approxkl_avg": 9.709000181601368e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8456156253814697, "step": 952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998658299446106, "val/ratio_var": NaN }, { "episode": 953, "epoch": 0.17896713615023474, "eps": 0, "loss/policy_avg": 1.099424571293639e-05, "loss/value_avg": 0.33867883682250977, "lr": 1.4400000000000002e-07, "objective/entropy": 23.19976806640625, "objective/kl": 57.55624008178711, "objective/non_score_reward": -2.877811908721924, "objective/rlhf_reward": -12.101253509521484, "objective/scores": -9.223442077636719, "policy/approxkl_avg": 2.7015554238118966e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4691968262195587, "step": 953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999992251396179, "val/ratio_var": NaN }, { "episode": 954, "epoch": 0.1791549295774648, "eps": 0, "loss/policy_avg": -3.193909378751414e-06, "loss/value_avg": 0.06148446351289749, "lr": 1.41e-07, "objective/entropy": 23.77320098876953, "objective/kl": 41.288917541503906, "objective/non_score_reward": -2.064445734024048, "objective/rlhf_reward": -11.20981216430664, "objective/scores": -9.145366668701172, "policy/approxkl_avg": 2.7022009518873347e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.5601415038108826, "step": 954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000274181365967, "val/ratio_var": NaN }, { "episode": 955, "epoch": 0.17934272300469484, "eps": 0, "loss/policy_avg": 5.749036790803075e-05, "loss/value_avg": 0.16060598194599152, "lr": 1.38e-07, "objective/entropy": 102.73761749267578, "objective/kl": 26.00151824951172, "objective/non_score_reward": -1.3000760078430176, "objective/rlhf_reward": -11.778770446777344, "objective/scores": -10.478693962097168, "policy/approxkl_avg": 1.4971664086260716e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.8629887104034424, "step": 955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999476075172424, "val/ratio_var": NaN }, { "episode": 956, "epoch": 0.17953051643192489, "eps": 0, "loss/policy_avg": 6.625337118748575e-05, "loss/value_avg": 0.16585221886634827, "lr": 1.35e-07, "objective/entropy": 17.919429779052734, "objective/kl": 37.46044921875, "objective/non_score_reward": -1.8730225563049316, "objective/rlhf_reward": -10.375505447387695, "objective/scores": -8.502482414245605, "policy/approxkl_avg": 3.2592748056003984e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.393508642911911, "step": 956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000219345092773, "val/ratio_var": NaN }, { "episode": 957, "epoch": 0.17971830985915493, "eps": 0, "loss/policy_avg": 5.923127173446119e-05, "loss/value_avg": 0.24836216866970062, "lr": 1.32e-07, "objective/entropy": 111.2071533203125, "objective/kl": 47.86920166015625, "objective/non_score_reward": -2.393460273742676, "objective/rlhf_reward": -12.864115715026855, "objective/scores": -10.47065544128418, "policy/approxkl_avg": 1.5412535958603257e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.273911952972412, "step": 957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000437498092651, "val/ratio_var": NaN }, { "episode": 958, "epoch": 0.17990610328638498, "eps": 0, "loss/policy_avg": -5.9721605794038624e-05, "loss/value_avg": 0.7753437757492065, "lr": 1.29e-07, "objective/entropy": 105.02678680419922, "objective/kl": 35.1563606262207, "objective/non_score_reward": -1.7578179836273193, "objective/rlhf_reward": -9.279748916625977, "objective/scores": -7.521930694580078, "policy/approxkl_avg": 9.098680919805702e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0647940635681152, "step": 958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999968945980072, "val/ratio_var": NaN }, { "episode": 959, "epoch": 0.18009389671361503, "eps": 0, "loss/policy_avg": -4.4084943510824814e-05, "loss/value_avg": 0.7408401966094971, "lr": 1.2600000000000002e-07, "objective/entropy": 77.03816223144531, "objective/kl": 32.1870002746582, "objective/non_score_reward": -1.6093500852584839, "objective/rlhf_reward": -11.461925506591797, "objective/scores": -9.852575302124023, "policy/approxkl_avg": 7.566179505147375e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7286146879196167, "step": 959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999766945838928, "val/ratio_var": NaN }, { "episode": 960, "epoch": 0.18028169014084508, "eps": 0, "loss/policy_avg": 2.680184661585372e-05, "loss/value_avg": 0.11284880340099335, "lr": 1.23e-07, "objective/entropy": 11.451347351074219, "objective/kl": 25.071632385253906, "objective/non_score_reward": -1.2535816431045532, "objective/rlhf_reward": -10.12872314453125, "objective/scores": -8.875141143798828, "policy/approxkl_avg": 3.185452124299104e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.23291419446468353, "step": 960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999997079372406, "val/ratio_var": NaN }, { "episode": 961, "epoch": 0.18046948356807513, "eps": 0, "loss/policy_avg": 9.055407645064406e-06, "loss/value_avg": 3.974461317062378, "lr": 1.2000000000000002e-07, "objective/entropy": 60.79718780517578, "objective/kl": 75.06784057617188, "objective/non_score_reward": -3.753392219543457, "objective/rlhf_reward": -18.073917388916016, "objective/scores": -14.320526123046875, "policy/approxkl_avg": 3.548656479779311e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2930033206939697, "step": 961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999792575836182, "val/ratio_var": NaN }, { "episode": 962, "epoch": 0.18065727699530518, "eps": 0, "loss/policy_avg": -5.475529906107113e-05, "loss/value_avg": 0.2248333990573883, "lr": 1.17e-07, "objective/entropy": 96.51241302490234, "objective/kl": 42.07085037231445, "objective/non_score_reward": -2.1035423278808594, "objective/rlhf_reward": -12.31918716430664, "objective/scores": -10.215644836425781, "policy/approxkl_avg": 5.562621652188682e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.1100521087646484, "step": 962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999808669090271, "val/ratio_var": NaN }, { "episode": 963, "epoch": 0.1808450704225352, "eps": 0, "loss/policy_avg": 5.5187152611324564e-05, "loss/value_avg": 3.8282370567321777, "lr": 1.14e-07, "objective/entropy": 29.964988708496094, "objective/kl": 131.63938903808594, "objective/non_score_reward": -6.581969738006592, "objective/rlhf_reward": -19.369674682617188, "objective/scores": -12.787705421447754, "policy/approxkl_avg": 8.045617505558766e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7259752154350281, "step": 963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999967813491821, "val/ratio_var": NaN }, { "episode": 964, "epoch": 0.18103286384976525, "eps": 0, "loss/policy_avg": -2.709874570427928e-05, "loss/value_avg": 0.016953120008111, "lr": 1.11e-07, "objective/entropy": 26.332984924316406, "objective/kl": 32.89926528930664, "objective/non_score_reward": -1.644963264465332, "objective/rlhf_reward": -11.328191757202148, "objective/scores": -9.683228492736816, "policy/approxkl_avg": 1.8539916268878187e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4630219042301178, "step": 964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999887943267822, "val/ratio_var": NaN }, { "episode": 965, "epoch": 0.1812206572769953, "eps": 0, "loss/policy_avg": 0.00025327701587229967, "loss/value_avg": 0.2034572809934616, "lr": 1.08e-07, "objective/entropy": 106.12743377685547, "objective/kl": 37.02515411376953, "objective/non_score_reward": -1.851257562637329, "objective/rlhf_reward": -12.891807556152344, "objective/scores": -11.040550231933594, "policy/approxkl_avg": 2.1259407390061824e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.849740743637085, "step": 965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999464750289917, "val/ratio_var": NaN }, { "episode": 966, "epoch": 0.18140845070422534, "eps": 0, "loss/policy_avg": -4.272640944691375e-05, "loss/value_avg": 0.20036444067955017, "lr": 1.0500000000000001e-07, "objective/entropy": 121.9717788696289, "objective/kl": 22.431259155273438, "objective/non_score_reward": -1.1215629577636719, "objective/rlhf_reward": -11.835363388061523, "objective/scores": -10.713800430297852, "policy/approxkl_avg": 7.946497504462968e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.6462218761444092, "step": 966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999793767929077, "val/ratio_var": NaN }, { "episode": 967, "epoch": 0.1815962441314554, "eps": 0, "loss/policy_avg": -0.00012074326514266431, "loss/value_avg": 0.03555906191468239, "lr": 1.0200000000000001e-07, "objective/entropy": 32.16166305541992, "objective/kl": 13.533589363098145, "objective/non_score_reward": -0.6766794919967651, "objective/rlhf_reward": -11.610528945922852, "objective/scores": -10.933849334716797, "policy/approxkl_avg": 3.7452309697982855e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.9117984771728516, "step": 967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999752640724182, "val/ratio_var": NaN }, { "episode": 968, "epoch": 0.18178403755868544, "eps": 0, "loss/policy_avg": -5.488125680130906e-06, "loss/value_avg": 0.28643888235092163, "lr": 9.900000000000001e-08, "objective/entropy": 83.12531280517578, "objective/kl": 31.298959732055664, "objective/non_score_reward": -1.5649480819702148, "objective/rlhf_reward": -10.518560409545898, "objective/scores": -8.953612327575684, "policy/approxkl_avg": 7.971396343009474e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9004735946655273, "step": 968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.000000238418579, "val/ratio_var": NaN }, { "episode": 969, "epoch": 0.1819718309859155, "eps": 0, "loss/policy_avg": -6.0864214901812375e-05, "loss/value_avg": 0.07400896400213242, "lr": 9.6e-08, "objective/entropy": 65.97218322753906, "objective/kl": 49.093196868896484, "objective/non_score_reward": -2.454659938812256, "objective/rlhf_reward": -11.807079315185547, "objective/scores": -9.352418899536133, "policy/approxkl_avg": 3.901711309595157e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.3380126953125, "step": 969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999352097511292, "val/ratio_var": NaN }, { "episode": 970, "epoch": 0.18215962441314554, "eps": 0, "loss/policy_avg": -9.199358828482218e-06, "loss/value_avg": 0.0399012565612793, "lr": 9.3e-08, "objective/entropy": 5.221830368041992, "objective/kl": 27.161865234375, "objective/non_score_reward": -1.3580933809280396, "objective/rlhf_reward": -10.985621452331543, "objective/scores": -9.627528190612793, "policy/approxkl_avg": 1.0206281375602089e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.17869481444358826, "step": 970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999920725822449, "val/ratio_var": NaN }, { "episode": 971, "epoch": 0.1823474178403756, "eps": 0, "loss/policy_avg": -1.0526405276323203e-05, "loss/value_avg": 0.7304164171218872, "lr": 9e-08, "objective/entropy": 26.360450744628906, "objective/kl": 48.23431396484375, "objective/non_score_reward": -2.4117159843444824, "objective/rlhf_reward": -11.948196411132812, "objective/scores": -9.536479949951172, "policy/approxkl_avg": 1.4353751076612298e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7476553320884705, "step": 971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999937415122986, "val/ratio_var": NaN }, { "episode": 972, "epoch": 0.18253521126760563, "eps": 0, "loss/policy_avg": -2.996876537508797e-05, "loss/value_avg": 0.10692588984966278, "lr": 8.700000000000001e-08, "objective/entropy": 8.82129955291748, "objective/kl": 33.40154266357422, "objective/non_score_reward": -1.6700772047042847, "objective/rlhf_reward": -10.122932434082031, "objective/scores": -8.452855110168457, "policy/approxkl_avg": 5.594378738038586e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.2895738184452057, "step": 972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999884963035583, "val/ratio_var": NaN }, { "episode": 973, "epoch": 0.18272300469483568, "eps": 0, "loss/policy_avg": 6.0099475376773626e-05, "loss/value_avg": 0.22116205096244812, "lr": 8.4e-08, "objective/entropy": 77.65829467773438, "objective/kl": 32.852272033691406, "objective/non_score_reward": -1.6426136493682861, "objective/rlhf_reward": -11.30057430267334, "objective/scores": -9.657960891723633, "policy/approxkl_avg": 2.1569734087734105e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7464162111282349, "step": 973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999912679195404, "val/ratio_var": NaN }, { "episode": 974, "epoch": 0.18291079812206573, "eps": 0, "loss/policy_avg": -1.680625973676797e-05, "loss/value_avg": 0.6279889941215515, "lr": 8.1e-08, "objective/entropy": 16.447805404663086, "objective/kl": 41.175254821777344, "objective/non_score_reward": -2.058762788772583, "objective/rlhf_reward": -10.561946868896484, "objective/scores": -8.50318431854248, "policy/approxkl_avg": 5.589609664014006e-09, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.3477552831172943, "step": 974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999992251396179, "val/ratio_var": NaN }, { "episode": 975, "epoch": 0.18309859154929578, "eps": 0, "loss/policy_avg": 9.473764293943532e-06, "loss/value_avg": 0.21188563108444214, "lr": 7.8e-08, "objective/entropy": 139.03842163085938, "objective/kl": 57.47703552246094, "objective/non_score_reward": -2.873851776123047, "objective/rlhf_reward": -12.723891258239746, "objective/scores": -9.8500394821167, "policy/approxkl_avg": 8.882262392262419e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.409451484680176, "step": 975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9999663233757019, "val/ratio_var": NaN }, { "episode": 976, "epoch": 0.18328638497652583, "eps": 0, "loss/policy_avg": 3.318066956126131e-05, "loss/value_avg": 0.5610944032669067, "lr": 7.500000000000001e-08, "objective/entropy": 99.35005950927734, "objective/kl": 26.30145835876465, "objective/non_score_reward": -1.3150728940963745, "objective/rlhf_reward": -9.484772682189941, "objective/scores": -8.169699668884277, "policy/approxkl_avg": 6.730335400106924e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.5928863286972046, "step": 976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999212622642517, "val/ratio_var": NaN }, { "episode": 977, "epoch": 0.18347417840375588, "eps": 0, "loss/policy_avg": -2.3814867745386437e-05, "loss/value_avg": 0.10445094853639603, "lr": 7.200000000000001e-08, "objective/entropy": 78.85806274414062, "objective/kl": 27.707630157470703, "objective/non_score_reward": -1.3853814601898193, "objective/rlhf_reward": -11.512476921081543, "objective/scores": -10.127095222473145, "policy/approxkl_avg": 5.009691506074887e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4372397661209106, "step": 977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999951720237732, "val/ratio_var": NaN }, { "episode": 978, "epoch": 0.18366197183098593, "eps": 0, "loss/policy_avg": -1.7460786693845876e-05, "loss/value_avg": 0.18678639829158783, "lr": 6.9e-08, "objective/entropy": 85.19758605957031, "objective/kl": 42.45833969116211, "objective/non_score_reward": -2.1229169368743896, "objective/rlhf_reward": -12.166364669799805, "objective/scores": -10.043447494506836, "policy/approxkl_avg": 5.717841844443683e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7744693756103516, "step": 978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000317096710205, "val/ratio_var": NaN }, { "episode": 979, "epoch": 0.18384976525821597, "eps": 0, "loss/policy_avg": 4.032423021271825e-05, "loss/value_avg": 2.80727219581604, "lr": 6.6e-08, "objective/entropy": 106.25469970703125, "objective/kl": 63.3036003112793, "objective/non_score_reward": -3.165179967880249, "objective/rlhf_reward": -8.61418628692627, "objective/scores": -5.449006080627441, "policy/approxkl_avg": 8.229832104689194e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9273877143859863, "step": 979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999535083770752, "val/ratio_var": NaN }, { "episode": 980, "epoch": 0.18403755868544602, "eps": 0, "loss/policy_avg": -0.00010691049101296812, "loss/value_avg": 0.4672190845012665, "lr": 6.300000000000001e-08, "objective/entropy": 61.64970397949219, "objective/kl": 39.04524230957031, "objective/non_score_reward": -1.9522624015808105, "objective/rlhf_reward": -13.478826522827148, "objective/scores": -11.526564598083496, "policy/approxkl_avg": 6.97575899266667e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.4199045896530151, "step": 980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000407695770264, "val/ratio_var": NaN }, { "episode": 981, "epoch": 0.18422535211267604, "eps": 0, "loss/policy_avg": 7.650537008885294e-05, "loss/value_avg": 0.5317828059196472, "lr": 6.000000000000001e-08, "objective/entropy": 81.4742202758789, "objective/kl": 57.137550354003906, "objective/non_score_reward": -2.856877565383911, "objective/rlhf_reward": -10.611292839050293, "objective/scores": -7.754415512084961, "policy/approxkl_avg": 1.6823828730139212e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.560899257659912, "step": 981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9998723864555359, "val/ratio_var": NaN }, { "episode": 982, "epoch": 0.1844131455399061, "eps": 0, "loss/policy_avg": 7.9964695032686e-05, "loss/value_avg": 0.08258043974637985, "lr": 5.7e-08, "objective/entropy": 145.00909423828125, "objective/kl": 21.409902572631836, "objective/non_score_reward": -1.0704952478408813, "objective/rlhf_reward": -11.663549423217773, "objective/scores": -10.593053817749023, "policy/approxkl_avg": 1.1569446911607884e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.4743998050689697, "step": 982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.00007963180542, "val/ratio_var": NaN }, { "episode": 983, "epoch": 0.18460093896713614, "eps": 0, "loss/policy_avg": 2.544331073295325e-05, "loss/value_avg": 0.3135547339916229, "lr": 5.4e-08, "objective/entropy": 106.3749008178711, "objective/kl": 53.93701171875, "objective/non_score_reward": -2.696850538253784, "objective/rlhf_reward": -12.7295503616333, "objective/scores": -10.032699584960938, "policy/approxkl_avg": 1.2587928210905375e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7198970317840576, "step": 983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999800324440002, "val/ratio_var": NaN }, { "episode": 984, "epoch": 0.1847887323943662, "eps": 0, "loss/policy_avg": -8.38513642520411e-06, "loss/value_avg": 0.28103581070899963, "lr": 5.100000000000001e-08, "objective/entropy": 119.26212310791016, "objective/kl": 70.58076477050781, "objective/non_score_reward": -3.529038429260254, "objective/rlhf_reward": -13.361041069030762, "objective/scores": -9.832002639770508, "policy/approxkl_avg": 7.449666128422905e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.055804491043091, "step": 984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999955952167511, "val/ratio_var": NaN }, { "episode": 985, "epoch": 0.18497652582159624, "eps": 0, "loss/policy_avg": 0.00010267743346048519, "loss/value_avg": 0.7072219848632812, "lr": 4.8e-08, "objective/entropy": 113.3726806640625, "objective/kl": 33.93988800048828, "objective/non_score_reward": -1.6969945430755615, "objective/rlhf_reward": -11.501801490783691, "objective/scores": -9.80480670928955, "policy/approxkl_avg": 8.900836689917924e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.063455820083618, "step": 985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999526739120483, "val/ratio_var": NaN }, { "episode": 986, "epoch": 0.1851643192488263, "eps": 0, "loss/policy_avg": -7.647387974429876e-06, "loss/value_avg": 0.49752482771873474, "lr": 4.5e-08, "objective/entropy": 94.80780792236328, "objective/kl": 67.16241455078125, "objective/non_score_reward": -3.3581204414367676, "objective/rlhf_reward": -13.649394989013672, "objective/scores": -10.291274070739746, "policy/approxkl_avg": 7.695010140196246e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.7321569919586182, "step": 986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.999938428401947, "val/ratio_var": NaN }, { "episode": 987, "epoch": 0.18535211267605634, "eps": 0, "loss/policy_avg": 7.890305823821109e-06, "loss/value_avg": 0.13835422694683075, "lr": 4.2e-08, "objective/entropy": 53.42512130737305, "objective/kl": 47.63087844848633, "objective/non_score_reward": -2.3815438747406006, "objective/rlhf_reward": -12.621194839477539, "objective/scores": -10.23965072631836, "policy/approxkl_avg": 3.031250983553946e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.114875078201294, "step": 987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999967217445374, "val/ratio_var": NaN }, { "episode": 988, "epoch": 0.18553990610328638, "eps": 0, "loss/policy_avg": 7.7895398135297e-05, "loss/value_avg": 0.9342927932739258, "lr": 3.9e-08, "objective/entropy": 90.97291564941406, "objective/kl": 47.696800231933594, "objective/non_score_reward": -2.3848397731781006, "objective/rlhf_reward": -10.132054328918457, "objective/scores": -7.747214317321777, "policy/approxkl_avg": 1.1943052413698751e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.960951328277588, "step": 988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.0000377893447876, "val/ratio_var": NaN }, { "episode": 989, "epoch": 0.18572769953051643, "eps": 0, "loss/policy_avg": 8.514242654200643e-05, "loss/value_avg": 0.29694506525993347, "lr": 3.6000000000000005e-08, "objective/entropy": 118.119384765625, "objective/kl": 35.59148406982422, "objective/non_score_reward": -1.7795743942260742, "objective/rlhf_reward": -10.727578163146973, "objective/scores": -8.948003768920898, "policy/approxkl_avg": 1.167271150848137e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2425377368927, "step": 989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0001529455184937, "val/ratio_var": NaN }, { "episode": 990, "epoch": 0.18591549295774648, "eps": 0, "loss/policy_avg": 1.851567685662303e-05, "loss/value_avg": 0.20265893638134003, "lr": 3.3e-08, "objective/entropy": 21.6502742767334, "objective/kl": 57.05150604248047, "objective/non_score_reward": -2.8525753021240234, "objective/rlhf_reward": -12.446228981018066, "objective/scores": -9.593653678894043, "policy/approxkl_avg": 6.143540076664067e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.37133559584617615, "step": 990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000251531600952, "val/ratio_var": NaN }, { "episode": 991, "epoch": 0.18610328638497653, "eps": 0, "loss/policy_avg": 3.489008668111637e-05, "loss/value_avg": 3.467355728149414, "lr": 3.0000000000000004e-08, "objective/entropy": 37.096534729003906, "objective/kl": 45.02491760253906, "objective/non_score_reward": -2.2512457370758057, "objective/rlhf_reward": -6.2476806640625, "objective/scores": -3.9964351654052734, "policy/approxkl_avg": 1.0365569380610395e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.013621211051941, "step": 991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000782012939453, "val/ratio_var": NaN }, { "episode": 992, "epoch": 0.18629107981220658, "eps": 0, "loss/policy_avg": -2.2492318407785206e-07, "loss/value_avg": 0.12221002578735352, "lr": 2.7e-08, "objective/entropy": 86.67166137695312, "objective/kl": 46.848350524902344, "objective/non_score_reward": -2.3424172401428223, "objective/rlhf_reward": -12.457124710083008, "objective/scores": -10.114707946777344, "policy/approxkl_avg": 8.368261461555448e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.044316053390503, "step": 992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9999915361404419, "val/ratio_var": NaN }, { "episode": 993, "epoch": 0.18647887323943663, "eps": 0, "loss/policy_avg": -1.2280805776754278e-06, "loss/value_avg": 0.5326202511787415, "lr": 2.4e-08, "objective/entropy": 97.4530258178711, "objective/kl": 46.69620132446289, "objective/non_score_reward": -2.334810256958008, "objective/rlhf_reward": -10.223193168640137, "objective/scores": -7.888382911682129, "policy/approxkl_avg": 9.703326497856324e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.0918545722961426, "step": 993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999801516532898, "val/ratio_var": NaN }, { "episode": 994, "epoch": 0.18666666666666668, "eps": 0, "loss/policy_avg": 5.676273940480314e-05, "loss/value_avg": 0.13131697475910187, "lr": 2.1e-08, "objective/entropy": 65.26976013183594, "objective/kl": 22.733779907226562, "objective/non_score_reward": -1.1366890668869019, "objective/rlhf_reward": -11.290173530578613, "objective/scores": -10.153484344482422, "policy/approxkl_avg": 5.161283667121097e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.2832053899765015, "step": 994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.000081181526184, "val/ratio_var": NaN }, { "episode": 995, "epoch": 0.18685446009389672, "eps": 0, "loss/policy_avg": -0.00010653711797203869, "loss/value_avg": 0.46525076031684875, "lr": 1.8000000000000002e-08, "objective/entropy": 140.0326690673828, "objective/kl": 66.7928466796875, "objective/non_score_reward": -3.3396425247192383, "objective/rlhf_reward": -14.00626277923584, "objective/scores": -10.666620254516602, "policy/approxkl_avg": 1.1535231436710092e-07, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.6046395301818848, "step": 995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.999871015548706, "val/ratio_var": NaN }, { "episode": 996, "epoch": 0.18704225352112677, "eps": 0, "loss/policy_avg": -2.4417660824838094e-05, "loss/value_avg": 0.7544089555740356, "lr": 1.5000000000000002e-08, "objective/entropy": 41.39933395385742, "objective/kl": 55.655677795410156, "objective/non_score_reward": -2.7827839851379395, "objective/rlhf_reward": -13.757291793823242, "objective/scores": -10.974508285522461, "policy/approxkl_avg": 2.5715719331742548e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.7063955664634705, "step": 996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.0000150203704834, "val/ratio_var": NaN }, { "episode": 997, "epoch": 0.18723004694835682, "eps": 0, "loss/policy_avg": -3.133180143777281e-05, "loss/value_avg": 0.7381324172019958, "lr": 1.2e-08, "objective/entropy": 111.71247863769531, "objective/kl": 57.902130126953125, "objective/non_score_reward": -2.895106554031372, "objective/rlhf_reward": -11.104738235473633, "objective/scores": -8.20963191986084, "policy/approxkl_avg": 9.195250783022857e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.187638282775879, "step": 997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.0000030994415283, "val/ratio_var": NaN }, { "episode": 998, "epoch": 0.18741784037558684, "eps": 0, "loss/policy_avg": -5.8614983572624624e-05, "loss/value_avg": 0.18362030386924744, "lr": 9.000000000000001e-09, "objective/entropy": 108.70333862304688, "objective/kl": 33.6229362487793, "objective/non_score_reward": -1.6811468601226807, "objective/rlhf_reward": -10.797221183776855, "objective/scores": -9.116074562072754, "policy/approxkl_avg": 8.888504510196071e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.9765809774398804, "step": 998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999818801879883, "val/ratio_var": NaN }, { "episode": 999, "epoch": 0.1876056338028169, "eps": 0, "loss/policy_avg": 9.896619985738653e-07, "loss/value_avg": 2.170288324356079, "lr": 6e-09, "objective/entropy": 50.885887145996094, "objective/kl": 89.29595947265625, "objective/non_score_reward": -4.464798450469971, "objective/rlhf_reward": -16.922555923461914, "objective/scores": -12.457757949829102, "policy/approxkl_avg": 8.021469000141224e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 1.0696929693222046, "step": 999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 0.9999644756317139, "val/ratio_var": NaN }, { "episode": 1000, "epoch": 0.18779342723004694, "eps": 0, "loss/policy_avg": -5.6356755521846935e-05, "loss/value_avg": 0.12751711905002594, "lr": 3e-09, "objective/entropy": 102.96729278564453, "objective/kl": 29.009939193725586, "objective/non_score_reward": -1.4504969120025635, "objective/rlhf_reward": -11.462577819824219, "objective/scores": -10.012081146240234, "policy/approxkl_avg": 8.965206887978638e-08, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 2.2162394523620605, "step": 1000, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 0.9999628663063049, "val/ratio_var": NaN } ], "logging_steps": 500, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 0.18779342723004694, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": true, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }