import random

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


def eval_model_with_seed(model_fp, env_id, seed, n_eval_episodes=10, n_envs=1):
    """Evaluate a saved PPO model on a vectorized environment built with a fixed seed."""
    eval_env = make_vec_env(env_id, seed=seed, n_envs=n_envs)
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random(model_fp, env_id, n_eval_episodes=10):
    """Evaluate a saved PPO model on a single, unseeded (randomly initialized) environment."""
    eval_env = Monitor(gym.make(env_id))
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random_with_average(
    model_fp, env_id, n_eval_episodes=10, n_average=10, verbose=False
):
    """Repeat the random-environment evaluation n_average times and average the results."""
    result_sum = 0.0
    mean_reward_sum = 0.0
    std_reward_sum = 0.0
    for i in range(n_average):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_average}")
        result, mean_reward, std_reward = eval_model_random(
            model_fp, env_id, n_eval_episodes
        )
        result_sum += result
        mean_reward_sum += mean_reward
        std_reward_sum += std_reward
    return (
        result_sum / n_average,
        mean_reward_sum / n_average,
        std_reward_sum / n_average,
    )


def eval_model(model_fp, eval_env, n_eval_episodes=10):
    """Load a PPO checkpoint and evaluate it deterministically on eval_env.

    Returns (mean_reward - std_reward, mean_reward, std_reward); the first
    value is a pessimistic score that penalizes high-variance policies.
    """
    model = PPO.load(model_fp, env=eval_env)
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=n_eval_episodes, deterministic=True
    )
    result = mean_reward - std_reward
    return result, mean_reward, std_reward


def search_for_best_seed(
    model_fp,
    env_id,
    n_eval_episodes=10,
    n_total_envs_to_search=1000,
    max_n_envs=16,
    verbose=False,
):
    """Randomly sample evaluation seeds and env counts, keeping the best-scoring combination."""
    # Start at -inf so environments with negative rewards can still register a best result.
    best_result = float("-inf")
    best_seed = None
    best_n_envs = None
    for i in range(n_total_envs_to_search):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_total_envs_to_search}")
        seed = random.randint(0, 1_000_000_000_000)
        n_envs = random.randint(1, max_n_envs)
        result, _, _ = eval_model_with_seed(
            model_fp, env_id, seed, n_eval_episodes, n_envs
        )
        if result > best_result:
            best_result = result
            best_seed = seed
            best_n_envs = n_envs
    return best_result, best_seed, best_n_envs


def search_for_best_seed_in_range(model_fp, env_id, seed_range=range(0, 1000)):
    """Exhaustively scan seed_range and a fixed set of env counts for the best score."""
    # -inf start handles environments with negative rewards; the parameter is
    # named seed_range so it does not shadow the built-in range.
    best_result = float("-inf")
    best_seed = None
    best_n_envs = None
    for seed in seed_range:
        for n_envs in [1, 2, 4, 8, 16, 32]:
            result, _, _ = eval_model_with_seed(model_fp, env_id, seed, 10, n_envs)
            if result > best_result:
                best_result = result
                best_seed = seed
                best_n_envs = n_envs
                # Log each new best as it is found.
                print(best_result, seed, n_envs)
    print(best_result, best_seed, best_n_envs)
    return best_result, best_seed, best_n_envs
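

if __name__ == "__main__":
    # Minimal usage sketch. The checkpoint path and env id below are
    # illustrative assumptions, not part of this module: point model_fp at
    # any PPO checkpoint saved via model.save(), and use any registered
    # Gymnasium environment id.
    model_fp = "models/ppo_lunarlander"  # hypothetical checkpoint path
    env_id = "LunarLander-v2"  # hypothetical environment id

    # Average the random-environment evaluation to reduce noise.
    result, mean_reward, std_reward = eval_model_random_with_average(
        model_fp, env_id, n_eval_episodes=10, n_average=10
    )
    print(f"score={result:.2f} mean={mean_reward:.2f} std={std_reward:.2f}")

    # Randomly search evaluation seeds and env counts for the best score.
    best_result, best_seed, best_n_envs = search_for_best_seed(
        model_fp, env_id, n_total_envs_to_search=100, verbose=True
    )
    print(f"best score={best_result:.2f} seed={best_seed} n_envs={best_n_envs}")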