"""Utilities for evaluating a saved PPO model and for searching evaluation
seeds / env counts that give the best (mean - std) episode reward."""

import random

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


def eval_model_with_seed(model_fp, env_id, seed, n_eval_episodes=10, n_envs=1):
    """Evaluate a saved model on a vectorised environment built with a fixed seed."""
    eval_env = make_vec_env(env_id, seed=seed, n_envs=n_envs)
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random(model_fp, env_id, n_eval_episodes=10):
    """Evaluate a saved model on a single, unseeded (randomly initialised) environment."""
    eval_env = Monitor(gym.make(env_id))
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random_with_average(
    model_fp, env_id, n_eval_episodes=10, n_average=10, verbose=False
):
    """Run eval_model_random n_average times and return the averaged metrics."""
    result_sum = 0
    mean_reward_sum = 0
    std_reward_sum = 0
    for i in range(n_average):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_average}")
        result, mean_reward, std_reward = eval_model_random(
            model_fp, env_id, n_eval_episodes
        )
        result_sum += result
        mean_reward_sum += mean_reward
        std_reward_sum += std_reward
    return (
        result_sum / n_average,
        mean_reward_sum / n_average,
        std_reward_sum / n_average,
    )


def eval_model(model_fp, eval_env, n_eval_episodes=10):
    """Load the PPO model from disk and score it as mean reward minus one std."""
    model = PPO.load(model_fp, env=eval_env)
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=n_eval_episodes, deterministic=True
    )
    result = mean_reward - std_reward
    return result, mean_reward, std_reward


def search_for_best_seed(
    model_fp,
    env_id,
    n_eval_episodes=10,
    n_total_envs_to_search=1000,
    max_n_envs=16,
    verbose=False,
):
    """Randomly sample (seed, n_envs) pairs and keep the best-scoring combination."""
    best_result = 0  # only configurations scoring above zero are recorded
    best_seed = 0
    best_n_envs = 0
    for i in range(n_total_envs_to_search):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_total_envs_to_search}")
        seed = random.randint(0, 1000000000000)
        n_envs = random.randint(1, max_n_envs)
        result, _, _ = eval_model_with_seed(
            model_fp, env_id, seed, n_eval_episodes, n_envs
        )
        if result > best_result:
            best_result = result
            best_seed = seed
            best_n_envs = n_envs
    return best_result, best_seed, best_n_envs


def search_for_best_seed_in_range(model_fp, env_id, seed_range=range(0, 1000)):
    """Exhaustively try every seed in seed_range with several env counts."""
    best_result = 0
    best_seed = 0
    best_n_envs = 0
    for seed in seed_range:  # renamed from `range` to avoid shadowing the builtin
        for n_envs in [1, 2, 4, 8, 16, 32]:
            result, _, _ = eval_model_with_seed(model_fp, env_id, seed, 10, n_envs)
            if result > best_result:
                best_result = result
                best_seed = seed
                best_n_envs = n_envs
                print(best_result, seed, n_envs)
    print(best_result, best_seed, best_n_envs)
    return best_result, best_seed, best_n_envs


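# Example usage (a minimal sketch): the model path and env id below are
# placeholders, assuming a PPO checkpoint previously saved with `model.save(...)`.
if __name__ == "__main__":
    MODEL_FP = "ppo_cartpole.zip"  # hypothetical checkpoint path
    ENV_ID = "CartPole-v1"  # hypothetical environment id

    # Average score over repeated unseeded evaluations.
    result, mean_reward, std_reward = eval_model_random_with_average(
        MODEL_FP, ENV_ID, n_eval_episodes=10, n_average=10, verbose=True
    )
    print(f"averaged: result={result:.2f} mean={mean_reward:.2f} std={std_reward:.2f}")

    # Randomly search for an evaluation seed / env count that scores best.
    best_result, best_seed, best_n_envs = search_for_best_seed(
        MODEL_FP, ENV_ID, n_total_envs_to_search=100, verbose=True
    )
    print(f"best: result={best_result:.2f} seed={best_seed} n_envs={best_n_envs}")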