#!/usr/bin/env python
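# SocialAISchool / data_visualize.py
#
# Plots training curves (read from per-run log.csv files) and evaluation
# curves (read from testing_<env>.pkl files) for SocialAI experiments stored
# under storage/.
#
# Usage:
#   python data_visualize.py <load_pattern> [top_n]
# <load_pattern> either selects one of the predefined figure configurations
# below (e.g. "pointing", "scaffolding", "adversarial") or is matched against
# experiment directory names; top_n (default 8) keeps only the top_n
# longest-running seeds per experiment.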
import time
# import seaborn
import numpy as np
import os
from collections import OrderedDict, defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import sys
from termcolor import cprint, colored
from pathlib import Path
import pickle
from scipy import stats
save = True
show_plot = False
metrics = [
'success_rate_mean',
# 'FPS',
# 'extrinsic_return_mean',
# 'exploration_bonus_mean',
# 'NPC_intro',
# 'curriculum_param_mean',
# 'curriculum_max_success_rate_mean',
# 'rreturn_mean'
]
eval_metric = "test_success_rates"
# eval_metric = "exploration_bonus_mean"
super_title = ""
# super_title = "PPO - No exploration bonus"
# super_title = "Count Based exploration bonus (Grid Search)"
# super_title = "PPO + RND"
# super_title = "PPO + RIDE"
# statistical evaluation p-value
test_p = 0.05
agg_title = ""
color_dict = None
eval_filename = None
max_frames = 20_000_000
legend_show_n_seeds = False
draw_legend = True
per_seed = False
study_train = False
study_eval = True
plot_test = True
plot_aggregated_test = True
plot_only_aggregated_test = True
xnbins = 4
ynbins = 3
steps_denom = 1e6
# Global vars for tracking and labeling data at load time.
exp_idx = 0
label_parser_dict = None
label_parser = lambda label: label  # default identity; overridden per figure below
smooth_factor = 10  # running-mean window for training curves (0 disables smoothing)
# smooth_factor = 0
print("smooth factor:", smooth_factor)
eval_smooth_factor = None
leg_size = 30
def smooth(x_, n=50):
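    """Running mean over a trailing window of n points (identity if n is None)."""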
if n is None:
return x_
if type(x_) == list:
x_ = np.array(x_)
return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])
sort_test = False
def sort_test_set(env_name):
helps = [
"LanguageFeedback",
"LanguageColor",
"Pointing",
"Emulation",
]
problems = [
"Boxes",
"Switches",
"Generators",
"Marble",
"Doors",
"Levers",
]
env_names = []
for p in problems:
for h in helps:
env_names.append(h+p)
env_names.extend([
"LeverDoorColl",
"MarblePushColl",
"MarblePassColl",
"AppleStealing"
])
for i, en in enumerate(env_names):
if en in env_name:
return i
raise ValueError(f"Test env {env_name} not known")
subsample_step = 1
load_subsample_step = 1
x_lim = 0
max_x_lim = np.inf
summary_dict = {}
summary_dict_colors = {}
to_plot_dict = {}
default_colors_ = ["blue","orange","green","magenta", "brown", "red",'black',"grey",u'#ff7f0e',
"cyan", "pink",'purple', u'#1f77b4',
"darkorchid","sienna","lightpink", "indigo","mediumseagreen",'aqua',
'deeppink','silver','khaki','goldenrod'] * 100
def get_eval_data(logdir, eval_metric):
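    """Collect per-seed evaluation curves from testing_<env>.pkl files under logdir.

    Returns {test_env_name: {"values": (n_seeds, T) array, "steps": (n_seeds, T) array}},
    truncated to the shortest seed so the arrays are rectangular.
    """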
eval_data = defaultdict(lambda :defaultdict(list))
for root, _, files in os.walk(logdir):
for file in files:
if 'testing_' in file:
assert ".pkl" in file
                    # str.lstrip/rstrip strip character *sets*, not a prefix/suffix,
                    # so slice off the exact "testing_" prefix and ".pkl" extension
                    test_env_name = file[len("testing_"):-len(".pkl")]
try:
with open(root+"/"+file, "rb") as f:
seed_eval_data = pickle.load(f)
                except Exception as e:
                    print("Pickle not loaded:", root + "/" + file, "-", e)
time.sleep(1)
continue
eval_data[test_env_name]["values"].append(seed_eval_data[eval_metric])
eval_data[test_env_name]["steps"].append(seed_eval_data["test_step_nb"])
for test_env, seed_data in eval_data.items():
min_len_seed = min([len(s) for s in seed_data['steps']])
eval_data[test_env]["values"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["values"]])
eval_data[test_env]["steps"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["steps"]])
return eval_data
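# A minimal sketch of the testing pickle layout assumed by get_eval_data
# (inferred from the keys read above, not a documented format):
#
#   with open("testing_SocialAI-SomeEnv-v1.pkl", "rb") as f:  # hypothetical file
#       d = pickle.load(f)
#   d["test_success_rates"]  # one value per evaluation checkpoint
#   d["test_step_nb"]        # matching env-step counts (x axis)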
def get_all_runs(logdir, load_subsample_step=1):
"""
    Recursively look through logdir for training logs.
    Assumes that any directory containing a "log.csv" file is a valid run.
"""
    global exp_idx
datasets = []
for root, _, files in os.walk(logdir):
if 'log.csv' in files:
if (Path(root) / 'log.csv').stat().st_size == 0:
print("CSV {} empty".format(os.path.join(root, 'log.csv')))
continue
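            # root[8:] strips the leading "storage/" prefix from the run path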
run_name = root[8:]
exp_name = None
config = None
exp_idx += 1
# load progress data
try:
exp_data = pd.read_csv(os.path.join(root, 'log.csv'))
print("Loaded:", os.path.join(root, 'log.csv'))
            except Exception as e:
                raise ValueError("CSV {} faulty: {}".format(os.path.join(root, 'log.csv'), e))
exp_data = exp_data[::load_subsample_step]
data_dict = exp_data.to_dict("list")
data_dict['config'] = config
nb_epochs = len(data_dict['frames'])
if nb_epochs == 1:
print(f'{run_name} -> {colored(f"nb_epochs {nb_epochs}", "red")}')
else:
print('{} -> nb_epochs {}'.format(run_name, nb_epochs))
datasets.append(data_dict)
return datasets
def get_datasets(rootdir, load_only="", load_subsample_step=1, ignore_patterns=("ignore",), require_patterns=()):
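    """Load every experiment directory under rootdir whose name contains
    load_only and at least one of require_patterns (and none of the
    ignore_patterns), filling the global models_saves / model_eval_data dicts."""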
_, models_list, _ = next(os.walk(rootdir))
for dir_name in models_list.copy():
# add "ignore" in a directory name to avoid loading its content
for ignore_pattern in ignore_patterns:
if ignore_pattern in dir_name or load_only not in dir_name:
if dir_name in models_list:
models_list.remove(dir_name)
if len(require_patterns) > 0:
if not any([require_pattern in dir_name for require_pattern in require_patterns]):
if dir_name in models_list:
models_list.remove(dir_name)
for expe_name in list(labels.keys()):
if expe_name not in models_list:
del labels[expe_name]
# setting per-model type colors
for i, m_name in enumerate(models_list):
for m_type, m_color in per_model_colors.items():
if m_type in m_name:
colors[m_name] = m_color
print("extracting data for {}...".format(m_name))
m_id = m_name
models_saves[m_id] = OrderedDict()
models_saves[m_id]['data'] = get_all_runs(rootdir+m_name, load_subsample_step=load_subsample_step)
print("done")
if m_name not in labels:
labels[m_name] = m_name
model_eval_data[m_id] = get_eval_data(logdir=rootdir+m_name, eval_metric=eval_metric)
"""
retrieve all experiences located in "data to vizu" folder
"""
labels = OrderedDict()
per_model_colors = OrderedDict()
# LOAD DATA
models_saves = OrderedDict()
colors = OrderedDict()
model_eval_data = OrderedDict()
static_lines = {}
ignore_patterns = ["_ignore_"]
to_compare = None
load_pattern = sys.argv[1]
test_envs_to_plot = None # plot all
min_y, max_y = 0.0, 1.1
def label_parser(label):
label = label.replace("04-01_Pointing_CB_heldout_doors", "PPO_CB")
label = label.replace("19-01_Color_CB_heldout_doors", "PPO_CBL")
label = label.replace("19-01_Feedback_CB_heldout_doors_20M", "PPO_CBL")
label = label.replace("20-01_JA_Color_CB_heldout_doors", "JA_PPO_CBL")
label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_scaf")
label = label.replace("05-01_scaffolding_50M_acl_4_acl-type_intro_seq", "PPO_scaf_4")
label = label.replace("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf", "PPO_scaf_8")
label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_soc_exp", "PPO_CB_role_B")
label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial")
label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp", "PPO_CB_role_B")
label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial")
label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
"PPO_CB_0.25")
label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
"PPO_CB_0.5")
label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
"PPO_CB_1")
return label
color_dict = {
'PPO_CB': "blue",
'PPO_CB(train)': "blue",
"PPO_CB(test)": "orange",
'PPO_no_bonus': "orange",
'PPO_CBL': "blue",
'PPO_CBL(train)': "blue",
"PPO_CBL(test)": "orange",
'JA_PPO_CBL': "green",
"PPO_CB_role_B": "blue",
"PPO_CB_asocial": "orange",
'PPO_CB_0.25': "blue",
'PPO_CB_0.5': "green",
'PPO_CB_1': "orange",
}
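# Per-figure configuration, selected by the first CLI argument: each branch
# narrows require_patterns to the runs of one case study and sets the output
# path, plotting flags, labels and colors for that figure.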
if load_pattern == "RR_single":
save = False
show_plot = True
load_pattern = "_"
plot_path = "../case_studies_final_figures/RR_dummy_single"
require_patterns = [
"03-01_RR_ft_single_CB_marble_pass_A_asoc_contr",
"03-01_RR_ft_single_CB_marble_pass_A_soc_exp",
]
plot_aggregated_test = False
plot_only_aggregated_test = False
study_train = True
study_eval = False
elif load_pattern == "RR_group":
load_pattern = "_"
plot_path = "../case_studies_final_figures/RR_dummy_group"
require_patterns = [
"05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr",
"05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp",
]
plot_aggregated_test = False
plot_only_aggregated_test = False
study_train = True
study_eval = False
elif load_pattern == "scaffolding":
load_pattern = "_"
plot_path = "../case_studies_final_figures/Scaffolding_test"
require_patterns = [
"05-01_scaffolding_50M_no_acl",
"05-01_scaffolding_50M_acl_4_acl-type_intro_seq",
"05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf",
]
test_envs_to_plot = None # aggregate all of them
plot_aggregated_test = True
plot_only_aggregated_test = True
study_train = False
study_eval = True
to_compare = [
("05-01_scaffolding_50M_acl_4_acl-type_intro_seq_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"),
("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"),
]
elif load_pattern == "pointing":
study_train = True
study_eval = True
plot_aggregated_test = False
plot_only_aggregated_test = False
load_pattern = "_"
test_envs_to_plot = [
"SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1",
]
plot_path = "../case_studies_final_figures/Pointing_train_test"
require_patterns = [
"04-01_Pointing_CB_heldout_doors",
]
to_compare = [
("04-01_Pointing_CB_heldout_doors", "04-01_Pointing_CB_heldout_doors_SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1", "black")
]
elif load_pattern == "color":
study_train = True
study_eval = True
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = [
"SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1",
]
plot_path = "../case_studies_final_figures/Color_train_test"
require_patterns = [
"19-01_Color_CB_heldout_doors",
]
to_compare = [
("19-01_Color_CB_heldout_doors", "19-01_Color_CB_heldout_doors_SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1", "black")
]
elif load_pattern == "ja_color":
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/JA_Color_train"
require_patterns = [
"19-01_Color_CB_heldout_doors",
"20-01_JA_Color_CB_heldout_doors",
]
to_compare = [
("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
]
elif load_pattern == "feedback_per_seed":
study_train = True
study_eval = False
per_seed = True
draw_legend = False
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = [
"SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1",
]
plot_path = "../case_studies_final_figures/Feedback_train_per_seed"
require_patterns = [
"19-01_Feedback_CB_heldout_doors",
]
to_compare = None
elif load_pattern == "feedback":
study_train = True
study_eval = True
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = [
"SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1",
]
plot_path = "../case_studies_final_figures/Feedback_train_test"
require_patterns = [
"19-01_Feedback_CB_heldout_doors",
]
to_compare = [
("19-01_Feedback_CB_heldout_doors_20M", "19-01_Feedback_CB_heldout_doors_20M_SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1", "black")
]
elif load_pattern == "imitation_train":
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/Imitation_train"
require_patterns = [
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
]
# to_compare = [
# ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
# ]
to_compare = None
elif load_pattern == "imitation_train_intro":
metrics = ["NPC_intro"]
show_plot = False
save = True
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/Imitation_train_intro"
require_patterns = [
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
]
# to_compare = [
# ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
# ]
to_compare = None
elif load_pattern == "imitation_test":
study_train = False
study_eval = True
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 18
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/Imitation_test"
require_patterns = [
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
"20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
]
# to_compare = [
# ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
# ]
to_compare = None
elif load_pattern == "pilot_pointing":
study_train = True
study_eval = False
show_plot = False
save = True
plot_path = "../case_studies_final_figures/pilot_pointing"
load_pattern = "29-10_SAI_Pointing_CS_PPO_"
require_patterns = [
"29-10_SAI_Pointing_CS_PPO_CB_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6",
"29-10_SAI_Pointing_CS_PPO_CBL_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6",
"29-10_SAI_Pointing_CS_PPO_no_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4",
"29-10_SAI_Pointing_CS_PPO_RIDE_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01",
"29-10_SAI_Pointing_CS_PPO_RND_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005",
]
color_dict = {
"PPO_RIDE": "orange",
"PPO_RND": "magenta",
"PPO_no": "maroon",
"PPO_CBL": "green",
"PPO_CB": "blue",
}
def label_parser(label):
label = label.split("_env_")[0].split("SAI_")[1]
label=label.replace("Pointing_CS_", "")
return label
to_compare = None
elif load_pattern == "pilot_color":
study_train = True
study_eval = False
show_plot = False
save = True
plot_path = "../case_studies_final_figures/pilot_color"
load_pattern = "29-10_SAI_LangColor_CS"
require_patterns = [
"29-10_SAI_LangColor_CS_PPO_CB_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6",
"29-10_SAI_LangColor_CS_PPO_CBL_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6",
"29-10_SAI_LangColor_CS_PPO_no_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4",
"29-10_SAI_LangColor_CS_PPO_RIDE_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01",
"29-10_SAI_LangColor_CS_PPO_RND_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005"
]
color_dict = {
"PPO_RIDE": "orange",
"PPO_RND": "magenta",
"PPO_no": "maroon",
"PPO_CBL": "green",
"PPO_CB": "blue",
}
def label_parser(label):
label = label.split("_env_")[0].split("SAI_")[1]
label=label.replace("LangColor_CS_", "")
return label
to_compare = None
elif load_pattern == "formats_train":
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
max_x_lim = 45
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/Formats_train"
require_patterns = [
"21-01_formats_50M_CBL",
"05-01_scaffolding_50M_no_acl",
]
to_compare = [
("21-01_formats_50M_CBL", "05-01_scaffolding_50M_no_acl", "black")
]
def label_parser(label):
label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_bonus")
label = label.replace("21-01_formats_50M_CBL", "PPO_CBL")
return label
elif load_pattern == "adversarial":
show_plot = False
save = True
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
# max_x_lim = 45
smooth_factor = 0
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/adversarial"
require_patterns = [
"26-01_Adversarial_2M_PPO_CB_hidden_npc",
"26-01_Adversarial_2M_PPO_CB_asoc",
"26-01_Adversarial_2M_PPO_CB",
]
to_compare = [
("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_hidden_npc", "orange"),
("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_asoc", "green")
]
def label_parser(label):
label = label.replace("26-01_Adversarial_2M_PPO_CB_hidden_npc", "PPO_CB_invisible_peer")
label = label.replace("26-01_Adversarial_2M_PPO_CB_asoc", "PPO_CB_no_peer")
label = label.replace("26-01_Adversarial_2M_PPO_CB", "PPO_CB")
return label
color_dict = {
"PPO_CB": "blue",
"PPO_CB_invisible_peer": "orange",
"PPO_CB_no_peer": "green",
}
elif load_pattern == "adversarial_stumps":
study_train = True
study_eval = False
plot_aggregated_test = False
plot_only_aggregated_test = False
# max_x_lim = 45
smooth_factor = 0
load_pattern = "_"
test_envs_to_plot = None
plot_path = "../case_studies_final_figures/adversarial_stumps"
require_patterns = [
"26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc",
"26-01_Adversarial_5M_Stumps_PPO_CB_asoc",
"26-01_Adversarial_5M_Stumps_PPO_CB",
]
to_compare = [
("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "orange"),
("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "green")
]
def label_parser(label):
label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "PPO_CB_invisible_peer")
label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "PPO_CB_no_peer")
label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB", "PPO_CB")
return label
color_dict = {
"PPO_CB": "blue",
"PPO_CB_invisible_peer": "orange",
"PPO_CB_no_peer": "green",
}
else:
plot_path = "plots/testplot"
require_patterns = [
"_",
# pointing
# "04-01_Pointing_CB_heldout_doors",
]
if to_compare is None and len(require_patterns) == 2 and "_" not in require_patterns:
# if only two curves compare those two automatically
to_compare = [(require_patterns[0], require_patterns[1], "black")]
save=False
show_plot = True
# all of those
include_patterns = []
#include_patterns = ["rec_5"]
fontsize = 20
legend_fontsize = 20
linewidth = 5
# linewidth = 1
leg_args = {
'fontsize': legend_fontsize
}
title_fontsize = int(fontsize*1.2)
storage_dir = "storage/"
if load_pattern.startswith(storage_dir):
load_pattern = load_pattern[len(storage_dir):]
if load_pattern.startswith("./storage/"):
load_pattern = load_pattern[len("./storage/"):]
get_datasets(storage_dir, str(load_pattern), load_subsample_step=load_subsample_step, ignore_patterns=ignore_patterns, require_patterns=require_patterns)
label_parser_dict = {
# "PPO_CB": "PPO_CB",
# "02-06_AppleStealing_experiments_cb_bonus_angle_occ_env_SocialAI-OthersPerceptionInferenceParamEnv-v1_exploration-bonus-type_cell": "NPC_visible",
}
env_type = str(load_pattern)
fig_type = "test"
try:
top_n = int(sys.argv[2])
except (IndexError, ValueError):
top_n = 8
to_remove = []
for tr_ in to_remove:
if tr_ in models_saves:
del models_saves[tr_]
print("Loaded:")
print("\n".join(list(models_saves.keys())))
if per_model_colors: # order runs for legend order as in per_models_colors, with corresponding colors
ordered_labels = OrderedDict()
for teacher_type in per_model_colors.keys():
for k,v in labels.items():
if teacher_type in k:
ordered_labels[k] = v
labels = ordered_labels
else:
print('not using per_model_color')
for k in models_saves.keys():
labels[k] = k
# Plot utils
def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
legend=False, leg_loc='best', title=None,
ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=13.0, linewidth=10.0, labelsize=20, fontsize=20, title_fontsize=30,
                    zorder=None, xlabel='Env steps', ylabel='Perf', linestyle="-", xnbins=3, ynbins=3):
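    """Plot the mean curve with a shaded ±err band on ax; shared styling helper
    for train curves, eval curves and static reference lines."""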
#plt.rcParams.update({'font.size': 15})
ax.locator_params(axis='x', nbins=xnbins)
ax.locator_params(axis='y', nbins=ynbins)
ax.tick_params(axis='y', which='both', labelsize=labelsize)
ax.tick_params(axis='x', which='both', labelsize=labelsize*0.8)
# ax.tick_params(axis='both', which='both', labelsize="small")
# ax.scatter(x, y, color=color,linewidth=linewidth,zorder=zorder, linestyle=linestyle)
ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder, linestyle=linestyle)
if not np.array_equal(err, np.zeros_like(err)):
ax.fill_between(x, y-err, y+err, color=shade_color, alpha=0.2)
if legend:
leg = ax.legend(loc=leg_loc, **leg_args) # 34
for legobj in leg.legendHandles:
legobj.set_linewidth(leg_linewidth)
ax.set_xlabel(xlabel, fontsize=fontsize)
if subplot_nb == 0:
ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=2)
ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
ax.set_ylim(bottom=ylim[0], top=ylim[1])
if title:
ax.set_title(title, fontsize=title_fontsize)
# only one figure is drawn -> maybe we can add loops later
assert len(metrics) == 1
f, ax = plt.subplots(1, 1, figsize=(9.0, 9.0))
if len(metrics) == 1:
ax = [ax]
max_steps = 0
exclude_patterns = []
metric = metrics[0]
ylabel = {
"success_rate_mean": "Success rate (%)",
"exploration_bonus_mean": "Exploration bonus",
"NPC_intro": "Successful introduction (%)",
}.get(metric, metric)
# for metric_i, metric in enumerate(metrics):
default_colors = default_colors_.copy()
if study_train:
for model_i, model_id in enumerate(models_saves.keys()):
#excluding some experiments
if any([ex_pat in model_id for ex_pat in exclude_patterns]):
continue
if len(include_patterns) > 0:
if not any([in_pat in model_id for in_pat in include_patterns]):
continue
runs_data = models_saves[model_id]['data']
ys = []
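        # Restarted runs may re-append the CSV header row mid-file; drop stray
        # "frames" strings so the column parses as numbers (the same guard is
        # applied per metric below).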
if runs_data[0]['frames'][1] == 'frames':
runs_data[0]['frames'] = list(filter(('frames').__ne__, runs_data[0]['frames']))
if per_seed:
min_len = None
else:
# determine minimal run length across seeds
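            # ("minimum" is the length of the top_n-th longest seed: shorter
            # runs are dropped and the rest truncated to the shortest kept one)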
lens = [len(run['frames']) for run in runs_data if len(run['frames'])]
minimum = sorted(lens)[-min(top_n, len(lens))]
min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) >= minimum])
# keep only top k
runs_data = [run for run in runs_data if len(run['frames']) >= minimum]
# min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) > 10])
# compute env steps (x axis)
longest_id = np.argmax([len(rd['frames']) for rd in runs_data])
        steps = np.array(runs_data[longest_id]['frames'], dtype=np.int64) / steps_denom  # np.int was removed in NumPy 1.24
steps = steps[:min_len]
for run in runs_data:
if metric not in run:
raise ValueError(f"Metric {metric} not found. Possible metrics: {list(run.keys())}")
data = run[metric]
# checking for header
if data[1] == metric:
data = np.array(list(filter((metric).__ne__, data)), dtype=np.float16)
if per_seed:
ys.append(data)
else:
if len(data) >= min_len:
# discard extra
if len(data) > min_len:
print("run has too many {} datapoints ({}). Discarding {}".format(model_id, len(data),
len(data) - min_len))
data = data[0:min_len]
ys.append(data)
else:
raise ValueError("How can data be < min_len if it was capped above")
ys_same_len = ys
# computes stats
n_seeds = len(ys_same_len)
if per_seed:
sems = np.array(ys_same_len)
means = np.array(ys_same_len)
stds = np.zeros_like(means)
color = default_colors[model_i]
else:
sems = np.std(ys_same_len, axis=0)/np.sqrt(len(ys_same_len)) # sem
stds = np.std(ys_same_len, axis=0) # std
means = np.mean(ys_same_len, axis=0)
color = default_colors[model_i]
if metric == 'duration':
means = means / 3600
sems = sems / 3600
stds = stds / 3600
if per_seed:
# plot x y bounds
curr_max_steps = np.max(np.max(steps))
else:
# plot x y bounds
curr_max_steps = np.max(steps)
if curr_max_steps > max_steps:
max_steps = curr_max_steps
if subsample_step:
steps = steps[0::subsample_step]
means = means[0::subsample_step]
stds = stds[0::subsample_step]
sems = sems[0::subsample_step]
ys_same_len = [y[0::subsample_step] for y in ys_same_len]
# display seeds separately
if per_seed:
for s_i, seed_ys in enumerate(ys_same_len):
label = label_parser(model_id)
if study_eval:
label = label + "_train_"
label = label + f"(s:{s_i})"
if label in color_dict:
color = color_dict[label]
else:
color = default_colors[model_i*20+s_i]
curve_ID = f"{model_id}_{s_i}"
assert np.array_equal(stds, np.zeros_like(stds))
                if smooth_factor:
                    # smooth the per-seed curve itself; smoothing `means` here
                    # never reached the plotted data
                    seed_ys = smooth(seed_ys, smooth_factor)
                to_plot_dict[curve_ID] = {
                    "label": label,
                    "steps": steps,
                    "means": seed_ys,
                    "stds": np.zeros_like(seed_ys),
                    "ys": ys_same_len,
                    "color": color
                }
else:
label = label_parser(model_id)
if study_eval:
label = label+"(train)"
            if color_dict and label in color_dict:
color = color_dict[label]
else:
color = default_colors[model_i]
if smooth_factor:
means = smooth(means, smooth_factor)
stds = smooth(stds, smooth_factor)
to_plot_dict[model_id] = {
"label": label,
"steps": steps,
"means": means,
"stds": stds,
"sems": sems,
"ys": ys_same_len,
"color": color,
}
if study_eval:
print("Evaluation")
# evaluation sets
    # count eval envs before adding the aggregate curve, so the emptiness
    # check still triggers when there is nothing to plot
    number_of_eval_envs = max((len(v.keys()) for v in model_eval_data.values()), default=0)
    if number_of_eval_envs == 0:
        print("No eval envs")
        exit()
    if plot_aggregated_test:
        number_of_eval_envs += 1
default_colors = default_colors_.copy()
test_summary_dict = defaultdict(dict)
test_summary_dict_colors = defaultdict(dict)
for model_i, model_id in enumerate(model_eval_data.keys()):
# excluding some experiments
if any([ex_pat in model_id for ex_pat in exclude_patterns]):
continue
if len(include_patterns) > 0:
if not any([in_pat in model_id for in_pat in include_patterns]):
continue
# test envs
test_envs = model_eval_data[model_id].items()
# filter unwanted eval envs
if test_envs_to_plot is not None:
test_envs = [(name, data) for name, data in test_envs if name in test_envs_to_plot]
# computes stats
if sort_test:
test_envs_sorted = list(sorted(test_envs, key=lambda kv: sort_test_set(kv[0])))
else:
test_envs_sorted = list(test_envs)
if plot_aggregated_test:
agg_means = []
for env_i, (test_env, env_data) in enumerate(test_envs_sorted):
ys_same_len = env_data["values"]
steps = env_data["steps"].mean(0) / steps_denom
n_seeds = len(ys_same_len)
if per_seed:
sems = np.array(ys_same_len)
stds = np.array(ys_same_len)
means = np.array(ys_same_len)
color = default_colors[model_i]
# plot x y bounds
curr_max_steps = np.max(np.max(steps))
else:
sems = np.std(ys_same_len, axis=0) / np.sqrt(len(ys_same_len)) # sem
stds = np.std(ys_same_len, axis=0) # std
means = np.mean(ys_same_len, axis=0)
color = default_colors[model_i]
curr_max_steps = np.max(steps)
if plot_aggregated_test:
agg_means.append(means)
x_lim = max(steps[-1], x_lim)
x_lim = min(max_x_lim, x_lim)
eval_metric_name = {
"test_success_rates": "Success rate",
'exploration_bonus_mean': "Exploration bonus",
}.get(eval_metric, eval_metric)
test_env_name = test_env.replace("Env", "").replace("Test", "")
env_types = ["InformationSeeking", "Collaboration", "PerspectiveTaking"]
for env_type in env_types:
if env_type in test_env_name:
test_env_name = test_env_name.replace(env_type, "")
test_env_name += f"\n({env_type})"
if per_seed:
for s_i, seed_ys in enumerate(ys_same_len):
label = label_parser(model_id) + f"_{test_env}" + f"(s:{s_i})"
if eval_smooth_factor:
seed_ys = smooth(seed_ys, eval_smooth_factor)
curve_ID = f"{model_id}_{test_env}_{s_i}"
to_plot_dict[curve_ID] = {
"label": label,
"steps": steps,
"means": seed_ys,
"stds": np.zeros_like(seed_ys),
"ys": ys_same_len,
"color": color
}
else:
if len(test_envs_sorted) > 1:
label = label_parser(model_id) + f"_{test_env}"
else:
label = label_parser(model_id)
if study_train:
label=label+"(test)"
if not plot_only_aggregated_test:
if label in color_dict:
color = color_dict[label]
else:
color = default_colors[model_i*len(test_envs_sorted)+env_i]
if eval_smooth_factor:
means = smooth(means, eval_smooth_factor)
stds = smooth(stds, eval_smooth_factor)
sems = smooth(sems, eval_smooth_factor)
to_plot_dict[model_id+f"_{test_env}"] = {
"label": label,
"steps": steps,
"means": means,
"stds": stds,
"sems": sems,
"ys": ys_same_len,
"color": color,
}
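        # Aggregate over test envs: average the per-env mean curves into one
        # curve per model (mean across envs, spread across envs as the band).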
if plot_aggregated_test:
ys_same_len = agg_means
agg_means = np.array(agg_means)
agg_mean = agg_means.mean(axis=0)
agg_std = agg_means.std(axis=0) # std
            agg_sems = agg_std / np.sqrt(agg_means.shape[0])  # SEM across test envs (was a bare "..." placeholder)
label = label_parser(model_id)
            if study_train:
                label = label + "(test)"  # aggregated *test* curve; "(train)" collided with the train curve's label
if eval_smooth_factor:
agg_mean = smooth(agg_mean, eval_smooth_factor)
agg_std = smooth(agg_std, eval_smooth_factor)
agg_sems = smooth(agg_sems, eval_smooth_factor)
            if per_seed:
                print("Not smoothing aggregated curves because of per_seed")
                for s_i, (seed_ys, seed_st) in enumerate(zip(agg_mean, agg_std)):
                    seed_c = default_colors[model_i + s_i]
                    label = str(s_i)
                    # use a per-seed key and color; reusing the stale curve_ID and
                    # color from the env loop would overwrite a single entry
                    to_plot_dict[f"{model_id}_agg_test_{s_i}"] = {
                        "label": label,
                        "steps": steps,
                        "means": seed_ys,
                        "stds": seed_st,
                        "ys": ys_same_len,
                        "color": seed_c
                    }
else:
if label in color_dict:
color = color_dict[label]
else:
color = default_colors[model_i]
to_plot_dict[model_id+"_agg_test"] = {
"label": label,
"steps": steps,
"means": agg_mean,
"stds": agg_std,
"sems": agg_sems,
"ys": ys_same_len,
"color": color,
}
# NOTE: to_compare entries are to_plot_dict keys (curve IDs), not display labels.
to_scatter_dict = {}
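# Pairwise significance markers: for each (a, b) pair in to_compare, run a
# per-timestep Welch's t-test across seeds and scatter a marker above the
# curves at every step where p < test_p.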
if to_compare is not None:
for comp_i, (a_model_id, b_model_id, color) in enumerate(to_compare):
a_data = to_plot_dict[a_model_id]["ys"]
b_data = to_plot_dict[b_model_id]["ys"]
steps = to_plot_dict[a_model_id]["steps"]
if color == "auto_color":
color = to_plot_dict[a_model_id]["color"]
if len(a_data[0]) != len(b_data[0]):
# extract steps present in both
a_steps = to_plot_dict[a_model_id]["steps"]
b_steps = to_plot_dict[b_model_id]["steps"]
            # sorted(): sets are unordered, and the step order must stay
            # aligned with the per-step p-values computed below
            steps = sorted(set(a_steps) & set(b_steps))
# keep only the values for those steps
mask_a = [(a_s in steps) for a_s in a_steps]
a_data = np.array(a_data)[:, mask_a]
mask_b = [(b_s in steps) for b_s in b_steps]
b_data = np.array(b_data)[:, mask_b]
        p_values = stats.ttest_ind(
            a_data,
            b_data,
            equal_var=False
        ).pvalue
        # keep only the steps where the difference is significant
        steps = [s for s, p in zip(steps, p_values) if p < test_p]
ys = [1.02+0.02*comp_i]*len(steps)
to_scatter_dict[f"compare_{a_model_id}_{b_model_id}"] = {
"label": "",
"xs": steps,
"ys": ys,
"color": color,
}
for scatter_i, (scatter_ID, scatter_id_data) in enumerate(to_scatter_dict.items()):
# unpack data
label, xs, ys, color = (
scatter_id_data["label"],
scatter_id_data["xs"],
scatter_id_data["ys"],
scatter_id_data["color"],
)
xlabel = f"Env steps (1e6)"
plt.scatter(
xs,
ys,
color=color,
marker="x"
)
    if xs:  # xs is empty when no step passes the significance test
        summary_dict[label] = xs[-1]
        summary_dict_colors[label] = color
for curve_i, (curve_ID, model_id_data) in enumerate(to_plot_dict.items()):
    # unpack data (per-seed entries carry no "sems" key, so fall back to stds)
    label, steps, means, stds, ys, color = (
        model_id_data["label"],
        model_id_data["steps"],
        model_id_data["means"],
        model_id_data["stds"],
        model_id_data["ys"],
        model_id_data["color"]
    )
    sems = model_id_data.get("sems", stds)
# if smooth_factor:
# means = smooth(means, smooth_factor)
# stds = smooth(stds, smooth_factor)
if legend_show_n_seeds:
n_seeds = len(ys)
label = label+"({})".format(n_seeds)
x_lim = max(steps[-1], x_lim)
x_lim = min(max_x_lim, x_lim)
xlabel = f"Env steps (1e6)"
plot_with_shade(
0, ax[0], steps, means, stds, color, color, label,
# 0, ax[0], steps, means, sems, color, color, label,
legend=draw_legend,
xlim=[0, x_lim],
ylim=[0, max_y],
xlabel=xlabel,
ylabel=ylabel,
title=None,
labelsize=fontsize,
fontsize=fontsize,
title_fontsize=title_fontsize,
linewidth=linewidth,
leg_linewidth=5,
leg_args=leg_args,
xnbins=xnbins,
ynbins=ynbins,
)
summary_dict[label] = means[-1]
summary_dict_colors[label] = color
# plot static lines
if static_lines:
for label, (mean, std, color) in static_lines.items():
if label == "":
label = None
plot_with_shade(
0, ax[0], steps, np.array([mean]*len(steps)), np.array([std]*len(steps)), color, color, label,
legend=True,
xlim=[0, x_lim],
ylim=[0, 1.0],
xlabel=f"Env steps (1e6)",
ylabel=ylabel,
linestyle=":",
leg_args=leg_args,
fontsize=fontsize,
title_fontsize=title_fontsize,
xnbins=xnbins,
ynbins=ynbins,
)
if save and plot_path:
f.savefig(plot_path+".png")
f.savefig(plot_path+".svg")
print(f"Plot saved to {plot_path}.[png/svg].")
# Summary dict
if len(summary_dict) == 0:
raise ValueError(f"No experiments found for {load_pattern}.")
else:
# print summary
best = max(summary_dict.values())
pc = 0.3
    n = max(1, int(len(summary_dict) * pc))  # avoid n == 0, which would select everything via [-0:]
print("top n: ", n)
top_pc = sorted(summary_dict.values())[-n:]
bottom_pc = sorted(summary_dict.values())[:n]
print("legend:")
cprint("\tbest", "green")
cprint("\ttop {} %".format(pc), "blue")
cprint("\tbottom {} %".format(pc), "red")
print("\tothers")
print()
for l, p in sorted(summary_dict.items(), key=lambda kv: kv[1]):
c = summary_dict_colors[l]
if p == best:
cprint("label: {} ({})".format(l, c), "green")
cprint("\t {}:{}".format(metric, p), "green")
elif p in top_pc:
cprint("label: {} ({})".format(l, c), "blue")
cprint("\t {}:{}".format(metric, p), "blue")
elif p in bottom_pc:
cprint("label: {} ({})".format(l, c), "red")
cprint("\t {}:{}".format(metric, p), "red")
else:
print("label: {} ({})".format(l, c))
print("\t {}:{}".format(metric, p))
if show_plot:
plt.tight_layout()
plt.subplots_adjust(hspace=1.5, wspace=0.5, left=0.1, right=0.9, bottom=0.1, top=0.85)
plt.suptitle(super_title)
plt.show()
plt.close()