#!/usr/bin/env python
import os
import sys
import time
import pickle
from collections import OrderedDict, defaultdict
from pathlib import Path

# import seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from termcolor import cprint, colored
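# Usage (inferred from the sys.argv parsing below; the script name is assumed):
#   python visualize.py LOAD_PATTERN [TOP_N]
# e.g.:
#   python visualize.py pilot_pointing
#   python visualize.py scaffolding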
save = True
show_plot = False

metrics = [
    'success_rate_mean',
    # 'FPS',
    # 'extrinsic_return_mean',
    # 'exploration_bonus_mean',
    # 'NPC_intro',
    # 'curriculum_param_mean',
    # 'curriculum_max_success_rate_mean',
    # 'rreturn_mean'
]

eval_metric = "test_success_rates"
# eval_metric = "exploration_bonus_mean"

super_title = ""
# super_title = "PPO - No exploration bonus"
# super_title = "Count Based exploration bonus (Grid Search)"
# super_title = "PPO + RND"
# super_title = "PPO + RIDE"

# p-value threshold for the statistical significance tests
test_p = 0.05

agg_title = ""
color_dict = None
eval_filename = None
max_frames = 20_000_000
legend_show_n_seeds = False
draw_legend = True
per_seed = False
study_train = False
study_eval = True
plot_test = True
plot_aggregated_test = True
plot_only_aggregated_test = True
xnbins = 4
ynbins = 3
steps_denom = 1e6

# Global vars for tracking and labeling data at load time.
exp_idx = 0
label_parser_dict = None
# default label parser: identity (overridden by the per-study parsers below)
label_parser = lambda label: label

smooth_factor = 10
# smooth_factor = 0
print("smooth factor:", smooth_factor)
eval_smooth_factor = None
leg_size = 30
def smooth(x_, n=50):
    """Trailing-window running mean: each point becomes the mean of the last n+1 values."""
    if n is None:
        return x_
    if type(x_) == list:
        x_ = np.array(x_)
    return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])
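# Example (sanity check, not part of the pipeline):
#   smooth(np.array([0., 1., 2., 3.]), n=1)  ->  array([0. , 0.5, 1.5, 2.5])
# With n=None the input is returned unchanged, which is how eval_smooth_factor=None
# disables evaluation-curve smoothing below.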
sort_test = False

def sort_test_set(env_name):
    """Return the index of env_name in the canonical test-set order (help type x problem type)."""
    helps = [
        "LanguageFeedback",
        "LanguageColor",
        "Pointing",
        "Emulation",
    ]
    problems = [
        "Boxes",
        "Switches",
        "Generators",
        "Marble",
        "Doors",
        "Levers",
    ]
    env_names = []
    for p in problems:
        for h in helps:
            env_names.append(h + p)

    env_names.extend([
        "LeverDoorColl",
        "MarblePushColl",
        "MarblePassColl",
        "AppleStealing"
    ])

    for i, en in enumerate(env_names):
        if en in env_name:
            return i

    raise ValueError(f"Test env {env_name} not known")
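# Example (hypothetical env names, assuming the naming scheme above):
#   sorted(["SocialAI-PointingBoxes-v1", "SocialAI-LanguageFeedbackBoxes-v1"], key=sort_test_set)
#   -> ["SocialAI-LanguageFeedbackBoxes-v1", "SocialAI-PointingBoxes-v1"]
# i.e. within each problem, help types are ordered Feedback, Color, Pointing, Emulation.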
subsample_step = 1
load_subsample_step = 1
x_lim = 0
max_x_lim = np.inf
summary_dict = {}
summary_dict_colors = {}
to_plot_dict = {}

default_colors_ = ["blue", "orange", "green", "magenta", "brown", "red", 'black', "grey", u'#ff7f0e',
                   "cyan", "pink", 'purple', u'#1f77b4',
                   "darkorchid", "sienna", "lightpink", "indigo", "mediumseagreen", 'aqua',
                   'deeppink', 'silver', 'khaki', 'goldenrod'] * 100
def get_eval_data(logdir, eval_metric):
    eval_data = defaultdict(lambda: defaultdict(list))
    for root, _, files in os.walk(logdir):
        for file in files:
            if 'testing_' in file:
                assert ".pkl" in file
                # str.lstrip/rstrip strip character sets, not prefixes/suffixes,
                # so slice off the prefix and extension instead
                test_env_name = file[len("testing_"):-len(".pkl")]
                try:
                    with open(root + "/" + file, "rb") as f:
                        seed_eval_data = pickle.load(f)
                except Exception:
                    print("Pickle not loaded: ", root + "/" + file)
                    time.sleep(1)
                    continue

                eval_data[test_env_name]["values"].append(seed_eval_data[eval_metric])
                eval_data[test_env_name]["steps"].append(seed_eval_data["test_step_nb"])

    # truncate all seeds to the shortest one so they can be stacked into arrays
    for test_env, seed_data in eval_data.items():
        min_len_seed = min([len(s) for s in seed_data['steps']])
        eval_data[test_env]["values"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["values"]])
        eval_data[test_env]["steps"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["steps"]])

    return eval_data
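# The returned structure is, per test env (assuming n_seeds pickles were found):
#   eval_data["SomeTestEnv-v1"]["values"] -> np.ndarray of shape (n_seeds, n_evals)
#   eval_data["SomeTestEnv-v1"]["steps"]  -> np.ndarray of shape (n_seeds, n_evals)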
def get_all_runs(logdir, load_subsample_step=1):
    """
    Recursively look through logdir for output files produced by the training runs.
    Assumes that any directory containing a file "log.csv" is a valid run.
    """
    global exp_idx
    datasets = []
    for root, _, files in os.walk(logdir):
        if 'log.csv' in files:
            if (Path(root) / 'log.csv').stat().st_size == 0:
                print("CSV {} empty".format(os.path.join(root, 'log.csv')))
                continue

            run_name = root[8:]  # strip the leading "storage/" prefix (8 characters)
            exp_name = None
            config = None
            exp_idx += 1

            # load progress data
            try:
                exp_data = pd.read_csv(os.path.join(root, 'log.csv'))
                print("Loaded:", os.path.join(root, 'log.csv'))
            except Exception:
                raise ValueError("CSV {} faulty".format(os.path.join(root, 'log.csv')))

            exp_data = exp_data[::load_subsample_step]
            data_dict = exp_data.to_dict("list")
            data_dict['config'] = config

            nb_epochs = len(data_dict['frames'])
            if nb_epochs == 1:
                print(f'{run_name} -> {colored(f"nb_epochs {nb_epochs}", "red")}')
            else:
                print('{} -> nb_epochs {}'.format(run_name, nb_epochs))

            datasets.append(data_dict)
    return datasets
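# Each element of the returned list is one seed's log as a dict of columns, e.g.
# (column names come from the log.csv header, so they can vary per run):
#   {"frames": [...], "success_rate_mean": [...], ..., "config": None}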
def get_datasets(rootdir, load_only="", load_subsample_step=1, ignore_patterns=("ignore",), require_patterns=()):
    _, models_list, _ = next(os.walk(rootdir))
    for dir_name in models_list.copy():
        # add "ignore" to a directory name to avoid loading its content
        for ignore_pattern in ignore_patterns:
            if ignore_pattern in dir_name or load_only not in dir_name:
                if dir_name in models_list:
                    models_list.remove(dir_name)

        if len(require_patterns) > 0:
            if not any([require_pattern in dir_name for require_pattern in require_patterns]):
                if dir_name in models_list:
                    models_list.remove(dir_name)

    for expe_name in list(labels.keys()):
        if expe_name not in models_list:
            del labels[expe_name]

    # setting per-model-type colors
    for i, m_name in enumerate(models_list):
        for m_type, m_color in per_model_colors.items():
            if m_type in m_name:
                colors[m_name] = m_color

        print("extracting data for {}...".format(m_name))
        m_id = m_name
        models_saves[m_id] = OrderedDict()
        models_saves[m_id]['data'] = get_all_runs(rootdir + m_name, load_subsample_step=load_subsample_step)
        print("done")

        if m_name not in labels:
            labels[m_name] = m_name

        model_eval_data[m_id] = get_eval_data(logdir=rootdir + m_name, eval_metric=eval_metric)


# Retrieve all experiments located in the storage folder.
labels = OrderedDict()
per_model_colors = OrderedDict()

# LOAD DATA
models_saves = OrderedDict()
colors = OrderedDict()
model_eval_data = OrderedDict()

static_lines = {}

ignore_patterns = ["_ignore_"]
to_compare = None

load_pattern = sys.argv[1]
test_envs_to_plot = None  # plot all
min_y, max_y = 0.0, 1.1
def label_parser(label):
    # map raw experiment directory names to display labels
    label = label.replace("04-01_Pointing_CB_heldout_doors", "PPO_CB")
    label = label.replace("19-01_Color_CB_heldout_doors", "PPO_CBL")
    label = label.replace("19-01_Feedback_CB_heldout_doors_20M", "PPO_CBL")
    label = label.replace("20-01_JA_Color_CB_heldout_doors", "JA_PPO_CBL")
    label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_scaf")
    label = label.replace("05-01_scaffolding_50M_acl_4_acl-type_intro_seq", "PPO_scaf_4")
    label = label.replace("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf", "PPO_scaf_8")
    label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_soc_exp", "PPO_CB_role_B")
    label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial")
    label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp", "PPO_CB_role_B")
    label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial")
    label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
                          "PPO_CB_0.25")
    label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
                          "PPO_CB_0.5")
    label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
                          "PPO_CB_1")
    return label
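# e.g. label_parser("20-01_JA_Color_CB_heldout_doors") -> "JA_PPO_CBL"; names with no
# matching pattern are returned unchanged. The resulting labels are expected to be
# keys of color_dict below.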
color_dict = {
    'PPO_CB': "blue",
    'PPO_CB(train)': "blue",
    "PPO_CB(test)": "orange",
    'PPO_no_bonus': "orange",
    'PPO_CBL': "blue",
    'PPO_CBL(train)': "blue",
    "PPO_CBL(test)": "orange",
    'JA_PPO_CBL': "green",
    "PPO_CB_role_B": "blue",
    "PPO_CB_asocial": "orange",
    'PPO_CB_0.25': "blue",
    'PPO_CB_0.5': "green",
    'PPO_CB_1': "orange",
}
if load_pattern == "RR_single":
    save = False
    show_plot = True
    load_pattern = "_"
    plot_path = "../case_studies_final_figures/RR_dummy_single"
    require_patterns = [
        "03-01_RR_ft_single_CB_marble_pass_A_asoc_contr",
        "03-01_RR_ft_single_CB_marble_pass_A_soc_exp",
    ]
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    study_train = True
    study_eval = False

elif load_pattern == "RR_group":
    load_pattern = "_"
    plot_path = "../case_studies_final_figures/RR_dummy_group"
    require_patterns = [
        "05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr",
        "05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp",
    ]
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    study_train = True
    study_eval = False

elif load_pattern == "scaffolding":
    load_pattern = "_"
    plot_path = "../case_studies_final_figures/Scaffolding_test"
    require_patterns = [
        "05-01_scaffolding_50M_no_acl",
        "05-01_scaffolding_50M_acl_4_acl-type_intro_seq",
        "05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf",
    ]
    test_envs_to_plot = None  # aggregate all of them
    plot_aggregated_test = True
    plot_only_aggregated_test = True
    study_train = False
    study_eval = True
    to_compare = [
        ("05-01_scaffolding_50M_acl_4_acl-type_intro_seq_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"),
        ("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"),
    ]

elif load_pattern == "pointing":
    study_train = True
    study_eval = True
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    load_pattern = "_"
    test_envs_to_plot = [
        "SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1",
    ]
    plot_path = "../case_studies_final_figures/Pointing_train_test"
    require_patterns = [
        "04-01_Pointing_CB_heldout_doors",
    ]
    to_compare = [
        ("04-01_Pointing_CB_heldout_doors", "04-01_Pointing_CB_heldout_doors_SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1", "black")
    ]
elif load_pattern == "color":
    study_train = True
    study_eval = True
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = [
        "SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1",
    ]
    plot_path = "../case_studies_final_figures/Color_train_test"
    require_patterns = [
        "19-01_Color_CB_heldout_doors",
    ]
    to_compare = [
        ("19-01_Color_CB_heldout_doors", "19-01_Color_CB_heldout_doors_SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1", "black")
    ]

elif load_pattern == "ja_color":
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/JA_Color_train"
    require_patterns = [
        "19-01_Color_CB_heldout_doors",
        "20-01_JA_Color_CB_heldout_doors",
    ]
    to_compare = [
        ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
    ]

elif load_pattern == "feedback_per_seed":
    study_train = True
    study_eval = False
    per_seed = True
    draw_legend = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = [
        "SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1",
    ]
    plot_path = "../case_studies_final_figures/Feedback_train_per_seed"
    require_patterns = [
        "19-01_Feedback_CB_heldout_doors",
    ]
    to_compare = None

elif load_pattern == "feedback":
    study_train = True
    study_eval = True
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = [
        "SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1",
    ]
    plot_path = "../case_studies_final_figures/Feedback_train_test"
    require_patterns = [
        "19-01_Feedback_CB_heldout_doors",
    ]
    to_compare = [
        ("19-01_Feedback_CB_heldout_doors_20M", "19-01_Feedback_CB_heldout_doors_20M_SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1", "black")
    ]
elif load_pattern == "imitation_train":
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/Imitation_train"
    require_patterns = [
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
    ]
    # to_compare = [
    #     ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
    # ]
    to_compare = None

elif load_pattern == "imitation_train_intro":
    metrics = ["NPC_intro"]
    show_plot = False
    save = True
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/Imitation_train_intro"
    require_patterns = [
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
    ]
    # to_compare = [
    #     ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
    # ]
    to_compare = None

elif load_pattern == "imitation_test":
    study_train = False
    study_eval = True
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 18
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/Imitation_test"
    require_patterns = [
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50",
        "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50",
    ]
    # to_compare = [
    #     ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black")
    # ]
    to_compare = None
elif load_pattern == "pilot_pointing":
    study_train = True
    study_eval = False
    show_plot = False
    save = True
    plot_path = "../case_studies_final_figures/pilot_pointing"
    load_pattern = "29-10_SAI_Pointing_CS_PPO_"
    require_patterns = [
        "29-10_SAI_Pointing_CS_PPO_CB_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6",
        "29-10_SAI_Pointing_CS_PPO_CBL_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6",
        "29-10_SAI_Pointing_CS_PPO_no_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4",
        "29-10_SAI_Pointing_CS_PPO_RIDE_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01",
        "29-10_SAI_Pointing_CS_PPO_RND_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005",
    ]
    color_dict = {
        "PPO_RIDE": "orange",
        "PPO_RND": "magenta",
        "PPO_no": "maroon",
        "PPO_CBL": "green",
        "PPO_CB": "blue",
    }

    def label_parser(label):
        label = label.split("_env_")[0].split("SAI_")[1]
        label = label.replace("Pointing_CS_", "")
        return label

    to_compare = None

elif load_pattern == "pilot_color":
    study_train = True
    study_eval = False
    show_plot = False
    save = True
    plot_path = "../case_studies_final_figures/pilot_color"
    load_pattern = "29-10_SAI_LangColor_CS"
    require_patterns = [
        "29-10_SAI_LangColor_CS_PPO_CB_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6",
        "29-10_SAI_LangColor_CS_PPO_CBL_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6",
        "29-10_SAI_LangColor_CS_PPO_no_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4",
        "29-10_SAI_LangColor_CS_PPO_RIDE_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01",
        "29-10_SAI_LangColor_CS_PPO_RND_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005"
    ]
    color_dict = {
        "PPO_RIDE": "orange",
        "PPO_RND": "magenta",
        "PPO_no": "maroon",
        "PPO_CBL": "green",
        "PPO_CB": "blue",
    }

    def label_parser(label):
        label = label.split("_env_")[0].split("SAI_")[1]
        label = label.replace("LangColor_CS_", "")
        return label

    to_compare = None
elif load_pattern == "formats_train":
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    max_x_lim = 45
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/Formats_train"
    require_patterns = [
        "21-01_formats_50M_CBL",
        "05-01_scaffolding_50M_no_acl",
    ]
    to_compare = [
        ("21-01_formats_50M_CBL", "05-01_scaffolding_50M_no_acl", "black")
    ]

    def label_parser(label):
        label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_bonus")
        label = label.replace("21-01_formats_50M_CBL", "PPO_CBL")
        return label

elif load_pattern == "adversarial":
    show_plot = False
    save = True
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    # max_x_lim = 45
    smooth_factor = 0
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/adversarial"
    require_patterns = [
        "26-01_Adversarial_2M_PPO_CB_hidden_npc",
        "26-01_Adversarial_2M_PPO_CB_asoc",
        "26-01_Adversarial_2M_PPO_CB",
    ]
    to_compare = [
        ("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_hidden_npc", "orange"),
        ("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_asoc", "green")
    ]

    def label_parser(label):
        # the longer names must be replaced first: "26-01_Adversarial_2M_PPO_CB"
        # is a prefix of the other two patterns
        label = label.replace("26-01_Adversarial_2M_PPO_CB_hidden_npc", "PPO_CB_invisible_peer")
        label = label.replace("26-01_Adversarial_2M_PPO_CB_asoc", "PPO_CB_no_peer")
        label = label.replace("26-01_Adversarial_2M_PPO_CB", "PPO_CB")
        return label

    color_dict = {
        "PPO_CB": "blue",
        "PPO_CB_invisible_peer": "orange",
        "PPO_CB_no_peer": "green",
    }
elif load_pattern == "adversarial_stumps":
    study_train = True
    study_eval = False
    plot_aggregated_test = False
    plot_only_aggregated_test = False
    # max_x_lim = 45
    smooth_factor = 0
    load_pattern = "_"
    test_envs_to_plot = None
    plot_path = "../case_studies_final_figures/adversarial_stumps"
    require_patterns = [
        "26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc",
        "26-01_Adversarial_5M_Stumps_PPO_CB_asoc",
        "26-01_Adversarial_5M_Stumps_PPO_CB",
    ]
    to_compare = [
        ("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "orange"),
        ("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "green")
    ]

    def label_parser(label):
        # as above, the longer names must be replaced first
        label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "PPO_CB_invisible_peer")
        label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "PPO_CB_no_peer")
        label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB", "PPO_CB")
        return label

    color_dict = {
        "PPO_CB": "blue",
        "PPO_CB_invisible_peer": "orange",
        "PPO_CB_no_peer": "green",
    }

else:
    plot_path = "plots/testplot"
    require_patterns = [
        "_",
        # pointing
        # "04-01_Pointing_CB_heldout_doors",
    ]
    if to_compare is None and len(require_patterns) == 2 and "_" not in require_patterns:
        # if there are exactly two curves, compare those two automatically
        to_compare = [(require_patterns[0], require_patterns[1], "black")]
    save = False
    show_plot = True
# if non-empty, only runs whose name matches one of these patterns are plotted
include_patterns = []
# include_patterns = ["rec_5"]

fontsize = 20
legend_fontsize = 20
linewidth = 5
# linewidth = 1
leg_args = {
    'fontsize': legend_fontsize
}
title_fontsize = int(fontsize * 1.2)

storage_dir = "storage/"
# allow passing the pattern as a path, e.g. "storage/foo" or "./storage/foo"
if load_pattern.startswith(storage_dir):
    load_pattern = load_pattern[len(storage_dir):]
if load_pattern.startswith("./storage/"):
    load_pattern = load_pattern[len("./storage/"):]
get_datasets(storage_dir, str(load_pattern), load_subsample_step=load_subsample_step, ignore_patterns=ignore_patterns, require_patterns=require_patterns)

label_parser_dict = {
    # "PPO_CB": "PPO_CB",
    # "02-06_AppleStealing_experiments_cb_bonus_angle_occ_env_SocialAI-OthersPerceptionInferenceParamEnv-v1_exploration-bonus-type_cell": "NPC_visible",
}

env_type = str(load_pattern)
fig_type = "test"

# number of (longest) runs to keep per experiment; optional second CLI argument
try:
    top_n = int(sys.argv[2])
except (IndexError, ValueError):
    top_n = 8

to_remove = []
for tr_ in to_remove:
    if tr_ in models_saves:
        del models_saves[tr_]

print("Loaded:")
print("\n".join(list(models_saves.keys())))

if per_model_colors:  # order runs for legend order as in per_model_colors, with corresponding colors
    ordered_labels = OrderedDict()
    for teacher_type in per_model_colors.keys():
        for k, v in labels.items():
            if teacher_type in k:
                ordered_labels[k] = v
    labels = ordered_labels
else:
    print('not using per_model_color')
    for k in models_saves.keys():
        labels[k] = k
# Plot utils
def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_loc='best', title=None,
                    ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=13.0, linewidth=10.0,
                    labelsize=20, fontsize=20, title_fontsize=30,
                    zorder=None, xlabel='Env steps', ylabel='Perf', linestyle="-", xnbins=3, ynbins=3):
    """Plot the curve y(x) with a shaded y +/- err region (skipped when err is all zeros)."""
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=xnbins)
    ax.locator_params(axis='y', nbins=ynbins)
    ax.tick_params(axis='y', which='both', labelsize=labelsize)
    ax.tick_params(axis='x', which='both', labelsize=labelsize * 0.8)
    # ax.tick_params(axis='both', which='both', labelsize="small")

    # ax.scatter(x, y, color=color, linewidth=linewidth, zorder=zorder, linestyle=linestyle)
    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder, linestyle=linestyle)
    if not np.array_equal(err, np.zeros_like(err)):
        ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)

    if legend:
        leg = ax.legend(loc=leg_loc, **leg_args)
        for legobj in leg.legendHandles:
            legobj.set_linewidth(leg_linewidth)

    ax.set_xlabel(xlabel, fontsize=fontsize)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=2)
    ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=title_fontsize)
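# Minimal usage sketch (hypothetical data, not part of the pipeline):
#   fig, axis = plt.subplots()
#   xs = np.arange(10)
#   plot_with_shade(0, axis, xs, np.sqrt(xs), 0.1 * np.ones(10),
#                   "blue", "blue", "demo", xlim=[0, 9], ylim=[0, 4])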
# only one figure is drawn (adding loops over several metrics could come later)
assert len(metrics) == 1

f, ax = plt.subplots(1, 1, figsize=(9.0, 9.0))
if len(metrics) == 1:
    ax = [ax]

# max_y = -np.inf
min_y = np.inf
max_steps = 0

exclude_patterns = []

metric = metrics[0]
ylabel = {
    "success_rate_mean": "Success rate (%)",
    "exploration_bonus_mean": "Exploration bonus",
    "NPC_intro": "Successful introduction (%)",
}.get(metric, metric)

# for metric_i, metric in enumerate(metrics):
default_colors = default_colors_.copy()
if study_train:
    for model_i, model_id in enumerate(models_saves.keys()):
        # excluding some experiments
        if any([ex_pat in model_id for ex_pat in exclude_patterns]):
            continue
        if len(include_patterns) > 0:
            if not any([in_pat in model_id for in_pat in include_patterns]):
                continue

        runs_data = models_saves[model_id]['data']
        ys = []

        # drop a duplicated csv header row if one slipped into the data
        if runs_data[0]['frames'][1] == 'frames':
            runs_data[0]['frames'] = list(filter(('frames').__ne__, runs_data[0]['frames']))

        if per_seed:
            min_len = None
        else:
            # keep only the top_n longest runs and truncate them to a common length
            lens = [len(run['frames']) for run in runs_data if len(run['frames'])]
            minimum = sorted(lens)[-min(top_n, len(lens))]
            min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) >= minimum])
            runs_data = [run for run in runs_data if len(run['frames']) >= minimum]
            # min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) > 10])

        # compute env steps (x axis)
        longest_id = np.argmax([len(rd['frames']) for rd in runs_data])
        steps = np.array(runs_data[longest_id]['frames'], dtype=np.int64) / steps_denom
        steps = steps[:min_len]

        for run in runs_data:
            if metric not in run:
                raise ValueError(f"Metric {metric} not found. Possible metrics: {list(run.keys())}")
            data = run[metric]

            # checking for a duplicated header row
            if data[1] == metric:
                data = np.array(list(filter((metric).__ne__, data)), dtype=np.float16)

            if per_seed:
                ys.append(data)
            else:
                if len(data) >= min_len:
                    # discard extra datapoints
                    if len(data) > min_len:
                        print("{} run has too many datapoints ({}); discarding {}".format(
                            model_id, len(data), len(data) - min_len))
                    data = data[0:min_len]
                    ys.append(data)
                else:
                    raise ValueError("data is shorter than min_len, which should be impossible: "
                                     "runs were filtered to length >= min_len above")

        ys_same_len = ys

        # compute stats
        n_seeds = len(ys_same_len)
        if per_seed:
            sems = np.array(ys_same_len)
            means = np.array(ys_same_len)
            stds = np.zeros_like(means)
            color = default_colors[model_i]
        else:
            sems = np.std(ys_same_len, axis=0) / np.sqrt(len(ys_same_len))  # standard error of the mean
            stds = np.std(ys_same_len, axis=0)  # standard deviation
            means = np.mean(ys_same_len, axis=0)
            color = default_colors[model_i]

        if metric == 'duration':
            means = means / 3600
            sems = sems / 3600
            stds = stds / 3600

        # x-axis bounds
        if per_seed:
            curr_max_steps = np.max(np.max(steps))
        else:
            curr_max_steps = np.max(steps)
        if curr_max_steps > max_steps:
            max_steps = curr_max_steps

        if subsample_step:
            steps = steps[0::subsample_step]
            means = means[0::subsample_step]
            stds = stds[0::subsample_step]
            sems = sems[0::subsample_step]
            ys_same_len = [y[0::subsample_step] for y in ys_same_len]

        # display seeds separately
        if per_seed:
            for s_i, seed_ys in enumerate(ys_same_len):
                label = label_parser(model_id)
                if study_eval:
                    label = label + "_train_"
                label = label + f"(s:{s_i})"
                if label in color_dict:
                    color = color_dict[label]
                else:
                    color = default_colors[model_i * 20 + s_i]
                curve_ID = f"{model_id}_{s_i}"
                assert np.array_equal(stds, np.zeros_like(stds))
                if smooth_factor:
                    # smooth the per-seed curve that is actually plotted
                    seed_ys = smooth(seed_ys, smooth_factor)
                to_plot_dict[curve_ID] = {
                    "label": label,
                    "steps": steps,
                    "means": seed_ys,
                    "stds": stds,
                    "ys": ys_same_len,
                    "color": color
                }
        else:
            label = label_parser(model_id)
            if study_eval:
                label = label + "(train)"

            if color_dict and label in color_dict:
                color = color_dict[label]
            else:
                color = default_colors[model_i]

            if smooth_factor:
                means = smooth(means, smooth_factor)
                stds = smooth(stds, smooth_factor)

            to_plot_dict[model_id] = {
                "label": label,
                "steps": steps,
                "means": means,
                "stds": stds,
                "sems": sems,
                "ys": ys_same_len,
                "color": color,
            }
if study_eval:
    print("Evaluation")

    # evaluation sets
    number_of_eval_envs = max([len(v.keys()) for v in model_eval_data.values()], default=0)
    if number_of_eval_envs == 0:
        print("No eval envs")
        sys.exit()
    if plot_aggregated_test:
        number_of_eval_envs += 1

    default_colors = default_colors_.copy()

    test_summary_dict = defaultdict(dict)
    test_summary_dict_colors = defaultdict(dict)

    for model_i, model_id in enumerate(model_eval_data.keys()):
        # excluding some experiments
        if any([ex_pat in model_id for ex_pat in exclude_patterns]):
            continue
        if len(include_patterns) > 0:
            if not any([in_pat in model_id for in_pat in include_patterns]):
                continue

        # test envs
        test_envs = model_eval_data[model_id].items()

        # filter unwanted eval envs
        if test_envs_to_plot is not None:
            test_envs = [(name, data) for name, data in test_envs if name in test_envs_to_plot]

        if sort_test:
            test_envs_sorted = list(sorted(test_envs, key=lambda kv: sort_test_set(kv[0])))
        else:
            test_envs_sorted = list(test_envs)

        if plot_aggregated_test:
            agg_means = []

        # compute stats
        for env_i, (test_env, env_data) in enumerate(test_envs_sorted):
            ys_same_len = env_data["values"]
            steps = env_data["steps"].mean(0) / steps_denom

            n_seeds = len(ys_same_len)
            if per_seed:
                sems = np.array(ys_same_len)
                stds = np.array(ys_same_len)
                means = np.array(ys_same_len)
                color = default_colors[model_i]
                curr_max_steps = np.max(np.max(steps))
            else:
                sems = np.std(ys_same_len, axis=0) / np.sqrt(len(ys_same_len))  # standard error of the mean
                stds = np.std(ys_same_len, axis=0)  # standard deviation
                means = np.mean(ys_same_len, axis=0)
                color = default_colors[model_i]
                curr_max_steps = np.max(steps)

            if plot_aggregated_test:
                agg_means.append(means)

            x_lim = max(steps[-1], x_lim)
            x_lim = min(max_x_lim, x_lim)

            eval_metric_name = {
                "test_success_rates": "Success rate",
                'exploration_bonus_mean': "Exploration bonus",
            }.get(eval_metric, eval_metric)

            test_env_name = test_env.replace("Env", "").replace("Test", "")
            env_types = ["InformationSeeking", "Collaboration", "PerspectiveTaking"]
            for env_type in env_types:
                if env_type in test_env_name:
                    test_env_name = test_env_name.replace(env_type, "")
                    test_env_name += f"\n({env_type})"

            if per_seed:
                for s_i, seed_ys in enumerate(ys_same_len):
                    label = label_parser(model_id) + f"_{test_env}" + f"(s:{s_i})"
                    if eval_smooth_factor:
                        seed_ys = smooth(seed_ys, eval_smooth_factor)
                    curve_ID = f"{model_id}_{test_env}_{s_i}"
                    to_plot_dict[curve_ID] = {
                        "label": label,
                        "steps": steps,
                        "means": seed_ys,
                        "stds": np.zeros_like(seed_ys),
                        "ys": ys_same_len,
                        "color": color
                    }
            else:
                if len(test_envs_sorted) > 1:
                    label = label_parser(model_id) + f"_{test_env}"
                else:
                    label = label_parser(model_id)

                if study_train:
                    label = label + "(test)"

                if not plot_only_aggregated_test:
                    if label in color_dict:
                        color = color_dict[label]
                    else:
                        color = default_colors[model_i * len(test_envs_sorted) + env_i]

                    if legend_show_n_seeds:
                        label = label + "({})".format(n_seeds)

                    if eval_smooth_factor:
                        means = smooth(means, eval_smooth_factor)
                        stds = smooth(stds, eval_smooth_factor)
                        sems = smooth(sems, eval_smooth_factor)

                    to_plot_dict[model_id + f"_{test_env}"] = {
                        "label": label,
                        "steps": steps,
                        "means": means,
                        "stds": stds,
                        "sems": sems,
                        "ys": ys_same_len,
                        "color": color,
                    }
        if plot_aggregated_test:
            # aggregate the per-env mean curves into one curve per model
            ys_same_len = agg_means
            agg_means = np.array(agg_means)
            agg_mean = agg_means.mean(axis=0)
            agg_std = agg_means.std(axis=0)  # standard deviation across envs
            agg_sems = agg_means.std(axis=0) / np.sqrt(len(agg_means))  # standard error across envs

            label = label_parser(model_id)
            if study_train:
                label = label + "(test)"  # distinguish from the train curve

            if eval_smooth_factor and not per_seed:
                agg_mean = smooth(agg_mean, eval_smooth_factor)
                agg_std = smooth(agg_std, eval_smooth_factor)
                agg_sems = smooth(agg_sems, eval_smooth_factor)

            if per_seed:
                print("Aggregated curves are not smoothed in per-seed mode")
                for s_i, (seed_ys, seed_st) in enumerate(zip(agg_mean, agg_std)):
                    seed_c = default_colors[model_i + s_i]
                    label = str(s_i)
                    to_plot_dict[f"{model_id}_agg_test_{s_i}"] = {
                        "label": label,
                        "steps": steps,
                        "means": seed_ys,
                        "stds": seed_st,
                        "ys": ys_same_len,
                        "color": seed_c
                    }
            else:
                if label in color_dict:
                    color = color_dict[label]
                else:
                    color = default_colors[model_i]
                to_plot_dict[model_id + "_agg_test"] = {
                    "label": label,
                    "steps": steps,
                    "means": agg_mean,
                    "stds": agg_std,
                    "sems": agg_sems,
                    "ys": ys_same_len,
                    "color": color,
                }
# statistical comparisons between pairs of curves; note that the to_compare entries
# must match to_plot_dict keys (run names, not parsed labels)
to_scatter_dict = {}
if to_compare is not None:
    for comp_i, (a_model_id, b_model_id, color) in enumerate(to_compare):
        a_data = to_plot_dict[a_model_id]["ys"]
        b_data = to_plot_dict[b_model_id]["ys"]
        steps = to_plot_dict[a_model_id]["steps"]

        if color == "auto_color":
            color = to_plot_dict[a_model_id]["color"]

        if len(a_data[0]) != len(b_data[0]):
            # keep only the steps present in both runs (sorted, so that the
            # step order matches the masked data columns)
            a_steps = to_plot_dict[a_model_id]["steps"]
            b_steps = to_plot_dict[b_model_id]["steps"]
            steps = sorted(set(a_steps) & set(b_steps))

            mask_a = [(a_s in steps) for a_s in a_steps]
            a_data = np.array(a_data)[:, mask_a]
            mask_b = [(b_s in steps) for b_s in b_steps]
            b_data = np.array(b_data)[:, mask_b]

        # Welch's t-test (unequal variances) at every evaluation step
        pvals = stats.ttest_ind(
            a_data,
            b_data,
            equal_var=False
        ).pvalue

        # keep only the steps where the difference is significant
        steps = [s for s, p in zip(steps, pvals) if p < test_p]
        ys = [1.02 + 0.02 * comp_i] * len(steps)
        to_scatter_dict[f"compare_{a_model_id}_{b_model_id}"] = {
            "label": "",
            "xs": steps,
            "ys": ys,
            "color": color,
        }
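# The significance markers are drawn as a row of "x" at y = 1.02 + 0.02 * comp_i,
# i.e. just above the curves; each mark flags a step where Welch's t-test between
# the two runs' seeds gives p < test_p (0.05 by default).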
for scatter_i, (scatter_ID, scatter_id_data) in enumerate(to_scatter_dict.items()):
    # unpack data
    label, xs, ys, color = (
        scatter_id_data["label"],
        scatter_id_data["xs"],
        scatter_id_data["ys"],
        scatter_id_data["color"],
    )
    plt.scatter(
        xs,
        ys,
        color=color,
        marker="x"
    )
    if xs:  # guard against comparisons with no significant steps
        summary_dict[label] = xs[-1]
        summary_dict_colors[label] = color
for curve_i, (curve_ID, model_id_data) in enumerate(to_plot_dict.items()):
    # unpack data ("sems" is absent for per-seed curves, hence .get)
    label, steps, means, stds, sems, ys, color = (
        model_id_data["label"],
        model_id_data["steps"],
        model_id_data["means"],
        model_id_data["stds"],
        model_id_data.get("sems"),
        model_id_data["ys"],
        model_id_data["color"]
    )

    # if smooth_factor:
    #     means = smooth(means, smooth_factor)
    #     stds = smooth(stds, smooth_factor)

    if legend_show_n_seeds:
        n_seeds = len(ys)
        label = label + "({})".format(n_seeds)

    x_lim = max(steps[-1], x_lim)
    x_lim = min(max_x_lim, x_lim)

    xlabel = "Env steps (1e6)"
    plot_with_shade(
        0, ax[0], steps, means, stds, color, color, label,
        # 0, ax[0], steps, means, sems, color, color, label,
        legend=draw_legend,
        xlim=[0, x_lim],
        ylim=[0, max_y],
        xlabel=xlabel,
        ylabel=ylabel,
        title=None,
        labelsize=fontsize,
        fontsize=fontsize,
        title_fontsize=title_fontsize,
        linewidth=linewidth,
        leg_linewidth=5,
        leg_args=leg_args,
        xnbins=xnbins,
        ynbins=ynbins,
    )
    summary_dict[label] = means[-1]
    summary_dict_colors[label] = color
# plot static lines
if static_lines:
    for label, (mean, std, color) in static_lines.items():
        if label == "":
            label = None
        plot_with_shade(
            0, ax[0], steps, np.array([mean] * len(steps)), np.array([std] * len(steps)), color, color, label,
            legend=True,
            xlim=[0, x_lim],
            ylim=[0, 1.0],
            xlabel="Env steps (1e6)",
            ylabel=ylabel,
            linestyle=":",
            leg_args=leg_args,
            fontsize=fontsize,
            title_fontsize=title_fontsize,
            xnbins=xnbins,
            ynbins=ynbins,
        )

if save and plot_path:
    f.savefig(plot_path + ".png")
    f.savefig(plot_path + ".svg")
    print(f"Plot saved to {plot_path}.[png/svg].")
# Summary dict
if len(summary_dict) == 0:
    raise ValueError(f"No experiments found for {load_pattern}.")
else:
    # print summary
    best = max(summary_dict.values())
    pc = 0.3
    n = int(len(summary_dict) * pc)
    print("top n: ", n)
    # note: [-0:] would return the whole list, hence the guard
    top_pc = sorted(summary_dict.values())[-n:] if n > 0 else []
    bottom_pc = sorted(summary_dict.values())[:n]
    print("legend:")
    cprint("\tbest", "green")
    cprint("\ttop {:.0f}%".format(pc * 100), "blue")
    cprint("\tbottom {:.0f}%".format(pc * 100), "red")
    print("\tothers")
    print()
    for l, p in sorted(summary_dict.items(), key=lambda kv: kv[1]):
        c = summary_dict_colors[l]
        if p == best:
            cprint("label: {} ({})".format(l, c), "green")
            cprint("\t {}:{}".format(metric, p), "green")
        elif p in top_pc:
            cprint("label: {} ({})".format(l, c), "blue")
            cprint("\t {}:{}".format(metric, p), "blue")
        elif p in bottom_pc:
            cprint("label: {} ({})".format(l, c), "red")
            cprint("\t {}:{}".format(metric, p), "red")
        else:
            print("label: {} ({})".format(l, c))
            print("\t {}:{}".format(metric, p))
if show_plot:
    plt.tight_layout()
    plt.subplots_adjust(hspace=1.5, wspace=0.5, left=0.1, right=0.9, bottom=0.1, top=0.85)
    plt.suptitle(super_title)
    plt.show()
    plt.close()