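"""Gradio app for the Open RL Leaderboard.

A background routine looks for reinforcement-learning models on the Hugging
Face Hub, evaluates the ones that have not been scored yet, and uploads one
JSON report per model to a results dataset repo. The Gradio UI renders those
reports as a per-environment leaderboard.
"""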
import glob
import json
import os
import pprint
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.css_html_js import dark_mode_gradio_js
from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
from src.evaluation import ALL_ENV_IDS, evaluate
from src.logging import configure_root_logger, setup_logger
configure_root_logger()
logger = setup_logger(__name__)
pp = pprint.PrettyPrinter(width=80)
def model_hyperlink(link, model_id):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_id}</a>'


def make_clickable_model(model_id):
    link = f"https://huggingface.co/{model_id}"
    return model_hyperlink(link, model_id)
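

# Backend routine:
# 1. list the RL models on the Hub and keep those that ship an "agent.pt" file,
# 2. download the results dataset and collect the (model_id, sha) pairs that
#    already have a result file,
# 3. evaluate the remaining models and upload one JSON report per model.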
def _backend_routine():
    # List only the reinforcement learning models
    rl_models = list(API.list_models(filter="reinforcement-learning"))
    logger.info(f"Found {len(rl_models)} RL models")
    compatible_models = []
    for model in rl_models:
        filenames = [sib.rfilename for sib in model.siblings]
        if "agent.pt" in filenames:
            compatible_models.append((model.modelId, model.sha))
    logger.info(f"Found {len(compatible_models)} compatible models")

    # Get the results
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )
    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)

    evaluated_models = set()
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))

    # Find the models that are not associated with any results
    pending_models = set(compatible_models) - evaluated_models
    logger.info(f"Found {len(pending_models)} pending models")

    # Run an evaluation on the models
    for model_id, sha in pending_models:
        logger.info(f"Running evaluation on {model_id}")
        report = {"config": {"model_id": model_id, "model_sha": sha}}
        try:
            evaluations = evaluate(model_id, revision=sha)
        except Exception as e:
            logger.error(f"Error evaluating {model_id}: {e}")
            evaluations = None

        if evaluations is not None:
            report["results"] = evaluations
            report["status"] = "DONE"
        else:
            report["status"] = "FAILED"

        # Update the results
        dumped = json.dumps(report, indent=2)
        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as f:
            f.write(dumped)

        # Upload the results to the results repo
        API.upload_file(
            path_or_fileobj=output_path,
            path_in_repo=f"{model_id}/results_{sha}.json",
            repo_id=RESULTS_REPO,
            repo_type="dataset",
        )
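

# Wrapper for the scheduler: log unhandled exceptions instead of letting them
# kill the background job.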
def backend_routine():
    try:
        _backend_routine()
    except Exception as e:
        logger.error(f"{e.__class__.__name__}: {str(e)}")
def get_leaderboard_df():
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )
    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)

    data = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            report = json.load(fp)
        model_id = report["config"]["model_id"]
        row = {"Agent": model_id, "Status": report["status"]}
        if report["status"] == "DONE":
            results = {env_id: result["episodic_return_mean"] for env_id, result in report["results"].items()}
            row.update(results)
        data.append(row)

    # Create DataFrame
    df = pd.DataFrame(data)

    # Replace NaN values with empty strings
    df = df.fillna("")
    return df
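
# A result file consumed above looks like this (env id and values are
# illustrative):
# {
#   "config": {"model_id": "user/model", "model_sha": "<commit sha>"},
#   "results": {"<env_id>": {"episodic_return_mean": 123.4}, ...},
#   "status": "DONE"  # or "FAILED"
# }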
TITLE = """
🚀 Open RL Leaderboard
"""
INTRODUCTION_TEXT = """
Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models.
"""
ABOUT_TEXT = """
The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models.
"""
def select_column(column_name: str, data: pd.DataFrame):
    column_names = ["Agent"] + [column_name]  # add model name column
    df = data[column_names]

    def check_row(row):
        return not (row.drop("Agent") == "").all()

    mask = df.apply(check_row, axis=1)
    df = df[mask]
    df = df.sort_values(by=column_name, ascending=False)
    return df
with gr.Blocks(js=dark_mode_gradio_js) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # Hidden dataframe holding the full leaderboard, refreshed every 5 minutes
            hidden_df = gr.components.Dataframe(get_leaderboard_df, visible=False, every=5 * 60)
            env_selector = gr.components.Dropdown(
                label="Environments",
                choices=ALL_ENV_IDS,
                value=ALL_ENV_IDS[0],
            )
            leaderboard = gr.components.Dataframe(select_column(ALL_ENV_IDS[0], get_leaderboard_df()))

            # Events
            env_selector.change(select_column, [env_selector, hidden_df], leaderboard)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT)
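
# Run the backend routine periodically so newly pushed models get evaluated.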
scheduler = BackgroundScheduler()
scheduler.add_job(func=backend_routine, trigger="interval", seconds=30)  # every 30 seconds
scheduler.start()
if __name__ == "__main__":
    demo.queue().launch()