Spaces:

open-llm-leaderboard
/

comparator

Running

App Files Files Community

comparator / app.py

albertvillanova HF staff

Schedule Space restart to update list of models

8a91492 verified 2 days ago

raw

history blame contribute delete

11.5 kB

	import gradio as gr
	from apscheduler.schedulers.background import BackgroundScheduler

	import src.constants as constants
	from src.details import (
	clear_details,
	display_details,
	display_loading_message_for_details,
	load_details,
	update_load_details_component,
	update_sample_idx_component,
	update_subtasks_component,
	update_task_description_component,
	)
	from src.env_impact import plot_env_impact
	from src.hub import restart_space
	from src.model_tree import load_model_tree
	from src.results import (
	clear_results,
	clear_results_file,
	display_loading_message_for_results,
	display_results,
	download_results,
	load_result_paths_per_model,
	load_results,
	plot_results,
	update_tasks_component,
	)


	# if __name__ == "__main__":

	with gr.Blocks(fill_height=True, fill_width=True) as demo:
	    # NOTE(review): the nesting of the layout containers (Row / Tab / Accordion)
	    # is reconstructed from statement order and blank-line grouping; confirm
	    # against the rendered UI.

	    # ------------------------------------------------------------------
	    # Header
	    # ------------------------------------------------------------------
	    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
	    gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
	    gr.HTML(
	        "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
	    )
	    gr.Markdown(
	        "Compare Results of the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). "
	        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
	    )

	    # ------------------------------------------------------------------
	    # Model selection
	    # ------------------------------------------------------------------
	    with gr.Row():
	        model_ids = gr.Dropdown(label="Models", multiselect=True)
	        # Per-session state written by load_result_paths_per_model on page load.
	        result_paths_per_model = gr.State()

	    with gr.Accordion("Model tree: Compare base and derived models", open=False):
	        load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
	        # One label for the base model type, followed by one per derived type.
	        base_label = constants.BASE_MODEL_TYPE[0]
	        derived_labels = [model_type[0] for model_type in constants.DERIVED_MODEL_TYPES]
	        model_tree_labels = [base_label, *derived_labels]
	        # The base-model dropdown sits directly in the accordion; the derived
	        # ones are laid out side by side in a row below it.
	        base_and_derived_models = [gr.Dropdown(label=base_label, multiselect=True)]
	        with gr.Row():
	            base_and_derived_models.extend(
	                gr.Dropdown(label=derived_label, multiselect=True, interactive=False)
	                for derived_label in derived_labels
	            )

	    # Task names shared by the task selectors of the tabs below.
	    task_names = list(constants.TASKS.values())

	    # ------------------------------------------------------------------
	    # Tabs
	    # ------------------------------------------------------------------
	    with gr.Row():
	        with gr.Tab("Results"):
	            load_results_btn = gr.Button("Load", interactive=False)
	            clear_results_btn = gr.Button("Clear")
	            results_task = gr.Radio(
	                choices=["All", *task_names],
	                value="All",
	                label="Tasks",
	                info="Evaluation tasks to be displayed",
	                visible=False,
	            )
	            results_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
	            hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
	            with gr.Row():
	                results_plot_1 = gr.Plot(visible=True)
	                results_plot_2 = gr.Plot(visible=True)
	            results = gr.HTML()
	            results_dataframe = gr.State()
	            download_results_btn = gr.Button("Download")
	            results_file = gr.File(visible=False)

	        with gr.Tab("Configs"):
	            load_configs_btn = gr.Button("Load", interactive=False)
	            clear_configs_btn = gr.Button("Clear")
	            configs_task = gr.Radio(
	                choices=["All", *task_names],
	                value="All",
	                label="Tasks",
	                info="Evaluation tasks to be displayed",
	                visible=False,
	            )
	            configs_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
	            show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
	            configs = gr.HTML()

	        with gr.Tab("Details"):
	            details_task = gr.Radio(
	                choices=list(task_names),
	                label="Tasks",
	                info="Evaluation tasks to be loaded",
	                interactive=True,
	            )
	            details_task_description = gr.Textbox(label="Task Description", lines=3)
	            with gr.Row():
	                login_btn = gr.LoginButton(size="sm", visible=False)
	                # No choices up front; this component is an output of
	                # update_subtasks_component, wired to details_task.change below.
	                subtask = gr.Radio(
	                    choices=None,
	                    label="Subtasks",
	                    info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
	                )
	            load_details_btn = gr.Button("Load Details", interactive=False)
	            clear_details_btn = gr.Button("Clear Details")
	            sample_idx = gr.Number(
	                label="Sample Index",
	                info="Index of the sample to be displayed",
	                value=0,
	                minimum=0,
	                visible=False,
	            )
	            details_show_only_differences = gr.Checkbox(
	                label="Show Only Differences", value=False, info="Options"
	            )
	            details = gr.HTML()
	            details_dataframe = gr.State()

	        with gr.Tab("Environmental impact"):
	            gr.Markdown(
	                "The environmental impact calculations we display are derived from the specific inference setup used "
	                "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
	                "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
	                "These results reflect the energy consumption and associated emissions of this configuration, "
	                "providing transparency and insight into the resource requirements of large language model evaluations. "
	                "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
	            )
	            load_env_impact_btn = gr.Button("Load", interactive=False)
	            clear_env_impact_btn = gr.Button("Clear")
	            with gr.Row():
	                env_impact_plot_1 = gr.Plot(visible=True)
	                env_impact_plot_2 = gr.Plot(visible=True)
	            env_impact = gr.HTML()

	    # ------------------------------------------------------------------
	    # Page load: fetch the result paths, then offer their keys as the
	    # choices of the model selector.
	    # ------------------------------------------------------------------
	    demo.load(
	        fn=load_result_paths_per_model,
	        outputs=result_paths_per_model,
	    ).then(
	        fn=lambda x: gr.Dropdown(choices=list(x.keys())),
	        inputs=result_paths_per_model,
	        outputs=model_ids,
	    )

	    # ------------------------------------------------------------------
	    # Buttons: any user input on the model selector makes the four
	    # "load" buttons interactive.
	    # ------------------------------------------------------------------
	    gr.on(
	        triggers=[model_ids.input],
	        fn=lambda: (gr.Button(interactive=True),) * 4,
	        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn],
	    )

	    # ------------------------------------------------------------------
	    # Results / Configs / Environmental impact
	    # ------------------------------------------------------------------
	    # All three "Load" buttons run the same pipeline: loading message,
	    # then load_results, then update_tasks_component on both task selectors.
	    gr.on(
	        triggers=[load_results_btn.click, load_configs_btn.click, load_env_impact_btn.click],
	        fn=display_loading_message_for_results,
	        outputs=[results, configs, env_impact],
	    ).then(
	        fn=load_results,
	        inputs=[result_paths_per_model, model_ids] + base_and_derived_models,
	        outputs=[results_dataframe, results],
	    ).then(
	        fn=update_tasks_component,
	        outputs=[results_task, configs_task],
	    )

	    # Keep the two task selectors mirrored: a user pick on either one is
	    # copied to the other.
	    results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
	    configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)

	    # Whenever results_task changes, refresh the description shown on both tabs.
	    results_task.change(
	        fn=update_task_description_component,
	        inputs=results_task,
	        outputs=results_task_description,
	    ).then(
	        fn=update_task_description_component,
	        inputs=results_task,
	        outputs=configs_task_description,
	    )

	    # Re-render tables and plots when the data, the selected task or a display
	    # option changes; the chain ends by calling clear_results_file.
	    gr.on(
	        triggers=[
	            results_dataframe.change,
	            results_task.change,
	            hide_std_errors.change,
	            show_only_differences.change,
	        ],
	        fn=display_results,
	        inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
	        outputs=[results, configs, env_impact],
	    ).then(
	        fn=plot_results,
	        inputs=[results_dataframe, results_task],
	        outputs=[results_plot_1, results_plot_2],
	    ).then(
	        fn=plot_env_impact,
	        inputs=results_dataframe,
	        outputs=[env_impact_plot_1, env_impact_plot_2],
	    ).then(
	        fn=clear_results_file,
	        outputs=results_file,
	    )

	    download_results_btn.click(
	        fn=download_results,
	        inputs=results,
	        outputs=results_file,
	    )

	    # Any of the three "Clear" buttons runs clear_results, then disables the
	    # model-tree button, makes the model-tree dropdowns non-interactive and
	    # calls clear_results_file.
	    gr.on(
	        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
	        fn=clear_results,
	        outputs=[
	            model_ids,
	            results_dataframe,
	            load_results_btn,
	            load_configs_btn,
	            load_env_impact_btn,
	            results_task,
	            configs_task,
	        ],
	    ).then(
	        fn=lambda: gr.Button(interactive=False),
	        outputs=load_model_tree_btn,
	    ).then(
	        fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
	        outputs=list(base_and_derived_models),
	    ).then(
	        fn=clear_results_file,
	        outputs=results_file,
	    )

	    # ------------------------------------------------------------------
	    # Details
	    # ------------------------------------------------------------------
	    # A task change updates its description, then the login button and the
	    # subtask selector.
	    details_task.change(
	        fn=update_task_description_component,
	        inputs=details_task,
	        outputs=details_task_description,
	    ).then(
	        fn=update_subtasks_component,
	        inputs=details_task,
	        outputs=[login_btn, subtask],
	    )

	    # Re-evaluate the "Load Details" button on user input to the model
	    # selector, the subtask selector or the task selector.
	    gr.on(
	        triggers=[model_ids.input, subtask.input, details_task.input],
	        fn=update_load_details_component,
	        inputs=[model_ids, subtask],
	        outputs=load_details_btn,
	    )

	    load_details_btn.click(
	        fn=display_loading_message_for_details,
	        outputs=details,
	    ).then(
	        fn=load_details,
	        inputs=[subtask, model_ids] + base_and_derived_models,
	        outputs=[details_dataframe, details],
	    ).then(
	        fn=update_sample_idx_component,
	        inputs=details_dataframe,
	        outputs=sample_idx,
	    )

	    # Re-render the details view when the data, the sample index or the
	    # "show only differences" option changes.
	    gr.on(
	        triggers=[
	            details_dataframe.change,
	            sample_idx.change,
	            details_show_only_differences.change,
	        ],
	        fn=display_details,
	        inputs=[details_dataframe, sample_idx, details_show_only_differences],
	        outputs=details,
	    )

	    clear_details_btn.click(
	        fn=clear_details,
	        outputs=[
	            model_ids,
	            details_dataframe,
	            details_task,
	            subtask,
	            load_details_btn,
	            sample_idx,
	        ],
	    )

	    # ------------------------------------------------------------------
	    # Model tree: load_model_tree writes to the base/derived dropdowns.
	    # ------------------------------------------------------------------
	    load_model_tree_btn.click(
	        fn=load_model_tree,
	        inputs=[result_paths_per_model, model_ids],
	        outputs=list(base_and_derived_models),
	    )

	# Run restart_space in the background once per hour
	# (presumably so the list of models read at startup gets refreshed — confirm).
	scheduler = BackgroundScheduler()
	scheduler.add_job(func=restart_space, trigger="interval", hours=1)
	scheduler.start()

	# Start serving the Gradio app.
	demo.launch()