Spaces:

openGPT-X
/

european-llm-leaderboard

Running

KlaudiaTH

Refactorings and fixes for tab handling and few-shot selection

625e239 4 months ago

5.88 kB

	import itertools
	import os
	import re

	import gradio as gr
	import numpy as np
	import pandas as pd
	import plotly.express as px
	from datasets import load_dataset

	import style

	# Task groups removed from the selectable tasks when few-shot mode is
	# active (see get_available_task_groups).
	ZERO_SHOT_ONLY = ["BELEBELE"]
	# Task groups removed from the selectable tasks when few-shot mode is
	# not active (see get_available_task_groups).
	FEW_SHOT_ONLY = ["GSM8K", "TruthfulQA"]


	def init():
	    """Load the leaderboard dataset and fill the module-level state.

	    The dataset repository, config and split are read from the
	    ``OGX_LEADERBOARD_DATASET_NAME``, ``OGX_LEADERBOARD_DATASET_CONFIG`` and
	    ``OGX_LEADERBOARD_DATASET_SPLIT`` environment variables. From the loaded
	    table the function derives:

	    * ``task_group_names_list``: the distinct ``Task_Group`` values,
	    * ``task_group_type_dict``: ``Task_Group`` -> ``Task_Type``,
	    * ``task_groups_shots_dict``: ``Task_Group`` -> ``Number_Shots`` taken
	      from the rows whose ``Few_Shot`` flag is true,
	    * ``languages_list``: the distinct ``Language`` values, upper-cased,
	    * ``model_type_dict``: ``Model_Name`` -> ``Model_Type``,
	    * ``hidden_df``: the table pivoted to one row per model with
	      ``(Task_Group, Few_Shot, Language)`` columns plus a ``Type`` column
	      holding the symbol from ``style.T_SYMBOLS`` for the model's type.
	    """
	    global repo_id, config_name, split_name
	    global hidden_df, task_group_names_list, task_group_type_dict
	    global task_groups_shots_dict, languages_list, model_type_dict

	    def unique_mapping(frame, key_col, value_col):
	        # De-duplicate the (key, value) pairs and turn them into a dict.
	        pairs = frame[[key_col, value_col]].drop_duplicates()
	        return pairs.set_index(key_col)[value_col].to_dict()

	    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
	    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
	    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

	    hidden_df = load_dataset(repo_id, config_name, split=split_name).to_pandas()

	    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
	    task_group_type_dict = unique_mapping(hidden_df, "Task_Group", "Task_Type")
	    few_shot_rows = hidden_df[hidden_df["Few_Shot"].eq(True)]
	    task_groups_shots_dict = unique_mapping(few_shot_rows, "Task_Group", "Number_Shots")
	    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
	    model_type_dict = unique_mapping(hidden_df, "Model_Name", "Model_Type")

	    # Wide layout: one row per model, one column per (task, shots, language).
	    pivoted = hidden_df.pivot_table(
	        values="Value",
	        index=["Model_Name"],
	        columns=["Task_Group", "Few_Shot", "Language"],
	        dropna=False,
	    )
	    hidden_df = pivoted.reset_index()

	    hidden_df["Type"] = hidden_df["Model_Name"].apply(
	        lambda model_name: style.T_SYMBOLS[model_type_dict[model_name]]
	    )


	def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
	    """Return ``df`` with the identifying columns first and task columns sorted.

	    The result starts with ``Type``, ``Model_Name`` and ``Average``, followed
	    by every other column in sorted order. ``fewshot`` is accepted but not
	    used.
	    """
	    leading = ["Type", "Model_Name", "Average"]
	    ordered_tasks = sorted(get_task_columns(df))
	    return df.reindex(columns=leading + ordered_tasks)


	def get_task_columns(df: pd.DataFrame) -> list:
	    """Return the column labels of ``df`` that are not identifying columns.

	    The identifying columns ``Model_Name``, ``Average`` and ``Type`` are
	    removed from the list of column labels; the remaining labels keep their
	    original order.

	    Raises:
	        ValueError: if any of the three identifying columns is missing.
	    """
	    columns = list(df.columns)
	    for meta_column in ("Model_Name", "Average", "Type"):
	        columns.remove(meta_column)
	    return columns


	def get_models(df: pd.DataFrame) -> np.ndarray:
	    """Return the distinct values of the ``Model_Name`` column of ``df``.

	    The values come straight from ``Series.unique()``, i.e. an array in
	    order of first appearance, not a DataFrame.
	    """
	    return df["Model_Name"].unique()


	def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
	    """Return the rows of ``df`` whose ``Type`` value is in ``model_types``."""
	    wanted = df["Type"].isin(model_types)
	    return df.loc[wanted]


	def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
	    """Keep only rows whose model name matches the search query.

	    ``query`` may hold several search terms separated by ``;``; a row is
	    kept when its ``Model_Name`` contains any of them (case-insensitive).
	    Surrounding whitespace and empty terms are ignored, and an empty query
	    keeps every row.

	    Terms are interpreted as regular expressions. If the combined pattern
	    is not a valid regular expression (for example a lone ``(``), the terms
	    are matched as literal text instead of raising.
	    """
	    terms = [term.strip() for term in query.split(";")]
	    terms = [term for term in terms if term]

	    # A bare "|" is regex alternation; the escaped form "\|" would only
	    # match a literal pipe character.
	    pattern = "|".join(terms)
	    try:
	        re.compile(pattern)
	    except re.error:
	        # Free-text input from the search box: fall back to a literal match.
	        pattern = "|".join(re.escape(term) for term in terms)

	    return df[df["Model_Name"].str.contains(pattern, case=False)]


	def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list) -> pd.DataFrame:
	    """Aggregate results over ``langs`` for each task in ``tasks``.

	    For every task the per-language columns ``<task>_<lang>`` are averaged
	    into a single ``<task>`` column. If a language does not exist for a
	    task, or a model has no value for one of the languages, the aggregate
	    for that task is NaN. An ``Average`` column holds the mean of the task
	    aggregates of each row (NaN aggregates are skipped).

	    The input frame is left untouched; the aggregation runs on a copy, so
	    the function can be called repeatedly on the same frame.

	    Args:
	        df: frame whose column labels are ``(task, language)`` tuples (empty
	            parts are dropped when flattening) or already-flat strings. It
	            must contain ``Type`` and ``Model_Name``.
	        tasks: task names to aggregate.
	        langs: language codes; they are lower-cased before the lookup.

	    Returns:
	        A frame with the columns ``Type``, ``Model_Name``, ``Average``
	        followed by one column per task.
	    """
	    out = df.copy()
	    # ("ARC", "de") -> "ARC_de"; ("Model_Name", "") -> "Model_Name".
	    # Plain string labels are kept as they are.
	    out.columns = [
	        "_".join(filter(None, col)) if isinstance(col, tuple) else col
	        for col in out.columns
	    ]
	    available = set(out.columns)
	    langs_lower = [lang.lower() for lang in langs]

	    for task in tasks:
	        lang_cols = [f"{task}_{lang}" for lang in langs_lower]
	        if lang_cols and available.issuperset(lang_cols):
	            # skipna=False: one missing language makes the aggregate NaN.
	            out[task] = out[lang_cols].mean(axis=1, skipna=False)
	        else:
	            out[task] = np.nan

	    out["Average"] = out[tasks].mean(axis=1)
	    return out[["Type", "Model_Name", "Average", *tasks]]


	def select_shots(df: pd.DataFrame, fewshot: bool = False):
	    """Return the columns of ``df`` for the requested shot setting.

	    Columns whose second level equals ``fewshot`` are kept, followed by the
	    model name and type icon columns; the second column level is then
	    dropped from the result.
	    """
	    selected = [column for column in df.columns if column[1] == fewshot]
	    # Model name and type icon go last.
	    selected.extend([("Model_Name", "", ""), ("Type", "", "")])
	    subset = df[selected]
	    return subset.droplevel(level=1, axis="columns")


	def update_df(
	    tasks: list[str],
	    model_query: str,
	    langs: list[str],
	    model_types: list[str],
	    fewshot: bool = False,
	    format: bool = True,
	) -> pd.DataFrame:
	    """Build the leaderboard table for the current selection.

	    Starting from the module-level ``hidden_df``, the function keeps the
	    columns of the requested shot setting, aggregates the selected
	    languages per task, filters the models by the search query and the
	    model types, and orders the columns.

	    When ``format`` is true the result is the pandas Styler of that table,
	    rendered with a precision of 2, ``.`` as decimal separator and ``N/A``
	    for missing values; otherwise the plain DataFrame is returned.
	    """
	    # Narrow down step by step: shots -> languages -> models.
	    shot_df = select_shots(hidden_df, fewshot)
	    aggregated = aggregate_langs(shot_df, tasks, langs)
	    matching = search_model(aggregated, model_query)
	    filtered = filter_type(matching, model_types)

	    ordered = sort_cols(filtered, fewshot)
	    if not format:
	        return ordered
	    return ordered.style.format(precision=2, decimal=".", na_rep="N/A")


	def update_task_groups_and_fewshot(current_selected_tab: int, is_fewshot_current: bool = False):
	    """Compute the task checkbox and few-shot radio state for a tab.

	    Tab 0 keeps the current few-shot choice and leaves the radio
	    interactive. Any other valid tab forces zero-shot and disables the
	    radio. The task choices are derived from the few-shot value that will
	    actually be shown, so the checkbox group and the radio always agree.

	    Args:
	        current_selected_tab: index of the selected tab (see
	            get_selected_task_type for the valid ids).
	        is_fewshot_current: few-shot choice before the tab change.

	    Returns:
	        A list ``[checkbox_group, radio, current_selected_tab]`` where all
	        available task groups are pre-selected in the checkbox group.

	    Raises:
	        KeyError: if ``current_selected_tab`` is not a known tab id
	            (raised by get_selected_task_type).
	    """
	    selected_task_type = get_selected_task_type(current_selected_tab)

	    # Only the first tab offers the few-shot choice.
	    fewshot_available = current_selected_tab == 0
	    is_fewshot_new = is_fewshot_current if fewshot_available else False

	    # Use the new few-shot value, not the incoming one: on a tab that
	    # forces zero-shot the choices must match the zero-shot setting.
	    available_tasks = get_available_task_groups(selected_task_type, is_fewshot_new)

	    tasks_checkbox_group_update = gr.CheckboxGroup(
	        choices=available_tasks,
	        value=available_tasks.copy(),
	    )
	    fewshot_radio_update = gr.Radio(
	        value=is_fewshot_new,
	        interactive=fewshot_available,
	    )

	    return [tasks_checkbox_group_update, fewshot_radio_update, current_selected_tab]


	def get_selected_task_type(task_type_id):
	    """Translate a tab id into its task type name.

	    Id 0 maps to ``"accuracy"`` and id 1 to ``"misc"``; any other id raises
	    ``KeyError``.
	    """
	    return {0: "accuracy", 1: "misc"}[task_type_id]


	def get_available_task_groups(selected_task_type, fewshot):
	    """List the task groups of a task type that fit the shot setting.

	    Task groups are taken from ``task_group_type_dict`` in its iteration
	    order. With ``fewshot`` set, the groups in ``ZERO_SHOT_ONLY`` are left
	    out; otherwise the groups in ``FEW_SHOT_ONLY`` are left out.
	    """
	    excluded = ZERO_SHOT_ONLY if fewshot else FEW_SHOT_ONLY
	    return [
	        group_name
	        for group_name, group_type in task_group_type_dict.items()
	        if group_type == selected_task_type and group_name not in excluded
	    ]


	# Runs at import time: loads the dataset and fills the module-level
	# globals that the functions above read.
	init()