import json
import gradio as gr
import pandas as pd
print("Loading datasets...")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def add_rank(df, compute_average=True):
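    """Add a 1-based "Rank" column, sort descending, and round scores to two decimals.

    If compute_average is True and there is more than one score column, an
    "Average" column is inserted and used for the sort; otherwise the first
    score column is used. Remaining NaNs are rendered as "".
    """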
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
if len(cols_to_rank) == 1:
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
else:
if compute_average:
df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
df.sort_values("Average", ascending=False, inplace=True)
else:
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
df.insert(0, "Rank", list(range(1, len(df) + 1)))
df = df.round(2)
# Fill NaN after averaging
df.fillna("", inplace=True)
return df
def make_clickable_model(model_name, link=None):
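    """Render the model name as an HTML link (to `link`, or to its Hugging Face
    repo by default), showing only the part after the last "/"."""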
if link is None:
link = "https://huggingface.co/" + model_name
# Remove user from model name
return (
f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
)
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
with open('all_results.json', 'r') as f:
ALL_RESULTS = json.load(f)
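# Expected layout of all_results.json (inferred from the lookups below):
#   {model_name: {"model_size": ..., "model_link": ...,
#                 "zero_shot" / "five_shot": {dataset: {run_id: {metric: value, ...}}}}}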
MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
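    """Build the Cross-MMLU overall table: per-model Accuracy, Cross-Lingual
    Consistency, and AC3, each averaged over the runs stored under `eval_mode`;
    models with missing or malformed results get -1."""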
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
try:
overall_acc = [results['overall_acc'] for results in results_list]
overall_acc = sum(overall_acc) / len(overall_acc)
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
AC3_3 = [results['AC3_3'] for results in results_list]
AC3_3 = sum(AC3_3) / len(AC3_3)
        except Exception:
print(results_list)
consistency_score_3 = -1
overall_acc = -1
AC3_3 = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"AC3": AC3_3,
"Cross-Lingual Consistency": consistency_score_3,
"Accuracy": overall_acc,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=False)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
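    """Build the Cross-MMLU per-language accuracy table (seven languages),
    averaged over the runs stored under `eval_mode`; missing results become -1."""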
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
try:
English = [results['language_acc']['English'] for results in results_list]
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
Chinese = [results['language_acc']['Chinese'] for results in results_list]
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
Filipino = [results['language_acc']['Filipino'] for results in results_list]
Spanish = [results['language_acc']['Spanish'] for results in results_list]
Malay = [results['language_acc']['Malay'] for results in results_list]
English = sum(English) / len(English)
Vietnamese = sum(Vietnamese) / len(Vietnamese)
Chinese = sum(Chinese) / len(Chinese)
Indonesian = sum(Indonesian) / len(Indonesian)
Filipino = sum(Filipino) / len(Filipino)
Spanish = sum(Spanish) / len(Spanish)
Malay = sum(Malay) / len(Malay)
        except Exception:
print(results_list)
English = -1
Vietnamese = -1
Chinese = -1
Indonesian = -1
Filipino = -1
Spanish = -1
Malay = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"English": English,
"Vietnamese": Vietnamese,
"Chinese": Chinese,
"Indonesian": Indonesian,
"Filipino": Filipino,
"Spanish": Spanish,
"Malay": Malay,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
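    """Same aggregation as get_data_cross_mmlu_overall, but for Cross-LogiQA."""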
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
try:
overall_acc = [results['overall_acc'] for results in results_list]
overall_acc = sum(overall_acc) / len(overall_acc)
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
AC3_3 = [results['AC3_3'] for results in results_list]
AC3_3 = sum(AC3_3) / len(AC3_3)
        except Exception:
print(results_list)
consistency_score_3 = -1
overall_acc = -1
AC3_3 = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"AC3": AC3_3,
"Cross-Lingual Consistency": consistency_score_3,
"Accuracy": overall_acc,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=False)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")
def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
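    """Same aggregation as get_data_cross_mmlu_language, but for Cross-LogiQA."""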
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
try:
English = [results['language_acc']['English'] for results in results_list]
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
Chinese = [results['language_acc']['Chinese'] for results in results_list]
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
Filipino = [results['language_acc']['Filipino'] for results in results_list]
Spanish = [results['language_acc']['Spanish'] for results in results_list]
Malay = [results['language_acc']['Malay'] for results in results_list]
English = sum(English) / len(English)
Vietnamese = sum(Vietnamese) / len(Vietnamese)
Chinese = sum(Chinese) / len(Chinese)
Indonesian = sum(Indonesian) / len(Indonesian)
Filipino = sum(Filipino) / len(Filipino)
Spanish = sum(Spanish) / len(Spanish)
Malay = sum(Malay) / len(Malay)
        except Exception:
print(results_list)
English = -1
Vietnamese = -1
Chinese = -1
Indonesian = -1
Filipino = -1
Spanish = -1
Malay = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"English": English,
"Vietnamese": Vietnamese,
"Chinese": Chinese,
"Indonesian": Indonesian,
"Filipino": Filipino,
"Spanish": Spanish,
"Malay": Malay,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
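    """Build the SG_EVAL table: per-model accuracy averaged over the runs
    stored under `eval_mode`; missing results become -1."""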
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
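    """Same aggregation as get_data_sg_eval, but for US_EVAL."""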
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
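    """Same aggregation as get_data_sg_eval, but for CN_EVAL."""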
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
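    """Same aggregation as get_data_sg_eval, but for PH_EVAL."""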
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
block = gr.Blocks()
with block:
gr.Markdown(f"""
    SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
- **Total Datasets**: 31
- **Total Languages**: 8
- **Total Models**: {NUM_MODELS}
""")
with gr.Tabs():
# dataset 1: cross-mmlu
with gr.TabItem("Cross-MMLU"):
with gr.Row():
gr.Markdown("""
**Cross-MMLU Leaderboard** 🔮
                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
CROSS_MMLU_ZERO_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
                        gr.components.Dataframe(
CROSS_MMLU_ZERO_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
                        gr.components.Dataframe(
CROSS_MMLU_FIVE_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_MMLU_FIVE_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
type="pandas",
)
# dataset 2: cross-logiqa
with gr.TabItem("Cross-LogiQA"):
with gr.Row():
gr.Markdown("""
**Cross-LogiQA Leaderboard** 🔮
                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_ZERO_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_FIVE_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
type="pandas",
)
# dataset 3: SG_EVAL
with gr.TabItem("SG_EVAL"):
with gr.Row():
gr.Markdown("""
**SG_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
SG_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
SG_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 4:
with gr.TabItem("US_EVAL"):
with gr.Row():
gr.Markdown("""
**US_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
US_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
US_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 5:
with gr.TabItem("CN_EVAL"):
with gr.Row():
gr.Markdown("""
**CN_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** Chinese
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CN_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CN_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 6:
with gr.TabItem("PH_EVAL"):
with gr.Row():
gr.Markdown("""
**PH_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
PH_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
                            PH_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
type="pandas",
)
gr.Markdown(r"""
    If this work is useful to you, please cite our work:
```bibtex
@article{SeaEval2023,
title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
journal={arXiv preprint arXiv:2309.04766},
year={2023}
}
```
""")
# Optionally, the tables could also be rebuilt on page load instead of only at build time.
# Disabled: the names referenced below (get_mteb_data, task_bitext_mining, data_bitext_mining)
# do not exist in this Space.
# block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
block.queue(max_size=10)
block.launch(server_name="0.0.0.0", share=True)
# Possible changes:
# Could add graphs / other visual content
# Could add verification marks
# Sources:
# https://huggingface.co/spaces/gradio/leaderboard
# https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
# https://getemoji.com/