import json
import gradio as gr
import pandas as pd
print("Loading datasets...")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def add_rank(df, compute_average=True):
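    """Add a 1-based "Rank" column, sort descending, and round scores to two decimals.

    If compute_average is True and there is more than one score column, an
    "Average" column is inserted and used for the sort; otherwise the first
    score column is used. Remaining NaNs are rendered as "".
    """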
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
if len(cols_to_rank) == 1:
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
else:
if compute_average:
df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
df.sort_values("Average", ascending=False, inplace=True)
else:
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
df.insert(0, "Rank", list(range(1, len(df) + 1)))
df = df.round(2)
# Fill NaN after averaging
df.fillna("", inplace=True)
return df
def make_clickable_model(model_name, link=None):
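    """Render the model name as an HTML link (to `link`, or to its Hugging Face
    repo by default), showing only the part after the last "/"."""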
if link is None:
link = "https://huggingface.co/" + model_name
# Remove user from model name
return (
f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
)
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
with open('all_results.json', 'r') as f:
ALL_RESULTS = json.load(f)
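# Expected layout of all_results.json (inferred from the lookups below):
#   {model_name: {"model_size": ..., "model_link": ...,
#                 "zero_shot" / "five_shot": {dataset: {run_id: {metric: value, ...}}}}}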
MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
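    """Build the Cross-MMLU overall table: per-model Accuracy, Cross-Lingual
    Consistency, and AC3, each averaged over the runs stored under `eval_mode`;
    models with missing or malformed results get -1."""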
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
try:
overall_acc = [results['overall_acc'] for results in results_list]
overall_acc = sum(overall_acc) / len(overall_acc)
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
AC3_3 = [results['AC3_3'] for results in results_list]
AC3_3 = sum(AC3_3) / len(AC3_3)
        except Exception:
print(results_list)
consistency_score_3 = -1
overall_acc = -1
AC3_3 = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"AC3": AC3_3,
"Cross-Lingual Consistency": consistency_score_3,
"Accuracy": overall_acc,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=False)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
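    """Build the Cross-MMLU per-language accuracy table (seven languages),
    averaged over the runs stored under `eval_mode`; missing results become -1."""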
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
try:
English = [results['language_acc']['English'] for results in results_list]
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
Chinese = [results['language_acc']['Chinese'] for results in results_list]
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
Filipino = [results['language_acc']['Filipino'] for results in results_list]
Spanish = [results['language_acc']['Spanish'] for results in results_list]
Malay = [results['language_acc']['Malay'] for results in results_list]
English = sum(English) / len(English)
Vietnamese = sum(Vietnamese) / len(Vietnamese)
Chinese = sum(Chinese) / len(Chinese)
Indonesian = sum(Indonesian) / len(Indonesian)
Filipino = sum(Filipino) / len(Filipino)
Spanish = sum(Spanish) / len(Spanish)
Malay = sum(Malay) / len(Malay)
        except Exception:
print(results_list)
English = -1
Vietnamese = -1
Chinese = -1
Indonesian = -1
Filipino = -1
Spanish = -1
Malay = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"English": English,
"Vietnamese": Vietnamese,
"Chinese": Chinese,
"Indonesian": Indonesian,
"Filipino": Filipino,
"Spanish": Spanish,
"Malay": Malay,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
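    """Same aggregation as get_data_cross_mmlu_overall, but for Cross-LogiQA."""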
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
try:
overall_acc = [results['overall_acc'] for results in results_list]
overall_acc = sum(overall_acc) / len(overall_acc)
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
AC3_3 = [results['AC3_3'] for results in results_list]
AC3_3 = sum(AC3_3) / len(AC3_3)
        except Exception:
print(results_list)
consistency_score_3 = -1
overall_acc = -1
AC3_3 = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"AC3": AC3_3,
"Cross-Lingual Consistency": consistency_score_3,
"Accuracy": overall_acc,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=False)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")
def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
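    """Same aggregation as get_data_cross_mmlu_language, but for Cross-LogiQA."""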
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
try:
English = [results['language_acc']['English'] for results in results_list]
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
Chinese = [results['language_acc']['Chinese'] for results in results_list]
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
Filipino = [results['language_acc']['Filipino'] for results in results_list]
Spanish = [results['language_acc']['Spanish'] for results in results_list]
Malay = [results['language_acc']['Malay'] for results in results_list]
English = sum(English) / len(English)
Vietnamese = sum(Vietnamese) / len(Vietnamese)
Chinese = sum(Chinese) / len(Chinese)
Indonesian = sum(Indonesian) / len(Indonesian)
Filipino = sum(Filipino) / len(Filipino)
Spanish = sum(Spanish) / len(Spanish)
Malay = sum(Malay) / len(Malay)
        except Exception:
print(results_list)
English = -1
Vietnamese = -1
Chinese = -1
Indonesian = -1
Filipino = -1
Spanish = -1
Malay = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"English": English,
"Vietnamese": Vietnamese,
"Chinese": Chinese,
"Indonesian": Indonesian,
"Filipino": Filipino,
"Spanish": Spanish,
"Malay": Malay,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
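    """Build the SG_EVAL table: per-model accuracy averaged over the runs
    stored under `eval_mode`; missing results become -1."""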
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
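    """Same aggregation as get_data_sg_eval, but for US_EVAL."""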
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
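    """Same aggregation as get_data_sg_eval, but for CN_EVAL."""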
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
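    """Same aggregation as get_data_sg_eval, but for PH_EVAL."""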
df_list = []
for model in MODEL_LIST:
results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
try:
accuracy = [results['accuracy'] for results in results_list]
accuracy = sum(accuracy) / len(accuracy)
        except Exception:
print(results_list)
accuracy = -1
res = {
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
"Accuracy": accuracy,
}
df_list.append(res)
df = pd.DataFrame(df_list)
    # Merge duplicate model rows: for each column, keep the first non-null value per model
df = df.groupby("Model", as_index=False).first()
# Put 'Model' column first
#cols = sorted(list(df.columns))
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("Model")))
df = df[cols]
if rank:
df = add_rank(df, compute_average=True)
if fillna:
df.fillna("", inplace=True)
return df
PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
block = gr.Blocks()
with block:
gr.Markdown(f"""
    SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
- **Total Datasets**: 31
- **Total Languages**: 8
- **Total Models**: {NUM_MODELS}
""")
with gr.Tabs():
# dataset 1: cross-mmlu
with gr.TabItem("Cross-MMLU"):
with gr.Row():
gr.Markdown("""
**Cross-MMLU Leaderboard** 🔮
                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
CROSS_MMLU_ZERO_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
                        gr.components.Dataframe(
CROSS_MMLU_ZERO_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
                        gr.components.Dataframe(
CROSS_MMLU_FIVE_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_MMLU_FIVE_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
type="pandas",
)
# dataset 2: cross-logiqa
with gr.TabItem("Cross-LogiQA"):
with gr.Row():
gr.Markdown("""
**Cross-LogiQA Leaderboard** 🔮
                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_ZERO_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_FIVE_SHOT_OVERALL,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
type="pandas",
)
with gr.TabItem("Language Performance"):
with gr.Row():
gr.components.Dataframe(
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
type="pandas",
)
# dataset 3: SG_EVAL
with gr.TabItem("SG_EVAL"):
with gr.Row():
gr.Markdown("""
**SG_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
SG_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
SG_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 4:
with gr.TabItem("US_EVAL"):
with gr.Row():
gr.Markdown("""
**US_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
US_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
US_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 5:
with gr.TabItem("CN_EVAL"):
with gr.Row():
gr.Markdown("""
**CN_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** Chinese
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CN_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
CN_EVAL_FIVE_SHOT,
datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
type="pandas",
)
# dataset 6:
with gr.TabItem("PH_EVAL"):
with gr.Row():
gr.Markdown("""
**PH_EVAL Leaderboard** 🔮
- **Metric:** Accuracy
- **Languages:** English
""")
with gr.TabItem("zero_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
PH_EVAL_ZERO_SHOT,
datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
type="pandas",
)
with gr.TabItem("five_shot"):
with gr.TabItem("Overall"):
with gr.Row():
gr.components.Dataframe(
                            PH_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
type="pandas",
)
gr.Markdown(r"""
    If this work is useful to you, please cite our work:
```bibtex
@article{SeaEval2023,
title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
journal={arXiv preprint arXiv:2309.04766},
year={2023}
}
```
""")
# Optionally, the tables could also be rebuilt on page load instead of only at build time.
# Disabled: the names referenced below (get_mteb_data, task_bitext_mining, data_bitext_mining)
# do not exist in this Space.
# block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
block.queue(max_size=10)
block.launch(server_name="0.0.0.0", share=True)
# Possible changes:
# Could add graphs / other visual content
# Could add verification marks
# Sources:
# https://huggingface.co/spaces/gradio/leaderboard
# https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
# https://getemoji.com/