import json

import gradio as gr
import pandas as pd

print("Loading datasets...")

# =========================================================================
def add_rank(df, compute_average=True):
    """Sort df by its main metric (or by the average of all metrics) and insert a Rank column."""
    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)"]]
    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        if compute_average:
            df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
            df.sort_values("Average", ascending=False, inplace=True)
        else:
            # Sort by the first metric column (e.g. AC3 for the cross-lingual tables).
            df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN only after averaging, so missing scores are not treated as zero.
    df.fillna("", inplace=True)
    return df
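
# A minimal illustration of add_rank's behavior (model names and values are
# hypothetical): a frame with columns [Model, Acc1, Acc2] gains an "Average"
# column at position 1, is sorted descending by it, and gets a 1-based "Rank"
# column in front:
#   Rank  Model  Average  Acc1  Acc2
#   1     m2     0.80     0.90  0.70
#   2     m1     0.60     0.50  0.70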
def make_clickable_model(model_name, link=None):
    if link is None:
        link = "https://huggingface.co/" + model_name
    # Use only the bare model name (drop the user/org prefix) as the link text.
    return (
        f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
    )
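
# For example, make_clickable_model("org/model-7b") renders a link to
# https://huggingface.co/org/model-7b with display text "model-7b"
# ("org/model-7b" is a made-up name for illustration).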
# =========================================================================
with open('all_results.json', 'r') as f:
    ALL_RESULTS = json.load(f)

MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
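
# Sketch of the all_results.json layout this script assumes, inferred from the
# accesses below (run ids and concrete values are placeholders, not real fields):
#
# {
#   "<model-name>": {
#     "model_size": "7B",
#     "model_link": "https://huggingface.co/<model-name>",
#     "zero_shot": {
#       "cross_mmlu":   {"<run-id>": {"overall_acc": 0.5, "consistency_score_3": 0.4,
#                                     "AC3_3": 0.44, "language_acc": {"English": 0.6, ...}}},
#       "cross_logiqa": {...},
#       "sg_eval": {"<run-id>": {"accuracy": 0.5}},
#       "us_eval": {...}, "cn_eval": {...}, "ph_eval": {...}
#     },
#     "five_shot": { ...same layout... }
#   }
# }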
# =========================================================================
def _finalize(df, rank, fillna, compute_average):
    """Shared post-processing: merge duplicate models, reorder columns, rank, fill NaN."""
    # If any model appears twice, merge the rows and keep the first non-NaN value per column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]
    if rank:
        df = add_rank(df, compute_average=compute_average)
    if fillna:
        df.fillna("", inplace=True)
    return df


def get_data_cross_overall(dataset, eval_mode='zero_shot', fillna=True, rank=True):
    """Overall leaderboard for a cross-lingual dataset ('cross_mmlu' or 'cross_logiqa')."""
    df_list = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode][dataset].values())
        try:
            overall_acc = sum(r['overall_acc'] for r in results_list) / len(results_list)
            consistency_score_3 = sum(r['consistency_score_3'] for r in results_list) / len(results_list)
            # AC3 combines accuracy and cross-lingual consistency (see the SeaEval paper).
            AC3_3 = sum(r['AC3_3'] for r in results_list) / len(results_list)
        except Exception:
            print(f"Incomplete {dataset} results for {model}: {results_list}")
            overall_acc = consistency_score_3 = AC3_3 = -1
        df_list.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "AC3": AC3_3,
            "Cross-Lingual Consistency": consistency_score_3,
            "Accuracy": overall_acc,
        })
    # AC3 is the leading metric column, so rank on it rather than on an average.
    return _finalize(pd.DataFrame(df_list), rank=rank, fillna=fillna, compute_average=False)
CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_overall("cross_mmlu", eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_overall("cross_mmlu", eval_mode="five_shot")

LANGUAGES = ["English", "Vietnamese", "Chinese", "Indonesian", "Filipino", "Spanish", "Malay"]


def get_data_cross_language(dataset, eval_mode='zero_shot', fillna=True, rank=True):
    """Per-language accuracy leaderboard for a cross-lingual dataset."""
    df_list = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode][dataset].values())
        try:
            lang_acc = {
                lang: sum(r['language_acc'][lang] for r in results_list) / len(results_list)
                for lang in LANGUAGES
            }
        except Exception:
            print(f"Incomplete {dataset} results for {model}: {results_list}")
            lang_acc = {lang: -1 for lang in LANGUAGES}
        res = {
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            **lang_acc,
        }
        df_list.append(res)
    # Rank by the average accuracy across languages.
    return _finalize(pd.DataFrame(df_list), rank=rank, fillna=fillna, compute_average=True)
CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_language("cross_mmlu", eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_language("cross_mmlu", eval_mode="five_shot")
# =========================================================================
CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_overall("cross_logiqa", eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_overall("cross_logiqa", eval_mode="five_shot")
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_language("cross_logiqa", eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_language("cross_logiqa", eval_mode="five_shot")
# =========================================================================
def get_data_accuracy(dataset, eval_mode='zero_shot', fillna=True, rank=True):
    """Accuracy-only leaderboard for sg_eval, us_eval, cn_eval, or ph_eval."""
    df_list = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode][dataset].values())
        try:
            accuracy = sum(r['accuracy'] for r in results_list) / len(results_list)
        except Exception:
            print(f"Incomplete {dataset} results for {model}: {results_list}")
            accuracy = -1
        df_list.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })
    return _finalize(pd.DataFrame(df_list), rank=rank, fillna=fillna, compute_average=True)


SG_EVAL_ZERO_SHOT = get_data_accuracy("sg_eval", eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_accuracy("sg_eval", eval_mode="five_shot")
US_EVAL_ZERO_SHOT = get_data_accuracy("us_eval", eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_accuracy("us_eval", eval_mode="five_shot")
CN_EVAL_ZERO_SHOT = get_data_accuracy("cn_eval", eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_accuracy("cn_eval", eval_mode="five_shot")
PH_EVAL_ZERO_SHOT = get_data_accuracy("ph_eval", eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_accuracy("ph_eval", eval_mode="five_shot")
# =========================================================================
block = gr.Blocks()

with block:
    gr.Markdown(f"""
    SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks, and models.

    - **Total Datasets**: 31
    - **Total Languages**: 8
    - **Total Models**: {NUM_MODELS}
    """)
    with gr.Tabs():
        # dataset 1: cross-mmlu
        with gr.TabItem("Cross-MMLU"):
            with gr.Row():
                gr.Markdown("""
                **Cross-MMLU Leaderboard** 🔮

                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
                - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_MMLU_ZERO_SHOT_OVERALL,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
                            type="pandas",
                        )
                with gr.TabItem("Language Performance"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_MMLU_ZERO_SHOT_LANGUAGE,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_MMLU_FIVE_SHOT_OVERALL,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
                            type="pandas",
                        )
                with gr.TabItem("Language Performance"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_MMLU_FIVE_SHOT_LANGUAGE,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
                            type="pandas",
                        )
        # dataset 2: cross-logiqa
        with gr.TabItem("Cross-LogiQA"):
            with gr.Row():
                gr.Markdown("""
                **Cross-LogiQA Leaderboard** 🔮

                - **Metrics:** Cross-Lingual Consistency, Accuracy, AC3
                - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_LOGIQA_ZERO_SHOT_OVERALL,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
                            type="pandas",
                        )
                with gr.TabItem("Language Performance"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_LOGIQA_FIVE_SHOT_OVERALL,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
                            type="pandas",
                        )
                with gr.TabItem("Language Performance"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
                            datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
                            type="pandas",
                        )
        # dataset 3: sg_eval
        with gr.TabItem("SG_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **SG_EVAL Leaderboard** 🔮

                - **Metric:** Accuracy
                - **Languages:** English
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            SG_EVAL_ZERO_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            SG_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
                            type="pandas",
                        )
        # dataset 4: us_eval
        with gr.TabItem("US_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **US_EVAL Leaderboard** 🔮

                - **Metric:** Accuracy
                - **Languages:** English
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            US_EVAL_ZERO_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            US_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
                            type="pandas",
                        )
        # dataset 5: cn_eval
        with gr.TabItem("CN_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **CN_EVAL Leaderboard** 🔮

                - **Metric:** Accuracy
                - **Languages:** Chinese
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CN_EVAL_ZERO_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            CN_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
                            type="pandas",
                        )
        # dataset 6: ph_eval
        with gr.TabItem("PH_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **PH_EVAL Leaderboard** 🔮

                - **Metric:** Accuracy
                - **Languages:** English
                """)
            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            PH_EVAL_ZERO_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
                            type="pandas",
                        )
            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    with gr.Row():
                        gr.components.Dataframe(
                            PH_EVAL_FIVE_SHOT,
                            datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
                            type="pandas",
                        )
gr.Markdown(r""" | |
If this work is useful to you, please citing our work: | |
```bibtex | |
@article{SeaEval2023, | |
title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning}, | |
author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.}, | |
journal={arXiv preprint arXiv:2309.04766}, | |
year={2023} | |
} | |
``` | |
""") | |
# Running the data-loading functions on page load, in addition to when a button
# is clicked, is optional. If deactivated, the data loaded at build time is shown,
# as for the Overall tabs. Example pattern (note: these helpers are from the
# leaderboard template this app follows and are not defined in this file):
# block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)

block.queue(max_size=10)
block.launch(server_name="0.0.0.0", share=True)
# Possible changes:
# - Could add graphs / other visual content
# - Could add verification marks

# Sources:
# https://huggingface.co/spaces/gradio/leaderboard
# https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
# https://getemoji.com/
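
# Assumed Space dependencies, inferred from the imports above (a sketch for
# requirements.txt; exact version pins are an assumption left to the deployer):
#   gradio
#   pandas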