Spaces:

MERaLiON
/

SeaEval_Leaderboard

Running

File size: 29,427 Bytes

from functools import partial
import json

import gradio as gr
import pandas as pd


# Progress message emitted as soon as the module starts executing.
print("Loading datasets...")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def add_rank(df, compute_average=True):
    """Sort a leaderboard table best-first and prepend a 1-based "Rank" column.

    Every column except the descriptive ones ("Model", "Model Size (Params)",
    "Embedding Dimensions", "Sequence Length") is treated as a metric.

    * With exactly one metric column, or when ``compute_average`` is false,
      rows are sorted by the first metric column, descending.
    * Otherwise an "Average" column (row-wise mean of all metric columns) is
      inserted at position 1 and rows are sorted by it, descending.  The mean
      uses ``skipna=False``, so a row with any missing metric gets a NaN
      average.

    Args:
        df: Table to rank.  It is not modified; the function works on a copy.
        compute_average: Whether to add and sort by the "Average" column when
            there is more than one metric column.

    Returns:
        A new DataFrame, rounded to 2 decimals, with NaN replaced by "".
    """
    descriptive_cols = ("Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length")

    # Work on a private copy: the sort/insert steps below would otherwise leave
    # the caller's frame half-modified and make a second call on it fail with
    # "cannot insert Rank, already exists".
    ranked = df.copy()
    cols_to_rank = [col for col in ranked.columns if col not in descriptive_cols]

    if len(cols_to_rank) != 1 and compute_average:
        ranked.insert(1, "Average", ranked[cols_to_rank].mean(axis=1, skipna=False))
        sort_key = "Average"
    else:
        sort_key = cols_to_rank[0]
    ranked = ranked.sort_values(sort_key, ascending=False)

    ranked.insert(0, "Rank", list(range(1, len(ranked) + 1)))
    ranked = ranked.round(2)
    # Fill NaN only after averaging/sorting so blanks never take part in the maths.
    ranked = ranked.fillna("")
    return ranked

def make_clickable_model(model_name, link=None):
    """Return an HTML anchor that opens the model page in a new tab.

    Args:
        model_name: Model identifier; only the part after the last "/" is
            used as the visible link text.
        link: Target URL.  When ``None`` the URL is built by appending
            ``model_name`` to ``https://huggingface.co/``.

    Returns:
        The ``<a>`` element as a string.
    """
    target_url = link if link is not None else "https://huggingface.co/" + model_name
    # Drop any "user/" or "org/" prefix from the displayed name.
    short_name = model_name.rsplit("/", 1)[-1]
    return f'<a target="_blank" style="text-decoration: underline" href="{target_url}">{short_name}</a>'


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =




# Load the evaluation results once, at import time.  The encoding is pinned to
# UTF-8 so that decoding does not depend on the host's locale default.
with open('all_results.json', 'r', encoding='utf-8') as f:
    ALL_RESULTS = json.load(f)


# The top-level keys of the results file are the model identifiers.
MODEL_LIST = list(ALL_RESULTS)
NUM_MODELS = len(set(MODEL_LIST))
# Every model entry must carry a "model_size" field; a missing one raises KeyError here.
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the Cross-MMLU overall leaderboard table.

    For every model in ``MODEL_LIST`` the ``AC3_3``, ``consistency_score_3``
    and ``overall_acc`` values are averaged over all entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_mmlu']``.  If the entries are empty
    or malformed, they are printed and all three scores are set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank`` with
            ``compute_average=False``, i.e. sorted by the first metric
            column ("AC3") and given a leading "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'cross_mmlu'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    # (output column, key inside each result entry).  The order here is the
    # column order of the table, and therefore decides the sort column.
    metric_columns = (
        ("AC3", "AC3_3"),
        ("Cross-Lingual Consistency", "consistency_score_3"),
        ("Accuracy", "overall_acc"),
    )

    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_mmlu'].values())

        try:
            scores = {
                column: sum(results[key] for results in results_list) / len(results_list)
                for column, key in metric_columns
            }
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            scores = {column: -1 for column, _ in metric_columns}

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            **scores,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df = df.fillna("")

    return df


# Cross-MMLU overall tables, built once at import time, one per evaluation setting.
CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")


def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the Cross-MMLU per-language accuracy table.

    For every model in ``MODEL_LIST`` the ``language_acc`` value of each
    language is averaged over all entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_mmlu']``.  If the entries are empty
    or malformed, they are printed and every language score of that model is
    set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank`` with
            ``compute_average=True`` (adds "Average", sorts by it, adds "Rank").

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'cross_mmlu'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    # Column order of the resulting table.
    languages = ("English", "Vietnamese", "Chinese", "Indonesian", "Filipino", "Spanish", "Malay")

    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_mmlu'].values())

        try:
            scores = {
                language: sum(results['language_acc'][language] for results in results_list) / len(results_list)
                for language in languages
            }
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing language, non-numeric value, or no entries at all: one
            # failure blanks the whole row with the sentinel score.
            print(results_list)
            scores = dict.fromkeys(languages, -1)

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            **scores,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Cross-MMLU per-language tables, built once at import time, one per evaluation setting.
CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =




def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the Cross-LogiQA overall leaderboard table.

    For every model in ``MODEL_LIST`` the ``AC3_3``, ``consistency_score_3``
    and ``overall_acc`` values are averaged over all entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_logiqa']``.  If the entries are
    empty or malformed, they are printed and all three scores are set to
    ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank`` with
            ``compute_average=False``, i.e. sorted by the first metric
            column ("AC3") and given a leading "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'cross_logiqa'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    # (output column, key inside each result entry).  The order here is the
    # column order of the table, and therefore decides the sort column.
    metric_columns = (
        ("AC3", "AC3_3"),
        ("Cross-Lingual Consistency", "consistency_score_3"),
        ("Accuracy", "overall_acc"),
    )

    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_logiqa'].values())

        try:
            scores = {
                column: sum(results[key] for results in results_list) / len(results_list)
                for column, key in metric_columns
            }
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            scores = {column: -1 for column, _ in metric_columns}

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            **scores,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df = df.fillna("")

    return df


# Cross-LogiQA overall tables, built once at import time, one per evaluation setting.
CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")


def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the Cross-LogiQA per-language accuracy table.

    For every model in ``MODEL_LIST`` the ``language_acc`` value of each
    language is averaged over all entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_logiqa']``.  If the entries are
    empty or malformed, they are printed and every language score of that
    model is set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank`` with
            ``compute_average=True`` (adds "Average", sorts by it, adds "Rank").

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'cross_logiqa'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    # Column order of the resulting table.
    languages = ("English", "Vietnamese", "Chinese", "Indonesian", "Filipino", "Spanish", "Malay")

    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_logiqa'].values())

        try:
            scores = {
                language: sum(results['language_acc'][language] for results in results_list) / len(results_list)
                for language in languages
            }
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing language, non-numeric value, or no entries at all: one
            # failure blanks the whole row with the sentinel score.
            print(results_list)
            scores = dict.fromkeys(languages, -1)

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            **scores,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Cross-LogiQA per-language tables, built once at import time, one per evaluation setting.
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the SG_EVAL accuracy leaderboard table.

    For every model in ``MODEL_LIST`` the ``accuracy`` value is averaged over
    all entries stored under ``ALL_RESULTS[model][eval_mode]['sg_eval']``.  If
    the entries are empty or malformed, they are printed and the accuracy is
    set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank``, which
            sorts by the single "Accuracy" column and adds a "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'sg_eval'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['sg_eval'].values())

        try:
            accuracy = sum(results['accuracy'] for results in results_list) / len(results_list)
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# SG_EVAL tables, built once at import time, one per evaluation setting.
SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the US_EVAL accuracy leaderboard table.

    For every model in ``MODEL_LIST`` the ``accuracy`` value is averaged over
    all entries stored under ``ALL_RESULTS[model][eval_mode]['us_eval']``.  If
    the entries are empty or malformed, they are printed and the accuracy is
    set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank``, which
            sorts by the single "Accuracy" column and adds a "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'us_eval'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['us_eval'].values())

        try:
            accuracy = sum(results['accuracy'] for results in results_list) / len(results_list)
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# US_EVAL tables, built once at import time, one per evaluation setting.
US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the CN_EVAL accuracy leaderboard table.

    For every model in ``MODEL_LIST`` the ``accuracy`` value is averaged over
    all entries stored under ``ALL_RESULTS[model][eval_mode]['cn_eval']``.  If
    the entries are empty or malformed, they are printed and the accuracy is
    set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank``, which
            sorts by the single "Accuracy" column and adds a "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'cn_eval'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cn_eval'].values())

        try:
            accuracy = sum(results['accuracy'] for results in results_list) / len(results_list)
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# CN_EVAL tables, built once at import time, one per evaluation setting.
CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the PH_EVAL accuracy leaderboard table.

    For every model in ``MODEL_LIST`` the ``accuracy`` value is averaged over
    all entries stored under ``ALL_RESULTS[model][eval_mode]['ph_eval']``.  If
    the entries are empty or malformed, they are printed and the accuracy is
    set to ``-1``.

    Args:
        eval_mode: Key of the evaluation setting to read from ``ALL_RESULTS``.
        fillna: When true, NaN cells are replaced with "".
        rank: When true, the table is passed through ``add_rank``, which
            sorts by the single "Accuracy" column and adds a "Rank" column.

    Returns:
        A pandas DataFrame with "Model" as the first data column and one row
        per distinct rendered model link.

    Raises:
        KeyError: If a model has no ``eval_mode``, ``'ph_eval'`` or
            ``"model_link"`` entry (these lookups are not guarded).
    """
    rows = []
    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['ph_eval'].values())

        try:
            accuracy = sum(results['accuracy'] for results in results_list) / len(results_list)
        except (KeyError, TypeError, ZeroDivisionError):
            # Missing metric, non-numeric value, or no entries at all: keep the
            # model on the board with a sentinel score instead of aborting.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same model link are merged; for each column the
    # first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# PH_EVAL tables, built once at import time, one per evaluation setting.
PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def _leaderboard_table(df):
    """Render ``df`` as a gradio Dataframe wrapped in its own row.

    Must be called while a ``gr.Blocks`` layout context is open, because the
    components attach themselves to whatever container is currently active.
    """
    with gr.Row():
        gr.components.Dataframe(
            df,
            # Leading "Rank" column is numeric, "Model" holds an HTML link,
            # every remaining column is declared numeric.
            # NOTE(review): this list is two entries longer than the number of
            # columns - confirm whether gradio expects an exact match.
            datatype=["number", "markdown"] + ["number"] * len(df.columns),
            type="pandas",
        )


# Page layout: one top-level tab per dataset, each holding a "zero_shot" and a
# "five_shot" tab, which in turn hold the individual tables.
block = gr.Blocks()
with block:
    gr.Markdown(f"""
    SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>  Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.

    - **Total Datasets**: 31
    - **Total Languages**: 8
    - **Total Models**: {NUM_MODELS}
    """)
    with gr.Tabs():

        # dataset 1: cross-mmlu
        with gr.TabItem("Cross-MMLU"):
            with gr.Row():
                gr.Markdown("""
                **Cross-MMLU Leaderboard** 🔮
                
                - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CROSS_MMLU_ZERO_SHOT_OVERALL)
                with gr.TabItem("Language Performance"):
                    _leaderboard_table(CROSS_MMLU_ZERO_SHOT_LANGUAGE)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CROSS_MMLU_FIVE_SHOT_OVERALL)
                with gr.TabItem("Language Performance"):
                    _leaderboard_table(CROSS_MMLU_FIVE_SHOT_LANGUAGE)

        # dataset 2: cross-logiqa
        with gr.TabItem("Cross-LogiQA"):
            with gr.Row():
                gr.Markdown("""
                **Cross-LogiQA Leaderboard** 🔮
                
                - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CROSS_LOGIQA_ZERO_SHOT_OVERALL)
                with gr.TabItem("Language Performance"):
                    _leaderboard_table(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CROSS_LOGIQA_FIVE_SHOT_OVERALL)
                with gr.TabItem("Language Performance"):
                    _leaderboard_table(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE)

        # dataset 3: SG_EVAL
        with gr.TabItem("SG_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **SG_EVAL Leaderboard** 🔮
                
                - **Metric:** Accuracy
                - **Languages:** English
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(SG_EVAL_ZERO_SHOT)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(SG_EVAL_FIVE_SHOT)

        # dataset 4: US_EVAL
        with gr.TabItem("US_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **US_EVAL Leaderboard** 🔮
                
                - **Metric:** Accuracy
                - **Languages:** English
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(US_EVAL_ZERO_SHOT)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(US_EVAL_FIVE_SHOT)

        # dataset 5: CN_EVAL
        with gr.TabItem("CN_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **CN_EVAL Leaderboard** 🔮
                
                - **Metric:** Accuracy
                - **Languages:** Chinese
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CN_EVAL_ZERO_SHOT)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(CN_EVAL_FIVE_SHOT)

        # dataset 6: PH_EVAL
        with gr.TabItem("PH_EVAL"):
            with gr.Row():
                gr.Markdown("""
                **PH_EVAL Leaderboard** 🔮
                
                - **Metric:** Accuracy
                - **Languages:** English
                """)

            with gr.TabItem("zero_shot"):
                with gr.TabItem("Overall"):
                    _leaderboard_table(PH_EVAL_ZERO_SHOT)

            with gr.TabItem("five_shot"):
                with gr.TabItem("Overall"):
                    # Show the five-shot table here; this tab previously
                    # repeated the zero-shot results.
                    _leaderboard_table(PH_EVAL_FIVE_SHOT)

    gr.Markdown(r"""
    
    If this work is useful to you, please citing our work:

    ```bibtex
        @article{SeaEval2023,
        title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
        author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
        journal={arXiv preprint arXiv:2309.04766},
        year={2023}
        }
    ```
    """)
    # No block.load() refresh hook is registered: every table shows the data
    # that was computed when this module was imported.

block.queue(max_size=10)
block.launch(server_name="0.0.0.0", share=True)


# Possible changes:
# Could add graphs / other visual content
# Could add verification marks

# Sources:
# https://huggingface.co/spaces/gradio/leaderboard
# https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
# https://getemoji.com/