Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -46,6 +46,151 @@ NUM_MODELS = len(set(MODEL_LIST))
|
|
46 |
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
|
47 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
48 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
50 |
|
51 |
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
@@ -2089,6 +2234,50 @@ with block:
|
|
2089 |
|
2090 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2091 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2092 |
# dataset 1: cross-mmlu
|
2093 |
with gr.TabItem("Cross-MMLU"):
|
2094 |
with gr.TabItem("Zero Shot"):
|
@@ -3046,11 +3235,11 @@ with block:
|
|
3046 |
gr.Markdown(r"""
|
3047 |
If our datasets and leaderboard are useful, please consider cite:
|
3048 |
```bibtex
|
3049 |
-
@article{
|
3050 |
title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
|
3051 |
author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
|
3052 |
-
journal={
|
3053 |
-
year={
|
3054 |
```
|
3055 |
""")
|
3056 |
# Running the functions on page load in addition to when the button is clicked
|
|
|
46 |
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
|
47 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
48 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
49 |
+
|
50 |
+
|
51 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
52 |
+
|
53 |
+
def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the overall Cross-XQUAD leaderboard table.

    For every model in MODEL_LIST, take the median over all recorded runs of
    overall accuracy, 3-way cross-lingual consistency, and AC3, and collect
    them into one row per model.

    Args:
        eval_mode: evaluation split to read, 'zero_shot' or 'five_shot'.
        fillna: if True, replace NaN cells with "" for display.
        rank: if True, prepend a rank column via add_rank().

    Returns:
        pd.DataFrame with columns Model, Model Size (Params), Accuracy,
        Cross-Lingual Consistency, AC3 (plus a rank column when requested).
    """
    df_list = []
    for model in MODEL_LIST:
        # Each entry under 'cross_xquad' is one run's result dict.
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())

        # Best-effort: models with missing/partial cross_xquad results get
        # sentinel scores of -1 instead of aborting the leaderboard build.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            overall_acc = median([results['overall_acc'] for results in results_list])
            consistency_score_3 = median([results['consistency_score_3'] for results in results_list])
            AC3_3 = median([results['AC3_3'] for results in results_list])
        except Exception:
            overall_acc = -1
            consistency_score_3 = -1
            AC3_3 = -1

        df_list.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": overall_acc,
            "Cross-Lingual Consistency": consistency_score_3,
            "AC3": AC3_3,
        })

    df = pd.DataFrame(df_list)
    # If the same model appears twice, merge the rows: per column keep the
    # first non-NaN value.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
|
110 |
+
|
111 |
+
|
112 |
+
def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the per-language Cross-XQUAD accuracy table.

    For every model in MODEL_LIST, take the median per-language accuracy over
    all recorded runs, one row per model and one column per language.

    Args:
        eval_mode: evaluation split to read, 'zero_shot' or 'five_shot'.
        fillna: if True, replace NaN cells with "" for display.
        rank: if True, prepend a rank column via add_rank().

    Returns:
        pd.DataFrame with columns Model, Model Size (Params) and one accuracy
        column per language (plus a rank column when requested).
    """
    # Column order of the resulting table is fixed by this list.
    languages = ["English", "Vietnamese", "Chinese", "Indonesian",
                 "Filipino", "Spanish", "Malay"]

    df_list = []
    for model in MODEL_LIST:
        # Each entry under 'cross_xquad' is one run's result dict.
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())

        # Best-effort: if any language/run is missing, fall back to -1 for
        # every language (matches the original all-or-nothing behavior; the
        # original used a bare `except:`).
        try:
            lang_scores = {
                lang: median([results['language_acc'][lang] for results in results_list])
                for lang in languages
            }
        except Exception:
            lang_scores = {lang: -1 for lang in languages}

        res = {
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
        }
        res.update(lang_scores)
        df_list.append(res)

    df = pd.DataFrame(df_list)
    # If the same model appears twice, merge the rows: per column keep the
    # first non-NaN value.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
|
185 |
+
|
186 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
187 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
195 |
|
196 |
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
|
2234 |
|
2235 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2236 |
|
2237 |
+
# dataset 2: cross-xquad (comment previously copy-pasted from the
# cross-mmlu section)
with gr.TabItem("Cross-XQUAD"):
    with gr.TabItem("Zero Shot"):
        with gr.TabItem("Overall"):
            with gr.Row():
                # NOTE(review): this variable is re-bound for three of the
                # four tables below, so only the last binding survives —
                # confirm nothing downstream relies on it before renaming.
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):

            with gr.Row():
                # Zero-shot per-language accuracies.
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
                    type="pandas",
                )
    with gr.TabItem("Five Shot"):
        with gr.TabItem("Overall"):

            with gr.Row():
                # Five-shot overall scores (variable name says zero_shot —
                # see NOTE above).
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):

            with gr.Row():
                # Five-shot per-language accuracies (not bound to a name).
                gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
                    type="pandas",
                )

    with gr.Row():
        # NOTE(review): this blurb lists 4 languages but
        # get_data_cross_xquad_language queries 7 (adds Indonesian,
        # Filipino, Malay) — confirm which set Cross-XQUAD actually covers.
        gr.Markdown("""
                    **Cross-XQUAD Leaderboard** 🔮

                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Spanish, Vietnamese
                    """)
|
2280 |
+
|
2281 |
# dataset 1: cross-mmlu
|
2282 |
with gr.TabItem("Cross-MMLU"):
|
2283 |
with gr.TabItem("Zero Shot"):
|
|
|
3235 |
# Citation block shown at the bottom of the page.
# NOTE(review): the entry uses @article with journal={NAACL}; NAACL is a
# conference, so @inproceedings/booktitle would be the conventional form —
# left as-is pending the authors' preferred citation.
gr.Markdown(r"""
If our datasets and leaderboard are useful, please consider citing:
```bibtex
@article{SeaEval,
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
  journal={NAACL},
  year={2024}}
```
""")
|
3245 |
# Running the functions on page load in addition to when the button is clicked
|