Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -46,6 +46,151 @@ NUM_MODELS = len(set(MODEL_LIST))
|
|
46 |
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
|
47 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
48 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
50 |
|
51 |
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
@@ -2089,6 +2234,50 @@ with block:
|
|
2089 |
|
2090 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2091 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2092 |
# dataset 1: cross-mmlu
|
2093 |
with gr.TabItem("Cross-MMLU"):
|
2094 |
with gr.TabItem("Zero Shot"):
|
@@ -3046,11 +3235,11 @@ with block:
|
|
3046 |
gr.Markdown(r"""
|
3047 |
If our datasets and leaderboard are useful, please consider cite:
|
3048 |
```bibtex
|
3049 |
-
@article{
|
3050 |
title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
|
3051 |
author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
|
3052 |
-
journal={
|
3053 |
-
year={
|
3054 |
```
|
3055 |
""")
|
3056 |
# Running the functions on page load in addition to when the button is clicked
|
|
|
46 |
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
|
47 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
48 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
49 |
+
|
50 |
+
|
51 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
52 |
+
|
53 |
+
def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the overall Cross-XQUAD leaderboard table.

    For every model in MODEL_LIST, take the median over all recorded runs of
    overall accuracy, 3-way cross-lingual consistency, and AC3, and collect
    them into one row per model.

    Args:
        eval_mode: evaluation split to read, 'zero_shot' or 'five_shot'.
        fillna: if True, replace NaN cells with "" for display.
        rank: if True, prepend a rank column via add_rank().

    Returns:
        pd.DataFrame with columns Model, Model Size (Params), Accuracy,
        Cross-Lingual Consistency, AC3 (plus a rank column when requested).
    """
    df_list = []
    for model in MODEL_LIST:
        # Each entry under 'cross_xquad' is one run's result dict.
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())

        # Best-effort: models with missing/partial cross_xquad results get
        # sentinel scores of -1 instead of aborting the leaderboard build.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            overall_acc = median([results['overall_acc'] for results in results_list])
            consistency_score_3 = median([results['consistency_score_3'] for results in results_list])
            AC3_3 = median([results['AC3_3'] for results in results_list])
        except Exception:
            overall_acc = -1
            consistency_score_3 = -1
            AC3_3 = -1

        df_list.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": overall_acc,
            "Cross-Lingual Consistency": consistency_score_3,
            "AC3": AC3_3,
        })

    df = pd.DataFrame(df_list)
    # If the same model appears twice, merge the rows: per column keep the
    # first non-NaN value.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
|
110 |
+
|
111 |
+
|
112 |
+
def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the per-language Cross-XQUAD accuracy table.

    For every model in MODEL_LIST, take the median per-language accuracy over
    all recorded runs, one row per model and one column per language.

    Args:
        eval_mode: evaluation split to read, 'zero_shot' or 'five_shot'.
        fillna: if True, replace NaN cells with "" for display.
        rank: if True, prepend a rank column via add_rank().

    Returns:
        pd.DataFrame with columns Model, Model Size (Params) and one accuracy
        column per language (plus a rank column when requested).
    """
    # Column order of the resulting table is fixed by this list.
    languages = ["English", "Vietnamese", "Chinese", "Indonesian",
                 "Filipino", "Spanish", "Malay"]

    df_list = []
    for model in MODEL_LIST:
        # Each entry under 'cross_xquad' is one run's result dict.
        results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())

        # Best-effort: if any language/run is missing, fall back to -1 for
        # every language (matches the original all-or-nothing behavior; the
        # original used a bare `except:`).
        try:
            lang_scores = {
                lang: median([results['language_acc'][lang] for results in results_list])
                for lang in languages
            }
        except Exception:
            lang_scores = {lang: -1 for lang in languages}

        res = {
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
        }
        res.update(lang_scores)
        df_list.append(res)

    df = pd.DataFrame(df_list)
    # If the same model appears twice, merge the rows: per column keep the
    # first non-NaN value.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
|
185 |
+
|
186 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
187 |
+
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
195 |
|
196 |
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
|
2234 |
|
2235 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2236 |
|
2237 |
+
# dataset 2: cross-xquad (comment previously copy-pasted from the
# cross-mmlu section)
with gr.TabItem("Cross-XQUAD"):
    with gr.TabItem("Zero Shot"):
        with gr.TabItem("Overall"):
            with gr.Row():
                # NOTE(review): this variable is re-bound for three of the
                # four tables below, so only the last binding survives —
                # confirm nothing downstream relies on it before renaming.
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):

            with gr.Row():
                # Zero-shot per-language accuracies.
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
                    type="pandas",
                )
    with gr.TabItem("Five Shot"):
        with gr.TabItem("Overall"):

            with gr.Row():
                # Five-shot overall scores (variable name says zero_shot —
                # see NOTE above).
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):

            with gr.Row():
                # Five-shot per-language accuracies (not bound to a name).
                gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
                    type="pandas",
                )

    with gr.Row():
        # NOTE(review): this blurb lists 4 languages but
        # get_data_cross_xquad_language queries 7 (adds Indonesian,
        # Filipino, Malay) — confirm which set Cross-XQUAD actually covers.
        gr.Markdown("""
                    **Cross-XQUAD Leaderboard** 🔮

                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Spanish, Vietnamese
                    """)
|
2280 |
+
|
2281 |
# dataset 1: cross-mmlu
|
2282 |
with gr.TabItem("Cross-MMLU"):
|
2283 |
with gr.TabItem("Zero Shot"):
|
|
|
3235 |
# Citation block shown at the bottom of the page.
# NOTE(review): the entry uses @article with journal={NAACL}; NAACL is a
# conference, so @inproceedings/booktitle would be the conventional form —
# left as-is pending the authors' preferred citation.
gr.Markdown(r"""
If our datasets and leaderboard are useful, please consider citing:
```bibtex
@article{SeaEval,
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
  journal={NAACL},
  year={2024}}
```
""")
|
3245 |
# Running the functions on page load in addition to when the button is clicked
|