binwang committed on
Commit
5346992
·
verified ·
1 Parent(s): 12d76ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -3
app.py CHANGED
@@ -46,6 +46,151 @@ NUM_MODELS = len(set(MODEL_LIST))
46
  MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
47
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
48
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
50
 
51
  def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
@@ -2089,6 +2234,50 @@ with block:
2089
 
2090
  with gr.TabItem("Cross-Lingual Consistency"):
2091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2092
  # dataset 1: cross-mmlu
2093
  with gr.TabItem("Cross-MMLU"):
2094
  with gr.TabItem("Zero Shot"):
@@ -3046,11 +3235,11 @@ with block:
3046
  gr.Markdown(r"""
3047
  If our datasets and leaderboard are useful, please consider cite:
3048
  ```bibtex
3049
- @article{SeaEval2023,
3050
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
3051
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
3052
- journal={arXiv preprint arXiv:2309.04766},
3053
- year={2023}}
3054
  ```
3055
  """)
3056
  # Running the functions on page load in addition to when the button is clicked
 
46
  MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
47
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
48
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
49
+
50
+
51
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
52
+
53
def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the overall Cross-XQUAD leaderboard table.

    For every model, takes the median across all Cross-XQUAD runs of the
    overall accuracy, the 3-way cross-lingual consistency score, and AC3.
    Models without usable Cross-XQUAD results get the sentinel value -1.

    Args:
        eval_mode: evaluation split to read ('zero_shot' or 'five_shot').
        fillna: if True, replace NaN cells with '' for display.
        rank: if True, add a rank column via add_rank (no average column).

    Returns:
        pandas.DataFrame with one row per (deduplicated) model.
    """
    rows = []
    for model in MODEL_LIST:
        run_results = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())
        try:
            overall_acc = median(r['overall_acc'] for r in run_results)
            consistency_score_3 = median(r['consistency_score_3'] for r in run_results)
            AC3_3 = median(r['AC3_3'] for r in run_results)
        except Exception:
            # Was a bare `except:`, which also swallows KeyboardInterrupt/
            # SystemExit; Exception keeps the intended best-effort behavior.
            # Missing keys / empty run lists fall back to the -1 sentinel.
            overall_acc = consistency_score_3 = AC3_3 = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": overall_acc,
            "Cross-Lingual Consistency": consistency_score_3,
            "AC3": AC3_3,
        })

    df = pd.DataFrame(rows)
    # Merge duplicate model rows: take the first non-NaN value per column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)
    if fillna:
        df.fillna("", inplace=True)
    return df
106
+
107
+
108
# Precompute the overall Cross-XQUAD leaderboards once at startup,
# one table per evaluation mode.
CROSS_XQUAD_ZERO_SHOT_OVERALL, CROSS_XQUAD_FIVE_SHOT_OVERALL = (
    get_data_cross_xquad_overall(eval_mode=mode)
    for mode in ("zero_shot", "five_shot")
)
110
+
111
+
112
def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the per-language accuracy table for Cross-XQUAD.

    For every model, takes the median per-language accuracy across all
    Cross-XQUAD runs. A language absent from a model's results is reported
    as the sentinel -1 for that language only (the previous version used a
    single try/except around all languages, so one missing key zeroed out
    every column to -1).

    NOTE(review): this seven-language list mirrors the Cross-MMLU table,
    but the Cross-XQUAD tab description mentions only English, Chinese,
    Spanish and Vietnamese — confirm which languages actually exist in
    `language_acc` for cross_xquad results.

    Args:
        eval_mode: evaluation split to read ('zero_shot' or 'five_shot').
        fillna: if True, replace NaN cells with '' for display.
        rank: if True, add a rank column via add_rank (no average column).

    Returns:
        pandas.DataFrame with one row per (deduplicated) model.
    """
    languages = [
        "English", "Vietnamese", "Chinese", "Indonesian",
        "Filipino", "Spanish", "Malay",
    ]
    rows = []
    for model in MODEL_LIST:
        run_results = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())
        row = {
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
        }
        for lang in languages:
            try:
                row[lang] = median(r['language_acc'][lang] for r in run_results)
            except Exception:
                # Best-effort: a language missing from this model's results
                # (or an empty run list) becomes the -1 sentinel.
                row[lang] = -1
        rows.append(row)

    df = pd.DataFrame(rows)
    # Merge duplicate model rows: take the first non-NaN value per column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=False)
    if fillna:
        df.fillna("", inplace=True)
    return df
181
+
182
+
183
# Precompute the per-language Cross-XQUAD leaderboards once at startup,
# one table per evaluation mode.
CROSS_XQUAD_ZERO_SHOT_LANGUAGE, CROSS_XQUAD_FIVE_SHOT_LANGUAGE = (
    get_data_cross_xquad_language(eval_mode=mode)
    for mode in ("zero_shot", "five_shot")
)
185
+
186
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
187
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
188
+
189
+
190
+
191
+
192
+
193
+
194
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
195
 
196
  def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
 
2234
 
2235
  with gr.TabItem("Cross-Lingual Consistency"):
2236
 
2237
# dataset: cross-xquad  (previous comment said "cross-mmlu" — copy-paste slip)
with gr.TabItem("Cross-XQUAD"):
    with gr.TabItem("Zero Shot"):
        with gr.TabItem("Overall"):
            with gr.Row():
                cross_xquad_zero_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):
            with gr.Row():
                # Renamed: this previously rebound cross_xquad_zero_shot_overall,
                # shadowing the overall table's component.
                cross_xquad_zero_shot_language = gr.components.Dataframe(
                    CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
                    type="pandas",
                )
    with gr.TabItem("Five Shot"):
        with gr.TabItem("Overall"):
            with gr.Row():
                # Renamed: this previously rebound cross_xquad_zero_shot_overall
                # despite holding the five-shot table.
                cross_xquad_five_shot_overall = gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
                    type="pandas",
                )
        with gr.TabItem("Language Performance"):
            with gr.Row():
                gr.components.Dataframe(
                    CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
                    datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
                    type="pandas",
                )

    with gr.Row():
        # NOTE(review): this lists four languages, while the language table
        # queries seven (copied from Cross-MMLU) — confirm which is right.
        gr.Markdown("""
                    **Cross-XQUAD Leaderboard** 🔮

                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Spanish, Vietnamese
                    """)
2280
+
2281
  # dataset 1: cross-mmlu
2282
  with gr.TabItem("Cross-MMLU"):
2283
  with gr.TabItem("Zero Shot"):
 
3235
  gr.Markdown(r"""
3236
  If our datasets and leaderboard are useful, please consider cite:
3237
  ```bibtex
3238
+ @inproceedings{SeaEval,
3239
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
3240
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
3241
+ booktitle={NAACL},
3242
+ year={2024}}
3243
  ```
3244
  """)
3245
  # Running the functions on page load in addition to when the button is clicked