Spaces:
Running
Running
new results with random
Browse files
- all_results.json +0 -0
- app.py +39 -42
all_results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1 |
-
|
2 |
import json
|
3 |
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
|
|
|
|
7 |
|
8 |
print("Loading datasets...")
|
9 |
|
@@ -67,13 +69,13 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
67 |
|
68 |
try:
|
69 |
overall_acc = [results['overall_acc'] for results in results_list]
|
70 |
-
overall_acc =
|
71 |
|
72 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
73 |
-
consistency_score_3 =
|
74 |
|
75 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
76 |
-
AC3_3 =
|
77 |
|
78 |
except:
|
79 |
print(results_list)
|
@@ -126,21 +128,21 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
|
|
126 |
|
127 |
|
128 |
try:
|
129 |
-
English
|
130 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
131 |
-
Chinese
|
132 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
133 |
-
Filipino
|
134 |
-
Spanish
|
135 |
-
Malay
|
136 |
|
137 |
-
English
|
138 |
-
Vietnamese =
|
139 |
-
Chinese
|
140 |
-
Indonesian =
|
141 |
-
Filipino
|
142 |
-
Spanish
|
143 |
-
Malay
|
144 |
|
145 |
|
146 |
except:
|
@@ -208,13 +210,13 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
|
|
208 |
|
209 |
try:
|
210 |
overall_acc = [results['overall_acc'] for results in results_list]
|
211 |
-
overall_acc =
|
212 |
|
213 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
214 |
-
consistency_score_3 =
|
215 |
|
216 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
217 |
-
AC3_3 =
|
218 |
|
219 |
except:
|
220 |
print(results_list)
|
@@ -267,21 +269,21 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
|
|
267 |
|
268 |
|
269 |
try:
|
270 |
-
English
|
271 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
272 |
-
Chinese
|
273 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
274 |
-
Filipino
|
275 |
-
Spanish
|
276 |
-
Malay
|
277 |
|
278 |
-
English
|
279 |
-
Vietnamese =
|
280 |
-
Chinese
|
281 |
-
Indonesian =
|
282 |
-
Filipino
|
283 |
-
Spanish
|
284 |
-
Malay
|
285 |
|
286 |
|
287 |
except:
|
@@ -346,14 +348,12 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
346 |
|
347 |
|
348 |
try:
|
349 |
-
accuracy = [results['accuracy'] for results in results_list]
|
350 |
-
accuracy = sum(accuracy) / len(accuracy)
|
351 |
|
352 |
except:
|
353 |
print(results_list)
|
354 |
accuracy = -1
|
355 |
|
356 |
-
|
357 |
res = {
|
358 |
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
|
359 |
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
|
@@ -401,8 +401,7 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
401 |
|
402 |
|
403 |
try:
|
404 |
-
accuracy = [results['accuracy'] for results in results_list]
|
405 |
-
accuracy = sum(accuracy) / len(accuracy)
|
406 |
|
407 |
except:
|
408 |
print(results_list)
|
@@ -456,8 +455,7 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
456 |
|
457 |
|
458 |
try:
|
459 |
-
accuracy = [results['accuracy'] for results in results_list]
|
460 |
-
accuracy = sum(accuracy) / len(accuracy)
|
461 |
|
462 |
except:
|
463 |
print(results_list)
|
@@ -511,8 +509,7 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
511 |
|
512 |
|
513 |
try:
|
514 |
-
accuracy = [results['accuracy'] for results in results_list]
|
515 |
-
accuracy = sum(accuracy) / len(accuracy)
|
516 |
|
517 |
except:
|
518 |
print(results_list)
|
@@ -789,8 +786,8 @@ with block:
|
|
789 |
with gr.TabItem("Overall"):
|
790 |
with gr.Row():
|
791 |
gr.components.Dataframe(
|
792 |
-
|
793 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
794 |
type="pandas",
|
795 |
)
|
796 |
|
|
|
1 |
+
|
2 |
import json
|
3 |
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from statistics import median
|
8 |
+
|
9 |
|
10 |
print("Loading datasets...")
|
11 |
|
|
|
69 |
|
70 |
try:
|
71 |
overall_acc = [results['overall_acc'] for results in results_list]
|
72 |
+
overall_acc = median(overall_acc)
|
73 |
|
74 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
75 |
+
consistency_score_3 = median(consistency_score_3)
|
76 |
|
77 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
78 |
+
AC3_3 = median(AC3_3)
|
79 |
|
80 |
except:
|
81 |
print(results_list)
|
|
|
128 |
|
129 |
|
130 |
try:
|
131 |
+
English = [results['language_acc']['English'] for results in results_list]
|
132 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
133 |
+
Chinese = [results['language_acc']['Chinese'] for results in results_list]
|
134 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
135 |
+
Filipino = [results['language_acc']['Filipino'] for results in results_list]
|
136 |
+
Spanish = [results['language_acc']['Spanish'] for results in results_list]
|
137 |
+
Malay = [results['language_acc']['Malay'] for results in results_list]
|
138 |
|
139 |
+
English = median(English)
|
140 |
+
Vietnamese = median(Vietnamese)
|
141 |
+
Chinese = median(Chinese)
|
142 |
+
Indonesian = median(Indonesian)
|
143 |
+
Filipino = median(Filipino)
|
144 |
+
Spanish = median(Spanish)
|
145 |
+
Malay = median(Malay)
|
146 |
|
147 |
|
148 |
except:
|
|
|
210 |
|
211 |
try:
|
212 |
overall_acc = [results['overall_acc'] for results in results_list]
|
213 |
+
overall_acc = median(overall_acc)
|
214 |
|
215 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
216 |
+
consistency_score_3 = median(consistency_score_3)
|
217 |
|
218 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
219 |
+
AC3_3 = median(AC3_3)
|
220 |
|
221 |
except:
|
222 |
print(results_list)
|
|
|
269 |
|
270 |
|
271 |
try:
|
272 |
+
English = [results['language_acc']['English'] for results in results_list]
|
273 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
274 |
+
Chinese = [results['language_acc']['Chinese'] for results in results_list]
|
275 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
276 |
+
Filipino = [results['language_acc']['Filipino'] for results in results_list]
|
277 |
+
Spanish = [results['language_acc']['Spanish'] for results in results_list]
|
278 |
+
Malay = [results['language_acc']['Malay'] for results in results_list]
|
279 |
|
280 |
+
English = median(English)
|
281 |
+
Vietnamese = median(Vietnamese)
|
282 |
+
Chinese = median(Chinese)
|
283 |
+
Indonesian = median(Indonesian)
|
284 |
+
Filipino = median(Filipino)
|
285 |
+
Spanish = median(Spanish)
|
286 |
+
Malay = median(Malay)
|
287 |
|
288 |
|
289 |
except:
|
|
|
348 |
|
349 |
|
350 |
try:
|
351 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
352 |
|
353 |
except:
|
354 |
print(results_list)
|
355 |
accuracy = -1
|
356 |
|
|
|
357 |
res = {
|
358 |
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
|
359 |
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
|
|
|
401 |
|
402 |
|
403 |
try:
|
404 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
405 |
|
406 |
except:
|
407 |
print(results_list)
|
|
|
455 |
|
456 |
|
457 |
try:
|
458 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
459 |
|
460 |
except:
|
461 |
print(results_list)
|
|
|
509 |
|
510 |
|
511 |
try:
|
512 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
513 |
|
514 |
except:
|
515 |
print(results_list)
|
|
|
786 |
with gr.TabItem("Overall"):
|
787 |
with gr.Row():
|
788 |
gr.components.Dataframe(
|
789 |
+
PH_EVAL_FIVE_SHOT,
|
790 |
+
datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
|
791 |
type="pandas",
|
792 |
)
|
793 |
|