Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit · 7de3b23
1 Parent(s): bef4eff
update
Browse files
- src/display/css_html_js.py +4 -0
- src/display/utils.py +21 -14
- src/leaderboard/filter_models.py +0 -1
src/display/css_html_js.py
CHANGED
@@ -29,6 +29,10 @@ custom_css = """
|
|
29 |
margin-top: 15px
|
30 |
}
|
31 |
|
|
|
|
|
|
|
|
|
32 |
#leaderboard-table-lite {
|
33 |
margin-top: 15px
|
34 |
}
|
|
|
29 |
margin-top: 15px
|
30 |
}
|
31 |
|
32 |
+
#leaderboard-table table td {
|
33 |
+
text-align: center;
|
34 |
+
}
|
35 |
+
|
36 |
#leaderboard-table-lite {
|
37 |
margin-top: 15px
|
38 |
}
|
src/display/utils.py
CHANGED
@@ -24,27 +24,34 @@ class Tasks(Enum):
|
|
24 |
# gsm8k = Task("gsm8k", "acc", "GSM8K")
|
25 |
# drop = Task("drop", "f1", "DROP")
|
26 |
|
27 |
-
nqopen = Task("
|
28 |
-
triviaqa = Task("
|
29 |
|
30 |
-
truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "
|
31 |
-
truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "
|
|
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
|
37 |
-
|
38 |
-
|
|
|
39 |
|
40 |
-
|
|
|
41 |
|
42 |
-
|
|
|
43 |
|
44 |
-
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
#truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
|
48 |
|
49 |
# These classes are for user facing column names,
|
50 |
# to avoid having to change them all around the code
|
|
|
24 |
# gsm8k = Task("gsm8k", "acc", "GSM8K")
|
25 |
# drop = Task("drop", "f1", "DROP")
|
26 |
|
27 |
+
nqopen = Task("nq8", "em", "NQ Open/EM")
|
28 |
+
triviaqa = Task("tqa8", "em", "TriviaQA/EM")
|
29 |
|
30 |
+
truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthQA MC1/Acc")
|
31 |
+
truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc")
|
32 |
+
truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE")
|
33 |
|
34 |
+
xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
|
35 |
+
xsum_f = Task("xsum_v2", "factKB", "XSum/factKB")
|
36 |
+
xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P")
|
37 |
|
38 |
+
cnndm_r = Task("cnndm_v2", "rougeL", "CNN-DM/ROUGE")
|
39 |
+
cnndm_f = Task("cnndm_v2", "factKB", "CNN-DM/factKB")
|
40 |
+
cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
|
41 |
|
42 |
+
race = Task("race", "acc", "RACE/Acc")
|
43 |
+
squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")
|
44 |
|
45 |
+
memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
|
46 |
+
ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
|
47 |
|
48 |
+
faithdial = Task("faithdial_hallu_v2", "acc", "FaithDial/Acc")
|
49 |
+
|
50 |
+
halueval_qa = Task("halueval_qa", "acc", "HaluQA/Acc")
|
51 |
+
halueval_summ = Task("halueval_summarization", "acc", "HaluSumm/Acc")
|
52 |
+
halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
|
53 |
|
54 |
+
selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
|
|
|
55 |
|
56 |
# These classes are for user facing column names,
|
57 |
# to avoid having to change them all around the code
|
src/leaderboard/filter_models.py
CHANGED
@@ -20,7 +20,6 @@ DO_NOT_SUBMIT_MODELS = [
|
|
20 |
"Voicelab/trurl-2-13b", # trained on MMLU
|
21 |
]
|
22 |
|
23 |
-
|
24 |
def flag_models(leaderboard_data: list[dict]):
|
25 |
for model_data in leaderboard_data:
|
26 |
if model_data["model_name_for_query"] in FLAGGED_MODELS:
|
|
|
20 |
"Voicelab/trurl-2-13b", # trained on MMLU
|
21 |
]
|
22 |
|
|
|
23 |
def flag_models(leaderboard_data: list[dict]):
|
24 |
for model_data in leaderboard_data:
|
25 |
if model_data["model_name_for_query"] in FLAGGED_MODELS:
|