Commit d354e12 (parent: 7de3b23): update

Files changed:
- src/display/utils.py (+1, -1)
- src/leaderboard/read_evals.py (+16, -0)
src/display/utils.py CHANGED

@@ -40,7 +40,7 @@ class Tasks(Enum):
     cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
 
     race = Task("race", "acc", "RACE/Acc")
-    squadv2 = Task("squadv2", "exact_normalised", "SQUaDv2/EM")
+    # squadv2 = Task("squadv2", "exact_normalised", "SQUaDv2/EM")
 
     memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
     ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
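For context: in the leaderboard template this Space is built on, each member of the Tasks enum wraps a Task whose three fields name the benchmark key, the metric to read, and the displayed column, so commenting an entry out removes that benchmark from the table and from the averaged score. A minimal sketch, assuming the template's usual field names (benchmark, metric, col_name are not confirmed by this diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key of the benchmark in the results files
    metric: str      # metric name to read from that benchmark's results
    col_name: str    # column header shown in the leaderboard table

class Tasks(Enum):
    # Entries mirror the diff above; squadv2 is disabled by commenting it
    # out, so no SQuADv2 column is built and it is excluded from averages.
    race = Task("race", "acc", "RACE/Acc")
    # squadv2 = Task("squadv2", "exact_normalised", "SQUaDv2/EM")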
src/leaderboard/read_evals.py CHANGED

@@ -11,6 +11,14 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
+def is_float(string):
+    try:
+        float(string)
+        return True
+    except ValueError:
+        return False
+
+
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.

@@ -84,6 +92,12 @@ class EvalResult:
             for k, v in entry_copy.items():
                 if "exact_match" in k:
                     results[task_name][k.replace("exact_match", "em")] = v
+                if "squadv2" in task_name:
+                    value = results[task_name][k]
+                    if is_float(value) and 'normalised' not in k:
+                        results[task_name][f"{k}_normalised"] = value / 100.0
+                    else:
+                        del results[task_name][k]
 
             entry_copy = results[task_name].copy()
 

@@ -99,6 +113,8 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                continue
 
+            # print(accs)
+
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
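To make the new squadv2 handling concrete, here is a standalone sketch of what the added branch does to one raw results entry. The sample dict is invented for illustration, and it iterates over a snapshot of the keys so the dict can be modified mid-loop, just as the real code iterates over the earlier entry_copy:

def is_float(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

# Invented sample entry; the / 100.0 in the commit implies raw scores
# arrive on a 0-100 scale.
results = {"squadv2": {"em": 61.4, "em_stderr": 0.5, "alias": "squadv2"}}

for k in list(results["squadv2"]):
    value = results["squadv2"][k]
    if is_float(value) and "normalised" not in k:
        # Rescale 0-100 scores to 0-1 under a new *_normalised key.
        results["squadv2"][f"{k}_normalised"] = value / 100.0
    else:
        # Non-numeric fields (and any pre-existing normalised keys) are dropped.
        del results["squadv2"][k]

print(results["squadv2"])
# {'em': 61.4, 'em_stderr': 0.5, 'em_normalised': 0.614, 'em_stderr_normalised': 0.005}

The f"{k}_normalised" keys this produces are what the squadv2 Task in src/display/utils.py reads via its "exact_normalised" metric, which is presumably why that Task is only commented out rather than deleted: once the normalised scores are in place, re-enabling the column is a one-line change.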