Commit 36e3010 by eduagarcia (1 parent: 5408125)
Refactor code for adding generic tasks
Files changed:
- src/display/utils.py (+49 -25)
- src/leaderboard/read_evals.py (+3 -1)
src/display/utils.py (CHANGED)
@@ -12,14 +12,16 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    baseline: float = 0.0
+    human_baseline: float = 0.0
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    mmlu = Task("hendrycksTest", "acc", "MMLU")
-    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
+    mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
+    winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
+    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
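This first hunk makes each Task carry its own random baseline and human baseline, which is what lets the later hunks build the baseline rows by iterating over Tasks instead of hard-coding one leaderboard column per benchmark. A minimal runnable sketch of the extended dataclass (Task and Tasks mirror the diff; the trailing asserts are only illustrative):

    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class Task:
        benchmark: str
        metric: str
        col_name: str
        baseline: float = 0.0        # random-guess score, in percent
        human_baseline: float = 0.0  # human performance, in percent

    class Tasks(Enum):
        arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
        gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)

    # The two trailing positional arguments map onto the new fields:
    assert Tasks.arc.value.baseline == 25.0
    assert Tasks.gsm8k.value.human_baseline == 100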
@@ -75,26 +77,33 @@ baseline_row = {
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.average.name: 31.0,
-    AutoEvalColumn.arc.name: 25.0,
-    AutoEvalColumn.hellaswag.name: 25.0,
-    AutoEvalColumn.mmlu.name: 25.0,
-    AutoEvalColumn.truthfulqa.name: 25.0,
-    AutoEvalColumn.winogrande.name: 50.0,
-    AutoEvalColumn.gsm8k.name: 0.21,
+    #AutoEvalColumn.average.name: 31.0,
+    #AutoEvalColumn.arc.name: 25.0,
+    #AutoEvalColumn.hellaswag.name: 25.0,
+    #AutoEvalColumn.mmlu.name: 25.0,
+    #AutoEvalColumn.truthfulqa.name: 25.0,
+    #AutoEvalColumn.winogrande.name: 50.0,
+    #AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
     AutoEvalColumn.model_type_symbol.name: None,
     AutoEvalColumn.architecture.name: None,
     AutoEvalColumn.weight_type.name: None,
-    AutoEvalColumn.params.name:
-    AutoEvalColumn.likes.name:
-    AutoEvalColumn.license.name:
-    AutoEvalColumn.still_on_hub.name:
-    AutoEvalColumn.moe.name:
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    baseline_row[task.name] = task.value.baseline
+    if task.value.baseline is not None:
+        baseline_list.append(task.value.baseline)
+baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
@@ -107,19 +116,34 @@ human_baseline_row = {
     AutoEvalColumn.model.name: "<p>Human performance</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 92.75,
+    #AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.arc.name: 80.0,
-    AutoEvalColumn.hellaswag.name: 95.0,
-    AutoEvalColumn.mmlu.name: 89.8,
-    AutoEvalColumn.truthfulqa.name: 94.0,
-    AutoEvalColumn.winogrande.name: 94.0,
-    AutoEvalColumn.gsm8k.name: 100,
+    #AutoEvalColumn.arc.name: 80.0,
+    #AutoEvalColumn.hellaswag.name: 95.0,
+    #AutoEvalColumn.mmlu.name: 89.8,
+    #AutoEvalColumn.truthfulqa.name: 94.0,
+    #AutoEvalColumn.winogrande.name: 94.0,
+    #AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.model_type_symbol.name: None,
+    AutoEvalColumn.architecture.name: None,
+    AutoEvalColumn.weight_type.name: None,
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    human_baseline_row[task.name] = task.value.human_baseline
+    if task.value.human_baseline is not None:
+        baseline_list.append(task.value.human_baseline)
+human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 @dataclass
 class ModelDetails:
     name: str
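The human-performance row gets the same treatment: each value comes from task.value.human_baseline and the average is derived rather than hard-coded. With the numbers in this diff, (80.0 + 95.0 + 89.8 + 94.0 + 94.0 + 100) / 6 rounds to 92.13, slightly below the retired 92.75. A quick check, values copied from the Tasks enum:

    human = [80.0, 95.0, 89.8, 94.0, 94.0, 100]
    print(f"{sum(human) / len(human):.2f}")  # 92.13, where 92.75 was hard-coded before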
src/leaderboard/read_evals.py (CHANGED)
@@ -66,6 +66,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
+            """
             # We skip old mmlu entries
             wrong_mmlu_version = False
             if task.benchmark == "hendrycksTest":
@@ -81,11 +82,12 @@ class EvalResult:
             if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                 results[task.benchmark] = 0.0
                 continue
-
+            """
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
+
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
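The only change here is that the old-MMLU filtering and the TruthfulQA NaN handling are disabled by enclosing them in a pair of triple quotes instead of deleting them. A bare triple-quoted string in a function body is an expression statement: Python evaluates it and throws it away, so the enclosed lines become a no-op while staying visible in the source. A tiny sketch of the mechanism (the function f is hypothetical, not from the repo):

    def f():
        x = 1
        """
        x = 99  # never runs: the assignment sits inside a string literal
        """
        return x

    assert f() == 1

One side effect worth noting: with the block strung out, hendrycksTest scores are averaged regardless of which MMLU version produced them, and a NaN TruthfulQA value would now propagate into the mean rather than being zeroed.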