Commit 36e3010 by eduagarcia
1 parent: 5408125

Refactor code for adding generic tasks

src/display/utils.py CHANGED
@@ -12,14 +12,16 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    baseline: float = 0.0
+    human_baseline: float = 0.0
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    mmlu = Task("hendrycksTest", "acc", "MMLU")
-    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
+    mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
+    winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
+    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -75,26 +77,33 @@ baseline_row = {
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.average.name: 31.0,
-    AutoEvalColumn.arc.name: 25.0,
-    AutoEvalColumn.hellaswag.name: 25.0,
-    AutoEvalColumn.mmlu.name: 25.0,
-    AutoEvalColumn.truthfulqa.name: 25.0,
-    AutoEvalColumn.winogrande.name: 50.0,
-    AutoEvalColumn.gsm8k.name: 0.21,
+    #AutoEvalColumn.average.name: 31.0,
+    #AutoEvalColumn.arc.name: 25.0,
+    #AutoEvalColumn.hellaswag.name: 25.0,
+    #AutoEvalColumn.mmlu.name: 25.0,
+    #AutoEvalColumn.truthfulqa.name: 25.0,
+    #AutoEvalColumn.winogrande.name: 50.0,
+    #AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
     AutoEvalColumn.model_type_symbol.name: None,
     AutoEvalColumn.architecture.name: None,
     AutoEvalColumn.weight_type.name: None,
-    AutoEvalColumn.params.name: None,
-    AutoEvalColumn.likes.name: None,
-    AutoEvalColumn.license.name: None,
-    AutoEvalColumn.still_on_hub.name: None,
-    AutoEvalColumn.moe.name: None
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    baseline_row[task.name] = task.value.baseline
+    if task.value.baseline is not None:
+        baseline_list.append(task.value.baseline)
+baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
@@ -107,19 +116,34 @@ human_baseline_row = {
     AutoEvalColumn.model.name: "<p>Human performance</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 92.75,
+    #AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.arc.name: 80.0,
-    AutoEvalColumn.hellaswag.name: 95.0,
-    AutoEvalColumn.mmlu.name: 89.8,
-    AutoEvalColumn.truthfulqa.name: 94.0,
-    AutoEvalColumn.winogrande.name: 94.0,
-    AutoEvalColumn.gsm8k.name: 100,
+    #AutoEvalColumn.arc.name: 80.0,
+    #AutoEvalColumn.hellaswag.name: 95.0,
+    #AutoEvalColumn.mmlu.name: 89.8,
+    #AutoEvalColumn.truthfulqa.name: 94.0,
+    #AutoEvalColumn.winogrande.name: 94.0,
+    #AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.model_type_symbol.name: None,
+    AutoEvalColumn.architecture.name: None,
+    AutoEvalColumn.weight_type.name: None,
+    AutoEvalColumn.params.name: 0,
+    AutoEvalColumn.likes.name: 0,
+    AutoEvalColumn.license.name: "",
+    AutoEvalColumn.still_on_hub.name: False,
+    AutoEvalColumn.moe.name: False
 }
 
+baseline_list = []
+for task in Tasks:
+    human_baseline_row[task.name] = task.value.human_baseline
+    if task.value.human_baseline is not None:
+        baseline_list.append(task.value.human_baseline)
+human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
+
 @dataclass
 class ModelDetails:
     name: str
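
The refactor pins each task's random baseline and human baseline to the Task dataclass itself and derives baseline_row / human_baseline_row (and their averages) from a loop over Tasks, so adding a benchmark no longer means editing hardcoded row dictionaries. A minimal, self-contained sketch of that pattern follows; AVERAGE_COL is a stand-in for AutoEvalColumn.average.name, and the new_task entry is purely hypothetical:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0        # random-guess baseline score for the task
    human_baseline: float = 0.0  # human performance score for the task

class Tasks(Enum):
    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
    # Adding a generic task is now a single enum entry, e.g. (hypothetical):
    # new_task = Task("my_benchmark", "acc", "MyBench", 25.0, 90.0)

AVERAGE_COL = "Average"  # stand-in for AutoEvalColumn.average.name

baseline_row = {}
baseline_list = []
for task in Tasks:
    baseline_row[task.name] = task.value.baseline
    if task.value.baseline is not None:
        baseline_list.append(task.value.baseline)
baseline_row[AVERAGE_COL] = round(sum(baseline_list) / len(baseline_list), 2)

print(baseline_row)  # per-task baselines plus the derived average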
src/leaderboard/read_evals.py CHANGED
@@ -66,6 +66,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
+            """
             # We skip old mmlu entries
             wrong_mmlu_version = False
             if task.benchmark == "hendrycksTest":
@@ -81,11 +82,12 @@ class EvalResult:
                 if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                     results[task.benchmark] = 0.0
                     continue
-
+            """
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
+
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
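
With the MMLU-version and TruthfulQA-NaN special cases disabled above, score extraction reduces to the generic averaging step: every entry in the results file whose key contains the task's benchmark name contributes its metric value, and the mean (scaled to 0-100) becomes the task score, which is what lets multi-subset suites such as hendrycksTest/MMLU work without per-task code. A small self-contained sketch follows, using a made-up data dict that only mimics the harness results layout:

from typing import Optional
import numpy as np

# Illustrative stand-in for a loaded results JSON; keys and scores are invented.
data = {
    "results": {
        "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.30},
        "harness|hendrycksTest-anatomy|5": {"acc": 0.50},
        "harness|arc:challenge|25": {"acc_norm": 0.62},
    }
}

def task_score(benchmark: str, metric: str) -> Optional[float]:
    # Average the metric over every results entry whose key mentions the benchmark.
    accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
    if accs.size == 0 or any(acc is None for acc in accs):
        return None
    return float(np.mean(accs) * 100.0)

print(task_score("hendrycksTest", "acc"))       # averages the two MMLU subsets (about 40.0)
print(task_score("arc:challenge", "acc_norm"))  # single-entry benchmark (about 62.0)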