lvkaokao committed
Commit b778b1a
Parent: 715b290

revise read.

Files changed (1)
  1. src/leaderboard/read_evals.py +4 -18
src/leaderboard/read_evals.py CHANGED
@@ -83,24 +83,10 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-            # We skip old mmlu entries
-            wrong_mmlu_version = False
-            if task.benchmark == "hendrycksTest":
-                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
-                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-                        wrong_mmlu_version = True
-
-            if wrong_mmlu_version:
-                continue
-
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
-
-            # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if task.benchmark == "mmlu":
+                accs = np.array([data["results"]["harness|mmlu|0"][task.metric]])
+            else:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
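For context, a minimal, self-contained sketch of the aggregation loop after this change, assuming a harness-style data["results"] payload. The Task stub (replacing the real Tasks enum), the sample scores, and the final np.mean(accs) * 100.0 step are illustrative assumptions, not part of this diff.

import numpy as np

# Hypothetical stand-in for the real Tasks enum; only the fields the loop uses.
class Task:
    def __init__(self, benchmark, metric):
        self.benchmark = benchmark
        self.metric = metric

tasks = [Task("mmlu", "acc"), Task("truthfulqa:mc", "mc2")]

# Sample harness-style payload (scores are made up).
data = {
    "results": {
        "harness|mmlu|0": {"acc": 0.625},
        "harness|truthfulqa:mc|0": {"mc2": 0.5},
    }
}

results = {}
for task in tasks:
    if task.benchmark == "mmlu":
        # New behavior: read the single pre-aggregated mmlu score directly.
        accs = np.array([data["results"]["harness|mmlu|0"][task.metric]])
    else:
        # Unchanged path: collect every entry whose key matches the benchmark.
        accs = np.array([v.get(task.metric, None)
                         for k, v in data["results"].items()
                         if task.benchmark in k])
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    # Assumed downstream step: average and convert to a percentage.
    results[task.benchmark] = float(np.mean(accs) * 100.0)

print(results)  # {'mmlu': 62.5, 'truthfulqa:mc': 50.0}

Reading harness|mmlu|0 directly sidesteps the per-subject hendrycksTest averaging, which is what made the removed version check and truthfulQA NaN special case necessary in the old code.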