gblazex committed on
Commit 23ad4cc · 1 Parent(s): 16eb640

Update src/leaderboard/read_evals.py

Files changed (1)
  1. src/leaderboard/read_evals.py +17 -2
src/leaderboard/read_evals.py CHANGED
@@ -76,9 +76,24 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
+            # We skip old mmlu entries
+            wrong_mmlu_version = False
+            if task.benchmark == "hendrycksTest":
+                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
+                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
+                        wrong_mmlu_version = True
+
+            if wrong_mmlu_version:
+                continue
+
+            # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
+                    continue
 
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # We average all scores of a given metric (mostly for mmlu)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
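
For context, a minimal self-contained sketch of how the two new guards and the relaxed key match behave on a toy results dict. This is not the repository's actual EvalResult parsing code; the harness-style keys and the metric names acc_norm/mc2 are assumptions for illustration only.

import math

import numpy as np

# Toy stand-in for one parsed results JSON file. The "harness|..." key style and
# the metric names (acc_norm, mc2) are illustrative assumptions, not taken from
# this commit.
data = {
    "versions": {"harness|hendrycksTest-abstract_algebra|5": 1},  # version 0 would be skipped
    "results": {
        "harness|hendrycksTest-abstract_algebra|5": {"acc_norm": 0.30},
        "harness|hendrycksTest-anatomy|5": {"acc_norm": 0.50},
        "harness|truthfulqa:mc|0": {"mc2": float("nan")},
    },
}

# MMLU: the new substring match ("hendrycksTest" in key) averages every sub-task
# score, whereas the old exact-match comparison found nothing to average.
mmlu_scores = np.array(
    [v.get("acc_norm", None) for k, v in data["results"].items() if "hendrycksTest" in k]
)
print(mmlu_scores.mean())  # 0.4

# TruthfulQA: a NaN metric is recorded as 0.0 instead of propagating NaN into
# the model's aggregate score.
tqa = float(data["results"]["harness|truthfulqa:mc|0"]["mc2"])
print(0.0 if math.isnan(tqa) else tqa)  # 0.0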