Commit c74b7d7 by Alina Lozovskaia (1 parent: f86eaae)

Changes as per comments

Files changed (2):
  1. app.py +16 -0
  2. src/leaderboard/read_evals.py +23 -15
app.py CHANGED
@@ -50,6 +50,9 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -57,6 +60,19 @@ enable_space_ci()
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+
+
+@time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
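The second hunk cuts off right after `attempt = 0`, so the body of the retry loop is not part of this diff. For orientation only, here is a minimal sketch of the pattern the new signature suggests: a download retried with exponential backoff (`backoff_factor ** attempt` seconds between tries), wrapped in the timing decorator added above. The `snapshot_download` backend, the exception handling, and the log messages are assumptions, not code from this commit.

```python
import logging
import time

from huggingface_hub import snapshot_download  # assumed download backend, not shown in the diff

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def time_diff_wrapper(func):
    """Log the wall-clock time of the wrapped call, mirroring the decorator added in app.py."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        logging.info("Time taken for %s: %s seconds", func.__name__, time.time() - start_time)
        return result
    return wrapper


@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Hypothetical exponential-backoff retry loop around a dataset download."""
    attempt = 0
    while attempt < max_attempts:
        try:
            snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=local_dir)
            return local_dir
        except Exception as exc:  # the real code may catch narrower errors
            wait = backoff_factor**attempt
            logging.warning("Download of %s failed (%s); retrying in %.1f s", repo_id, exc, wait)
            time.sleep(wait)
            attempt += 1
    raise RuntimeError(f"Could not download {repo_id} after {max_attempts} attempts")
```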
src/leaderboard/read_evals.py CHANGED
@@ -78,39 +78,47 @@ class EvalResult:
     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
         """
-        Extracts and computes average scores from test result data for different benchmarks.
-        Skips entries based on specific conditions and handles NaN values appropriately.
-        Returns a dictionary with benchmarks as keys and their averaged scores as values in percentage.
+        Extract and process benchmark results from a given dict.
 
         Parameters:
-        - data (Dict): Input data with 'versions' and 'results'.
+        - data (Dict): A dictionary containing benchmark data. This dictionary must
+          include 'versions' and 'results' keys with respective sub-data.
 
         Returns:
-        - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
+        - Dict[str, float]: A dictionary where keys are benchmark names and values
+          are the processed average scores as percentages.
+
+        Notes:
+        - The method specifically checks for certain benchmark names to skip outdated entries.
+        - Handles NaN values by setting the corresponding benchmark result to 0.0.
+        - Averages scores across metrics for benchmarks found in the data, in percentage format.
         """
         results = {}
         for task in Tasks:
             task = task.value
-
             # We skip old mmlu entries
             if task.benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         continue
 
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
+            # Some benchmark values are NaNs, mostly for truthfulQA
+            # This would be more optimal (no iteration over the whole results dict) if the benchmark
+            # name matched the key in results, e.g. truthfulqa:mc rather than harness|truthfulqa:mc|0
+            for k, v in data["results"].items():
+                if task.benchmark in k:
+                    if math.isnan(float(v[task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
 
             # We average all scores of a given metric (mostly for mmlu)
-            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
-            if accs or any([acc is None for acc in accs]):
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            results[task.benchmark] = np.mean(accs) * 100.0
-
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
         return results
 
 
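To make the averaging step concrete, the following toy, self-contained snippet reproduces just that pattern on a made-up results dict; the keys, scores, and the `(benchmark, metric)` pair are illustrative stand-ins for the `Tasks` enum entries, not real leaderboard data.

```python
import numpy as np

# Made-up stand-in for data["results"]: mmlu is split across many "harness|hendrycksTest-*|5" keys.
results_dict = {
    "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.25},
    "harness|hendrycksTest-anatomy|5": {"acc": 0.75},
    "harness|arc:challenge|25": {"acc_norm": 0.62},
}

benchmark, metric = "hendrycksTest", "acc"  # one (benchmark, metric) pair, as in a Tasks entry

# Same pattern as the new code: gather every sub-task score for this benchmark,
# bail out if any score is missing, then report the mean as a percentage.
accs = np.array([v.get(metric, None) for k, v in results_dict.items() if benchmark in k])
if accs.size == 0 or any(acc is None for acc in accs):
    raise ValueError(f"missing '{metric}' scores for {benchmark}")

print(np.mean(accs) * 100.0)  # 50.0 -> the two mmlu sub-task accuracies averaged, in percent
```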