Commit c74b7d7 by Alina Lozovskaia (1 parent: f86eaae)

Changes as per comments

Files changed (2):
  1. app.py +16 -0
  2. src/leaderboard/read_evals.py +23 -15
app.py CHANGED
@@ -50,6 +50,9 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -57,6 +60,19 @@ enable_space_ci()
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+
+
+@time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
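The second hunk cuts off right after `attempt = 0`, so the body of the retry loop is not part of this diff. For orientation only, here is a minimal sketch of the pattern the new signature suggests: a download retried with exponential backoff (`backoff_factor ** attempt` seconds between tries), wrapped in the timing decorator added above. The `snapshot_download` backend, the exception handling, and the log messages are assumptions, not code from this commit.

```python
import logging
import time

from huggingface_hub import snapshot_download  # assumed download backend, not shown in the diff

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def time_diff_wrapper(func):
    """Log the wall-clock time of the wrapped call, mirroring the decorator added in app.py."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        logging.info("Time taken for %s: %s seconds", func.__name__, time.time() - start_time)
        return result
    return wrapper


@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Hypothetical exponential-backoff retry loop around a dataset download."""
    attempt = 0
    while attempt < max_attempts:
        try:
            snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=local_dir)
            return local_dir
        except Exception as exc:  # the real code may catch narrower errors
            wait = backoff_factor**attempt
            logging.warning("Download of %s failed (%s); retrying in %.1f s", repo_id, exc, wait)
            time.sleep(wait)
            attempt += 1
    raise RuntimeError(f"Could not download {repo_id} after {max_attempts} attempts")
```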
src/leaderboard/read_evals.py CHANGED
@@ -78,39 +78,47 @@ class EvalResult:
     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
         """
-        Extracts and computes average scores from test result data for different benchmarks.
-        Skips entries based on specific conditions and handles NaN values appropriately.
-        Returns a dictionary with benchmarks as keys and their averaged scores as values in percentage.
+        Extract and process benchmark results from a given dict.
 
         Parameters:
-        - data (Dict): Input data with 'versions' and 'results'.
+        - data (Dict): A dictionary containing benchmark data. This dictionary must
+          include 'versions' and 'results' keys with respective sub-data.
 
         Returns:
-        - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
+        - Dict[str, float]: A dictionary where keys are benchmark names and values
+          are the processed average scores as percentages.
+
+        Notes:
+        - The method specifically checks for certain benchmark names to skip outdated entries.
+        - Handles NaN values by setting the corresponding benchmark result to 0.0.
+        - Averages scores across metrics for benchmarks found in the data, in percentage format.
         """
         results = {}
         for task in Tasks:
             task = task.value
-
             # We skip old mmlu entries
             if task.benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         continue
 
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
+            # Some benchmark values are NaNs, mostly for truthfulQA
+            # This would be more optimal (no iteration over the whole results dict) if the benchmark
+            # name matched the key in results, e.g. truthfulqa:mc rather than harness|truthfulqa:mc|0
+            for k, v in data["results"].items():
+                if task.benchmark in k:
+                    if math.isnan(float(v[task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
 
             # We average all scores of a given metric (mostly for mmlu)
-            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
-            if accs or any([acc is None for acc in accs]):
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            results[task.benchmark] = np.mean(accs) * 100.0
-
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
         return results
 
 
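To make the averaging step concrete, the following toy, self-contained snippet reproduces just that pattern on a made-up results dict; the keys, scores, and the `(benchmark, metric)` pair are illustrative stand-ins for the `Tasks` enum entries, not real leaderboard data.

```python
import numpy as np

# Made-up stand-in for data["results"]: mmlu is split across many "harness|hendrycksTest-*|5" keys.
results_dict = {
    "harness|hendrycksTest-abstract_algebra|5": {"acc": 0.25},
    "harness|hendrycksTest-anatomy|5": {"acc": 0.75},
    "harness|arc:challenge|25": {"acc_norm": 0.62},
}

benchmark, metric = "hendrycksTest", "acc"  # one (benchmark, metric) pair, as in a Tasks entry

# Same pattern as the new code: gather every sub-task score for this benchmark,
# bail out if any score is missing, then report the mean as a percentage.
accs = np.array([v.get(metric, None) for k, v in results_dict.items() if benchmark in k])
if accs.size == 0 or any(acc is None for acc in accs):
    raise ValueError(f"missing '{metric}' scores for {benchmark}")

print(np.mean(accs) * 100.0)  # 50.0 -> the two mmlu sub-task accuracies averaged, in percent
```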