Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Alina Lozovskaia
committed on
Commit
·
f86eaae
1
Parent(s):
87e47c2
Fixing WIP
Browse files
- src/display/utils.py +21 -0
- src/leaderboard/filter_models.py +0 -3
- src/leaderboard/read_evals.py +39 -36
src/display/utils.py
CHANGED
@@ -1,9 +1,30 @@
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
import json
|
|
|
|
|
4 |
import pandas as pd
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def load_json_data(file_path):
|
8 |
"""Safely load JSON data from a file."""
|
9 |
try:
|
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
import json
|
4 |
+
import logging
|
5 |
+
from datetime import datetime
|
6 |
import pandas as pd
|
7 |
|
8 |
|
9 |
+
# Configure logging
|
10 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
11 |
+
|
12 |
+
def parse_datetime(datetime_str):
    """Parse a timestamp string, trying each supported layout in turn.

    Supported layouts all use a ``T`` between date and time and require
    fractional seconds; they differ only in the time separator
    (dashes, colons or spaces).

    Parameters:
    - datetime_str: The timestamp text to parse.

    Returns:
    - datetime: The parsed value, or ``datetime(1970, 1, 1)`` when no layout
      matches (an error is logged in that case).
    """
    formats = (
        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
    )

    for fmt in formats:
        try:
            return datetime.strptime(datetime_str, fmt)
        except (ValueError, TypeError):
            # ValueError: the text does not match this layout.
            # TypeError: the value is not a string at all (e.g. None); treat it
            # like any other unparsable timestamp instead of crashing.
            continue

    # in rare cases set unix start time for files with incorrect time (legacy files)
    logging.error("No valid date format found for: %s", datetime_str)
    return datetime(1970, 1, 1)
|
27 |
+
|
28 |
def load_json_data(file_path):
|
29 |
"""Safely load JSON data from a file."""
|
30 |
try:
|
src/leaderboard/filter_models.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
-
import logging
|
2 |
from src.display.formatting import model_hyperlink
|
3 |
from src.display.utils import AutoEvalColumn
|
4 |
|
5 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
6 |
|
7 |
# Models which have been flagged by users as being problematic for a reason or another
|
8 |
# (Model name to forum discussion link)
|
@@ -141,7 +139,6 @@ def flag_models(leaderboard_data: list[dict]):
|
|
141 |
else:
|
142 |
flag_key = model_data[AutoEvalColumn.fullname.name]
|
143 |
if flag_key in FLAGGED_MODELS:
|
144 |
-
# logging.info(f"Flagged model: {flag_key}") # Do we need to print out the list of flagged models?
|
145 |
issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
|
146 |
issue_link = model_hyperlink(
|
147 |
FLAGGED_MODELS[flag_key],
|
|
|
|
|
1 |
from src.display.formatting import model_hyperlink
|
2 |
from src.display.utils import AutoEvalColumn
|
3 |
|
|
|
4 |
|
5 |
# Models which have been flagged by users as being problematic for a reason or another
|
6 |
# (Model name to forum discussion link)
|
|
|
139 |
else:
|
140 |
flag_key = model_data[AutoEvalColumn.fullname.name]
|
141 |
if flag_key in FLAGGED_MODELS:
|
|
|
142 |
issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
|
143 |
issue_link = model_hyperlink(
|
144 |
FLAGGED_MODELS[flag_key],
|
src/leaderboard/read_evals.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import json
|
2 |
from pathlib import Path
|
3 |
-
from datetime import datetime
|
4 |
from json import JSONDecodeError
|
5 |
import logging
|
6 |
import math
|
@@ -14,7 +13,7 @@ from tqdm.contrib.logging import logging_redirect_tqdm
|
|
14 |
import numpy as np
|
15 |
|
16 |
from src.display.formatting import make_clickable_model
|
17 |
-
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
|
18 |
|
19 |
# Configure logging
|
20 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
@@ -54,7 +53,14 @@ class EvalResult:
|
|
54 |
org_and_model = config.get("model_name", "").split("/", 1)
|
55 |
org = org_and_model[0] if len(org_and_model) > 1 else None
|
56 |
model = org_and_model[-1]
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
full_model = "/".join(org_and_model)
|
59 |
|
60 |
results = cls.extract_results(data) # Properly call the method to extract results
|
@@ -71,26 +77,39 @@ class EvalResult:
|
|
71 |
|
72 |
@staticmethod
|
73 |
def extract_results(data: Dict) -> Dict[str, float]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
results = {}
|
75 |
for task in Tasks:
|
76 |
-
|
77 |
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
continue
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
if math.isnan(float(task_metric_value)):
|
87 |
-
results[task_value.benchmark] = 0.0
|
88 |
-
continue
|
89 |
|
90 |
-
|
91 |
-
if accs:
|
92 |
-
mean_acc = np.mean(accs) * 100.0
|
93 |
-
results[task_value.benchmark] = mean_acc
|
94 |
|
95 |
return results
|
96 |
|
@@ -192,23 +211,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
192 |
return request_file
|
193 |
|
194 |
|
195 |
-
def parse_datetime(datetime_str):
|
196 |
-
formats = [
|
197 |
-
"%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
|
198 |
-
"%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
|
199 |
-
"%Y-%m-%dT%H %M %S.%f", # Spaces as separator
|
200 |
-
]
|
201 |
-
|
202 |
-
for fmt in formats:
|
203 |
-
try:
|
204 |
-
return datetime.strptime(datetime_str, fmt)
|
205 |
-
except ValueError:
|
206 |
-
continue
|
207 |
-
# in rare cases set unix start time for files with incorrect time (legacy files)
|
208 |
-
logging.error(f"No valid date format found for: {datetime_str}")
|
209 |
-
return datetime(1970, 1, 1)
|
210 |
-
|
211 |
-
|
212 |
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
|
213 |
"""From the path of the results folder root, extract all needed info for results"""
|
214 |
with open(dynamic_path) as f:
|
@@ -246,7 +248,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
246 |
v.to_dict() # we test if the dict version is complete
|
247 |
results.append(v)
|
248 |
except KeyError as e:
|
249 |
-
logging.error(f"Error while checking model {k}
|
250 |
continue
|
251 |
|
252 |
-
return results
|
|
|
|
1 |
import json
|
2 |
from pathlib import Path
|
|
|
3 |
from json import JSONDecodeError
|
4 |
import logging
|
5 |
import math
|
|
|
13 |
import numpy as np
|
14 |
|
15 |
from src.display.formatting import make_clickable_model
|
16 |
+
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
17 |
|
18 |
# Configure logging
|
19 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
53 |
org_and_model = config.get("model_name", "").split("/", 1)
|
54 |
org = org_and_model[0] if len(org_and_model) > 1 else None
|
55 |
model = org_and_model[-1]
|
56 |
+
if len(org_and_model) == 1:
|
57 |
+
org = None
|
58 |
+
model = org_and_model[0]
|
59 |
+
result_key = f"{model}_{precision.value.name}"
|
60 |
+
else:
|
61 |
+
org = org_and_model[0]
|
62 |
+
model = org_and_model[1]
|
63 |
+
result_key = f"{org}_{model}_{precision.value.name}"
|
64 |
full_model = "/".join(org_and_model)
|
65 |
|
66 |
results = cls.extract_results(data) # Properly call the method to extract results
|
|
|
77 |
|
78 |
@staticmethod
def extract_results(data: Dict) -> Dict[str, float]:
    """
    Extracts and computes average scores from test result data for different benchmarks.

    For every entry of ``Tasks``, the value stored under ``task.metric`` is collected from
    each item of ``data["results"]`` whose key contains ``task.benchmark``. The mean of
    those values, multiplied by 100, is stored under ``task.benchmark``.

    A benchmark is left out of the returned dictionary when:
    - it is "hendrycksTest" and ``data["versions"]`` lists one of the known MMLU keys
      with version 0 (old MMLU entries);
    - no key of ``data["results"]`` contains the benchmark name;
    - at least one matching result does not provide the metric.

    A NaN "truthfulqa:mc" value is recorded as 0.0 instead of being averaged.

    Parameters:
    - data (Dict): Input data with 'versions' and 'results'.

    Returns:
    - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
    """
    old_mmlu_keys = ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")
    truthfulqa_key = "harness|truthfulqa:mc|0"

    results = {}
    for task in Tasks:
        task = task.value

        # We skip old mmlu entries. The check must skip the *task*: a `continue`
        # placed inside a loop over the mmlu keys would only advance that inner loop.
        if task.benchmark == "hendrycksTest" and any(
            mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0 for mmlu_k in old_mmlu_keys
        ):
            continue

        # Some truthfulQA values are NaNs
        if task.benchmark == "truthfulqa:mc" and truthfulqa_key in data["results"]:
            if math.isnan(float(data["results"][truthfulqa_key][task.metric])):
                results[task.benchmark] = 0.0
                continue

        # We average all scores of a given metric (mostly for mmlu)
        accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
        # Nothing to average, or an incomplete set of scores: leave the benchmark out.
        if not accs or any(acc is None for acc in accs):
            continue

        results[task.benchmark] = np.mean(accs) * 100.0

    return results
|
115 |
|
|
|
211 |
return request_file
|
212 |
|
213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
|
215 |
"""From the path of the results folder root, extract all needed info for results"""
|
216 |
with open(dynamic_path) as f:
|
|
|
248 |
v.to_dict() # we test if the dict version is complete
|
249 |
results.append(v)
|
250 |
except KeyError as e:
|
251 |
+
logging.error(f"Error while checking model {k} {v.date} json, no key: {e}") # not all eval values present
|
252 |
continue
|
253 |
|
254 |
+
return results
|
255 |
+
|