Update space
- app.py +3 -2
- src/about.py +16 -1
- src/display/utils.py +57 -16
- src/leaderboard/read_evals.py +50 -18
- src/populate.py +17 -1
app.py CHANGED
@@ -25,7 +25,7 @@ from src.display.utils import (
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df
 from src.submission.submit import add_new_eval


@@ -97,6 +97,7 @@ def init_leaderboard(dataframe):
     )


+model_leaderboard_df = get_model_leaderboard_df()

 def overall_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -124,7 +125,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:

         with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard =
+            leaderboard = init_leaderboard(LEADERBOARD_DF)


         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
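A note on the wiring: the commit builds model_leaderboard_df at module level while the Overview tab still renders LEADERBOARD_DF, so the new dataframe is presumably destined for the "🎯 Overall" tab via overall_leaderboard. Also, get_model_leaderboard_df() is called here with no arguments even though the version added to src/populate.py below takes four, so the call site still needs the results path and column lists (or defaults). A minimal, self-contained sketch of the intended tab layout, not part of the commit; gr.Dataframe stands in for whatever overall_leaderboard renders, and the sample dataframe is invented:

# Sketch only (assumptions noted above): render one dataframe per tab.
import gradio as gr
import pandas as pd

model_leaderboard_df = pd.DataFrame({"Model": ["example-org/example-model"], "Overall": [2.5]})

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
            gr.Markdown("init_leaderboard(LEADERBOARD_DF) renders here")
        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
            gr.Dataframe(value=model_leaderboard_df)  # stand-in for overall_leaderboard(...)

if __name__ == "__main__":
    demo.launch()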
src/about.py CHANGED
@@ -1,6 +1,21 @@
 from dataclasses import dataclass
 from enum import Enum

+@dataclass
+class Domain:
+    dimension: str
+    metric: str
+    col_name: str
+
+
+class Domains(Enum):
+    # dimension_key in the json file, metric_key in the json file, name to display in the leaderboard
+    dim0 = Domain("overall", "Avg Rank", "Overall")
+    # dim1 = Task("math", "acc", "Math")
+    # dim2 = Task("math_algebra", "acc", "Algebra")
+
+
+
 @dataclass
 class Task:
     benchmark: str
@@ -14,7 +29,7 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
-
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

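The new Domain/Domains pair mirrors the existing Task/Tasks pattern: dimension is the key the reader stores results under, metric ("Avg Rank") is the key looked up in each result file, and col_name is the leaderboard column header. A small self-contained sketch of how the enum is consumed; the sample results payload is invented, and the two-step lookup mirrors what src/leaderboard/read_evals.py does below:

# Self-contained sketch (not part of the commit): walking the Domains enum to
# turn a raw "results" payload into display columns.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Domain:
    dimension: str   # dimension_key in the json file
    metric: str      # metric_key in the json file
    col_name: str    # name to display in the leaderboard


class Domains(Enum):
    dim0 = Domain("overall", "Avg Rank", "Overall")


raw_results = {"Avg Rank": 2.5}  # invented example payload

# key internally by dimension, display by col_name -- the same two hops the reader takes
internal = {d.value.dimension: raw_results.get(d.value.metric) for d in Domains}
row = {d.value.col_name: internal[d.value.dimension] for d in Domains}
print(row)  # {'Overall': 2.5}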
src/display/utils.py CHANGED
@@ -1,9 +1,9 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
 from enum import Enum

 import pandas as pd

-from src.about import Tasks
+from src.about import Tasks, Domains

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -22,26 +22,67 @@ class ColumnContent:

 ## Leaderboard columns
 auto_eval_column_dict = []
+# # Init
+# auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+
+# # new columns
+# for domain in Domains:
+#     auto_eval_column_dict.append([domain.name, ColumnContent, ColumnContent(domain.value.col_name, "number", True)])
+
+# auto_eval_column_dict.append(["organization", ColumnContent, ColumnContent("Organization", "str", False)])
+# auto_eval_column_dict.append(["knowledge_cutoff", ColumnContent, ColumnContent("Knowledge cutoff", "str", False)])
+
+
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
+# #Scores
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# # Model information
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
 # Init
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-
-
+auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
+auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
+
+# new columns
+for domain in Domains:
+    auto_eval_column_dict.append([domain.name, ColumnContent, field(default_factory=lambda: ColumnContent(domain.value.col_name, "number", True))])
+
+auto_eval_column_dict.append(["organization", ColumnContent, field(default_factory=lambda: ColumnContent("Organization", "str", False))])
+auto_eval_column_dict.append(["knowledge_cutoff", ColumnContent, field(default_factory=lambda: ColumnContent("Knowledge cutoff", "str", False))])
+
+
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
+auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
+auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
+auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
+auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
+auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False))])
+auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+# print all attributes of AutoEvalColumn
+print(AutoEvalColumn.__annotations__.keys())
+# preint precision attribute
+print(AutoEvalColumn.precision)
+

 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
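The substantive change in this file is wrapping every column default in field(default_factory=...). On Python 3.11+, dataclasses reject an unhashable instance (such as a plain ColumnContent) as a field default, which is the failure the old one-liners would hit; the factory sidesteps it. A self-contained sketch of both behaviours, using a stand-in ColumnContent that only approximates the class defined earlier in this file:

# Self-contained sketch (not part of the commit): why field(default_factory=...)
# is needed, plus one caveat about class-level access afterwards.
from dataclasses import dataclass, field, make_dataclass
from dataclasses import fields as dc_fields


@dataclass
class ColumnContent:  # stand-in approximating the ColumnContent in this file
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# On Python 3.11+ the plain-instance default raises
# ValueError ("mutable default ... use default_factory"):
#   make_dataclass("Broken", [("model", ColumnContent, ColumnContent("Model", "markdown", True))])

AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [("model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True)))],
    frozen=True,
)

# Caveat: per the dataclasses docs, a field with only a default_factory has its
# class attribute removed, so the column is reachable on an instance rather than
# on the class itself:
print(AutoEvalColumn().model.name)                  # "Model"
print([f.name for f in dc_fields(AutoEvalColumn)])  # ['model']

Two things worth double-checking against this hunk: the debug prints it adds read AutoEvalColumn.precision at class level, which the caveat above affects, and the lambdas created inside the for-loops close over the loop variable, so if those factories are ever invoked they will all see the final domain/task; binding with a default argument (lambda domain=domain: ...) avoids that.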
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Domains
 from src.submission.check_validity import is_model_on_hub


@@ -18,34 +18,57 @@ class ModelResult:
     """
     eval_name: str
     full_model: str
+    org: str
+    model: str
+    results: dict
+    license: str = "?"
+    knowledge_cutoff: str = ""

     @classmethod
-    def
-        try:
-            with open(json_filepath) as fp:
-                data = json.load(fp)
-        except:
-            data = eval(open(json_filepath).read()) # a list of dicts
-
+    def init_from_json_dict(self, data):

+        config = data.get("config")
+        # Get model and org
+        model = config.get("model_name")
+        org = config.get("org_name")
+        license = config.get("license")
+        knowledge_cutoff = config.get("knowledge_cutoff")
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for domain in Domains:
+            domain = domain.value
+            results[domain.dimension] = data.get("results").get(domain.metric)

-
-
+        return self(
+            eval_name=f"{org}_{model}",
+            full_model=f"{org}/{model}",
+            org=org,
+            model=model,
+            results=results,
+            license=license,
+            knowledge_cutoff=knowledge_cutoff
+        )

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
+
+        average = 1 / self.results[Domains.dim0.dimension] if self.results[Domains.dim0.dimension] != 0 else 0
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.organization.name: self.org,
+            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
+
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
@@ -54,6 +77,9 @@ class ModelResult:
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]

+        for domain in Domains:
+            data_dict[domain.value.col_name] = self.results[domain.value.dimension]
+
         return data_dict

 @dataclass
@@ -154,6 +180,7 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        print(AutoEvalColumn.precision.name, self.precision.value.name)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -242,14 +269,19 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

 def get_raw_model_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
-
+
+    try:
+        with open(results_path) as fp:
+            data = json.load(fp)
+    except:
+        data = eval(open(results_path).read()) # a list of dicts

     eval_results = {}
-
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)

+    for result in data:
+        # Creation of result
+        eval_result = ModelResult.init_from_json_dict(result)
+
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
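For reference, the file shape that init_from_json_dict and get_raw_model_results imply: one JSON file containing a list of entries, each with a "config" block (model_name, org_name, license, knowledge_cutoff) and a "results" block keyed by the Domain metric names ("Avg Rank" for Domains.dim0). A sketch with invented values; only the key names come from the code above:

# Sketch of the expected input (not from the commit; all concrete values are invented).
import json

example_entries = [
    {
        "config": {
            "model_name": "example-model",   # hypothetical
            "org_name": "example-org",       # hypothetical
            "license": "apache-2.0",         # hypothetical
            "knowledge_cutoff": "2023-10",   # hypothetical
        },
        "results": {
            "Avg Rank": 2.5,                 # metric key declared by Domains.dim0
        },
    },
]

with open("demo_model_results.json", "w") as fp:
    json.dump(example_entries, fp, indent=2)

# get_raw_model_results("demo_model_results.json") is then meant to build one
# ModelResult per entry (eval_name "example-org_example-model"); to_dict() exposes
# the rank as the "Overall" column and as average = 1 / rank for descending sorting.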
src/populate.py CHANGED
@@ -5,12 +5,28 @@ import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results


+def get_model_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_model_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    # df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    # raw_data = get_raw_model_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)