Update space
- app.py +18 -3
- src/display/utils.py +4 -1
- src/leaderboard/read_evals.py +5 -1
- src/populate.py +5 -1
app.py CHANGED

@@ -102,7 +102,8 @@ def init_leaderboard(dataframe):
     # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
-    model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+    # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+    model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
     # model_leaderboard_df = get_model_leaderboard_df(model_result_path)

@@ -312,7 +313,6 @@ with demo:

     with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
         CURRENT_TEXT = """
-        # Coming soon!
         We are working on adding more tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
         We have diversely and aggressively collected recent science datasets, including but not limited to
         [GPQA](https://arxiv.org/abs/2311.12022),

@@ -323,7 +323,22 @@ with demo:
         [SciEval](https://arxiv.org/abs/2308.13149).
         """
         gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
+        with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
+            leaderboard = overall_leaderboard(
+                get_model_leaderboard_df(
+                    model_result_path,
+                    benchmark_cols=[
+                        AutoEvalColumn.rank_chemistry.name,
+                        AutoEvalColumn.model.name,
+                        AutoEvalColumn.score_chemistry.name,
+                        # AutoEvalColumn.sd_reason_social.name,
+                        AutoEvalColumn.license.name,
+                        AutoEvalColumn.organization.name,
+                        AutoEvalColumn.knowledge_cutoff.name,
+                    ],
+                    rank_col=[AutoEvalColumn.rank_chemistry.name],
+                )
+            )

     with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
         CURRENT_TEXT = """
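The new Chemistry subtab follows the same pattern as the existing domain tabs: get_model_leaderboard_df selects a handful of columns from the results file and sorts by the chemistry rank column, and the resulting dataframe is handed to overall_leaderboard. Below is a minimal sketch of that column-selection-and-sort step in isolation; the chemistry headers come from the diff, while the chemistry_view helper, the other headers, and the sample rows are invented for illustration.

    # Minimal sketch of the data path behind the Chemistry subtab, assuming
    # get_model_leaderboard_df keeps only benchmark_cols, drops rows without
    # chemistry results, and sorts by the rank column. Sample rows are invented.
    import pandas as pd

    def chemistry_view(df: pd.DataFrame) -> pd.DataFrame:
        benchmark_cols = ["Rank (Chemistry)", "Model", "Score (Chemistry)",
                          "License", "Organization", "Knowledge Cutoff"]
        rank_col = ["Rank (Chemistry)"]
        view = df[benchmark_cols].dropna(subset=benchmark_cols)
        return view.sort_values(by=rank_col[0], ascending=True)

    sample = pd.DataFrame([
        {"Model": "model-a", "Rank (Chemistry)": 2, "Score (Chemistry)": 71.3,
         "License": "MIT", "Organization": "Org A", "Knowledge Cutoff": "2024-06"},
        {"Model": "model-b", "Rank (Chemistry)": 1, "Score (Chemistry)": 78.9,
         "License": "Apache-2.0", "Organization": "Org B", "Knowledge Cutoff": "2024-04"},
    ])
    print(chemistry_view(sample))  # model-b comes first, since it holds rank 1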
src/display/utils.py CHANGED

@@ -63,7 +63,7 @@ auto_eval_column_dict.append(["score", ColumnContent, field(default_factory=lamb
 auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
 auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])

-# fine-
+# fine-grained dimensions
 auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
 auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
 auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])

@@ -85,6 +85,9 @@ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(defa
 auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])

+auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
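Each of the three new chemistry entries follows the existing auto_eval_column_dict convention: a [attribute_name, ColumnContent, field(...)] triple whose ColumnContent carries the display header. A minimal sketch of that pattern is below, assuming (as in the common leaderboard template) that the list is later turned into an AutoEvalColumn dataclass with make_dataclass; the ColumnContent fields shown here are simplified.

    # Sketch of the auto_eval_column_dict pattern the chemistry columns plug into.
    # ColumnContent is simplified, and the make_dataclass conversion is assumed
    # to mirror the standard leaderboard template.
    from dataclasses import dataclass, field, make_dataclass

    @dataclass
    class ColumnContent:
        name: str                    # header shown in the table
        type: str                    # gradio column type, e.g. "number"
        displayed_by_default: bool

    auto_eval_column_dict = []
    auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
    auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
    auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

    AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
    print(AutoEvalColumn().score_chemistry.name)   # -> "Score (Chemistry)"

This is why the rest of the change can refer to the new columns as AutoEvalColumn.score_chemistry.name, AutoEvalColumn.sd_chemistry.name, and AutoEvalColumn.rank_chemistry.name.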
src/leaderboard/read_evals.py CHANGED

@@ -115,7 +115,7 @@ class ModelResult:
                 new_v[kk] = vv

             new_results[k] = new_v
-
+
         # Extract results available in this file (some results are split in several files)
         # results = {}
         # for domain in Domains:

@@ -185,6 +185,10 @@ class ModelResult:
            AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
            AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),

+           AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
+           AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
+           AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
+
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.organization.name: self.org,
            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
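The chemistry lookups in read_evals.py are guarded: self.results.get("Chemistry") can return None for result files that predate the chemistry benchmark, so each field falls back to None through a conditional expression instead of raising AttributeError. A small standalone sketch of that guard, with the results-dict shape inferred from the keys used in the diff and the sample dicts invented:

    # Guarded lookups for the new chemistry fields: fetch the "Chemistry" block
    # first, and only read its metrics if it exists. Keys match the diff; the
    # sample dicts are illustrative.
    def chemistry_fields(results: dict) -> dict:
        chem = results.get("Chemistry")
        return {
            "Score (Chemistry)": chem.get("Average Score", None) if chem else None,
            "Std dev (Chemistry)": chem.get("Standard Deviation", None) if chem else None,
            "Rank (Chemistry)": chem.get("Rank", None) if chem else None,
        }

    with_chem = {"Chemistry": {"Average Score": 71.3, "Standard Deviation": 1.2, "Rank": 4}}
    without_chem = {"Logical": {"Rank": 7}}
    print(chemistry_fields(with_chem))     # real values
    print(chemistry_fields(without_chem))  # all None, no AttributeError

Note that the neighbouring lookups such as self.results.get("Logical").get("Rank", None) are not guarded the same way and would still fail if that domain were missing from a results file.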
src/populate.py CHANGED

@@ -24,6 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
+        # print(rank_col)
     else: # when rank_col is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1) # we'll skip NaN, instrad of deleting the whole row
         df["Average Rank"] = avg_rank

@@ -37,7 +38,10 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         # print(col)
         # if 'Std dev' in col or 'Score' in col:
         if 'Std dev' in col or 'Score' in col:
-
+            if "Chemistry" in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+            else:
+                df[col] = (df[col]*100).map('{:.2f}'.format)
             # df[col] = df[col].round(decimals=2)

     # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
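The formatting branch added in populate.py treats chemistry differently from the other metrics: the existing Score and Std dev columns are stored as fractions and scaled by 100 before formatting, while the chemistry values are formatted as-is, which implies they already arrive on a 0-100 scale. A short sketch of the effect, with invented sample data:

    # Per-column formatting rule from populate.py: most Score / Std dev columns
    # are fractions and get scaled by 100, but Chemistry columns are only
    # rounded to two decimals. Sample data is invented for illustration.
    import pandas as pd

    df = pd.DataFrame({
        "Score (Math Algebra)": [0.8123, 0.4567],   # stored as fractions
        "Score (Chemistry)": [71.25, 45.6],          # assumed already 0-100
    })

    for col in df.columns:
        if 'Std dev' in col or 'Score' in col:
            if "Chemistry" in col:
                df[col] = df[col].map('{:.2f}'.format)
            else:
                df[col] = (df[col] * 100).map('{:.2f}'.format)

    print(df)  # Algebra -> "81.23", "45.67"; Chemistry -> "71.25", "45.60"

Note that this step converts the columns to strings, so any numeric sorting has to happen beforehand, which is why the sort by rank_col runs earlier in get_model_leaderboard_df.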