Update space
Browse files- app.py +18 -1
- src/populate.py +9 -4
app.py
CHANGED
@@ -156,6 +156,7 @@ with demo:
|
|
156 |
AutoEvalColumn.rank_math_probability.name,
|
157 |
AutoEvalColumn.rank_reason_logical.name,
|
158 |
AutoEvalColumn.rank_reason_social.name,
|
|
|
159 |
],
|
160 |
rank_col=[],
|
161 |
)
|
@@ -313,7 +314,8 @@ with demo:
|
|
313 |
|
314 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
315 |
CURRENT_TEXT = """
|
316 |
-
|
|
|
317 |
We have diversely and aggressively collected recent science datasets, including but not limited to
|
318 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
319 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
@@ -323,6 +325,7 @@ with demo:
|
|
323 |
[SciEval](https://arxiv.org/abs/2308.13149).
|
324 |
"""
|
325 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
|
|
326 |
with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
|
327 |
leaderboard = overall_leaderboard(
|
328 |
get_model_leaderboard_df(
|
@@ -340,6 +343,20 @@ with demo:
|
|
340 |
)
|
341 |
)
|
342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
344 |
CURRENT_TEXT = """
|
345 |
# Coming soon!
|
|
|
156 |
AutoEvalColumn.rank_math_probability.name,
|
157 |
AutoEvalColumn.rank_reason_logical.name,
|
158 |
AutoEvalColumn.rank_reason_social.name,
|
159 |
+
# AutoEvalColumn.rank_chemistry.name,
|
160 |
],
|
161 |
rank_col=[],
|
162 |
)
|
|
|
314 |
|
315 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
316 |
CURRENT_TEXT = """
|
317 |
+
Science domain is a critical area for evaluating LLMs.
|
318 |
+
We are working on adding several tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
|
319 |
We have diversely and aggressively collected recent science datasets, including but not limited to
|
320 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
321 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
|
|
325 |
[SciEval](https://arxiv.org/abs/2308.13149).
|
326 |
"""
|
327 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
328 |
+
|
329 |
with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
|
330 |
leaderboard = overall_leaderboard(
|
331 |
get_model_leaderboard_df(
|
|
|
343 |
)
|
344 |
)
|
345 |
|
346 |
+
with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
|
347 |
+
CURRENT_TEXT = """
|
348 |
+
# Coming soon!
|
349 |
+
"""
|
350 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
351 |
+
|
352 |
+
|
353 |
+
with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
|
354 |
+
CURRENT_TEXT = """
|
355 |
+
# Coming soon!
|
356 |
+
"""
|
357 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
358 |
+
|
359 |
+
|
360 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
361 |
CURRENT_TEXT = """
|
362 |
# Coming soon!
|
src/populate.py
CHANGED
@@ -25,11 +25,16 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
25 |
df = df.dropna(subset=benchmark_cols)
|
26 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
27 |
# print(rank_col)
|
28 |
-
else:
|
29 |
-
|
30 |
-
df[
|
31 |
-
df
|
|
|
|
|
|
|
|
|
32 |
df = df.fillna('--')
|
|
|
33 |
rank = np.arange(1, len(df)+1)
|
34 |
df.insert(0, 'Rank', rank)
|
35 |
|
|
|
25 |
df = df.dropna(subset=benchmark_cols)
|
26 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
27 |
# print(rank_col)
|
28 |
+
else:
|
29 |
+
# when rank_col, the first in benchmark_cols, is empty, sort by averaging all the benchmarks except the first one
|
30 |
+
avg_rank = df.iloc[:, 1:].mean(axis=1)
|
31 |
+
df["Average Rank"] = avg_rank.round(decimals=4)
|
32 |
+
df = df.sort_values(by=["Average Rank"], ascending=True)
|
33 |
+
df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
|
34 |
+
|
35 |
+
# we'll skip NaN, instead of deleting the whole row
|
36 |
df = df.fillna('--')
|
37 |
+
# insert a rank column
|
38 |
rank = np.arange(1, len(df)+1)
|
39 |
df.insert(0, 'Rank', rank)
|
40 |
|