Update space
- app.py +18 -3
- src/display/utils.py +4 -1
- src/leaderboard/read_evals.py +5 -1
- src/populate.py +5 -1
app.py CHANGED

@@ -102,7 +102,8 @@ def init_leaderboard(dataframe):
     # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
-    model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+    # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+    model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
     # model_leaderboard_df = get_model_leaderboard_df(model_result_path)

@@ -312,7 +313,6 @@ with demo:

     with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
         CURRENT_TEXT = """
-        # Coming soon!
         We are working on adding more tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
         We have diversely and aggressively collected recent science datasets, including but not limited to
         [GPQA](https://arxiv.org/abs/2311.12022),

@@ -323,7 +323,22 @@ with demo:
         [SciEval](https://arxiv.org/abs/2308.13149).
         """
         gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
+        with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
+            leaderboard = overall_leaderboard(
+                get_model_leaderboard_df(
+                    model_result_path,
+                    benchmark_cols=[
+                        AutoEvalColumn.rank_chemistry.name,
+                        AutoEvalColumn.model.name,
+                        AutoEvalColumn.score_chemistry.name,
+                        # AutoEvalColumn.sd_reason_social.name,
+                        AutoEvalColumn.license.name,
+                        AutoEvalColumn.organization.name,
+                        AutoEvalColumn.knowledge_cutoff.name,
+                    ],
+                    rank_col=[AutoEvalColumn.rank_chemistry.name],
+                )
+            )

     with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
         CURRENT_TEXT = """
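The new Chemistry subtab follows the same pattern as the existing domain tabs: get_model_leaderboard_df selects a handful of columns from the results file and sorts by the chemistry rank column, and the resulting dataframe is handed to overall_leaderboard. Below is a minimal sketch of that column-selection-and-sort step in isolation; the chemistry headers come from the diff, while the chemistry_view helper, the other headers, and the sample rows are invented for illustration.

    # Minimal sketch of the data path behind the Chemistry subtab, assuming
    # get_model_leaderboard_df keeps only benchmark_cols, drops rows without
    # chemistry results, and sorts by the rank column. Sample rows are invented.
    import pandas as pd

    def chemistry_view(df: pd.DataFrame) -> pd.DataFrame:
        benchmark_cols = ["Rank (Chemistry)", "Model", "Score (Chemistry)",
                          "License", "Organization", "Knowledge Cutoff"]
        rank_col = ["Rank (Chemistry)"]
        view = df[benchmark_cols].dropna(subset=benchmark_cols)
        return view.sort_values(by=rank_col[0], ascending=True)

    sample = pd.DataFrame([
        {"Model": "model-a", "Rank (Chemistry)": 2, "Score (Chemistry)": 71.3,
         "License": "MIT", "Organization": "Org A", "Knowledge Cutoff": "2024-06"},
        {"Model": "model-b", "Rank (Chemistry)": 1, "Score (Chemistry)": 78.9,
         "License": "Apache-2.0", "Organization": "Org B", "Knowledge Cutoff": "2024-04"},
    ])
    print(chemistry_view(sample))  # model-b comes first, since it holds rank 1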
src/display/utils.py CHANGED

@@ -63,7 +63,7 @@ auto_eval_column_dict.append(["score", ColumnContent, field(default_factory=lamb
 auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
 auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])

-# fine-
+# fine-grained dimensions
 auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
 auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
 auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])

@@ -85,6 +85,9 @@ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(defa
 auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])

+auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
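Each of the three new chemistry entries follows the existing auto_eval_column_dict convention: a [attribute_name, ColumnContent, field(...)] triple whose ColumnContent carries the display header. A minimal sketch of that pattern is below, assuming (as in the common leaderboard template) that the list is later turned into an AutoEvalColumn dataclass with make_dataclass; the ColumnContent fields shown here are simplified.

    # Sketch of the auto_eval_column_dict pattern the chemistry columns plug into.
    # ColumnContent is simplified, and the make_dataclass conversion is assumed
    # to mirror the standard leaderboard template.
    from dataclasses import dataclass, field, make_dataclass

    @dataclass
    class ColumnContent:
        name: str                    # header shown in the table
        type: str                    # gradio column type, e.g. "number"
        displayed_by_default: bool

    auto_eval_column_dict = []
    auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
    auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
    auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

    AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
    print(AutoEvalColumn().score_chemistry.name)   # -> "Score (Chemistry)"

This is why the rest of the change can refer to the new columns as AutoEvalColumn.score_chemistry.name, AutoEvalColumn.sd_chemistry.name, and AutoEvalColumn.rank_chemistry.name.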
src/leaderboard/read_evals.py CHANGED

@@ -115,7 +115,7 @@ class ModelResult:
                 new_v[kk] = vv

             new_results[k] = new_v
-
+
         # Extract results available in this file (some results are split in several files)
         # results = {}
         # for domain in Domains:

@@ -185,6 +185,10 @@ class ModelResult:
            AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
            AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),

+           AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
+           AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
+           AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
+
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.organization.name: self.org,
            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
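The chemistry lookups in read_evals.py are guarded: self.results.get("Chemistry") can return None for result files that predate the chemistry benchmark, so each field falls back to None through a conditional expression instead of raising AttributeError. A small standalone sketch of that guard, with the results-dict shape inferred from the keys used in the diff and the sample dicts invented:

    # Guarded lookups for the new chemistry fields: fetch the "Chemistry" block
    # first, and only read its metrics if it exists. Keys match the diff; the
    # sample dicts are illustrative.
    def chemistry_fields(results: dict) -> dict:
        chem = results.get("Chemistry")
        return {
            "Score (Chemistry)": chem.get("Average Score", None) if chem else None,
            "Std dev (Chemistry)": chem.get("Standard Deviation", None) if chem else None,
            "Rank (Chemistry)": chem.get("Rank", None) if chem else None,
        }

    with_chem = {"Chemistry": {"Average Score": 71.3, "Standard Deviation": 1.2, "Rank": 4}}
    without_chem = {"Logical": {"Rank": 7}}
    print(chemistry_fields(with_chem))     # real values
    print(chemistry_fields(without_chem))  # all None, no AttributeError

Note that the neighbouring lookups such as self.results.get("Logical").get("Rank", None) are not guarded the same way and would still fail if that domain were missing from a results file.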
src/populate.py CHANGED

@@ -24,6 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
+        # print(rank_col)
     else: # when rank_col is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1) # we'll skip NaN, instrad of deleting the whole row
         df["Average Rank"] = avg_rank

@@ -37,7 +38,10 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         # print(col)
         # if 'Std dev' in col or 'Score' in col:
         if 'Std dev' in col or 'Score' in col:
-
+            if "Chemistry" in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+            else:
+                df[col] = (df[col]*100).map('{:.2f}'.format)
             # df[col] = df[col].round(decimals=2)

     # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
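The formatting branch added in populate.py treats chemistry differently from the other metrics: the existing Score and Std dev columns are stored as fractions and scaled by 100 before formatting, while the chemistry values are formatted as-is, which implies they already arrive on a 0-100 scale. A short sketch of the effect, with invented sample data:

    # Per-column formatting rule from populate.py: most Score / Std dev columns
    # are fractions and get scaled by 100, but Chemistry columns are only
    # rounded to two decimals. Sample data is invented for illustration.
    import pandas as pd

    df = pd.DataFrame({
        "Score (Math Algebra)": [0.8123, 0.4567],   # stored as fractions
        "Score (Chemistry)": [71.25, 45.6],          # assumed already 0-100
    })

    for col in df.columns:
        if 'Std dev' in col or 'Score' in col:
            if "Chemistry" in col:
                df[col] = df[col].map('{:.2f}'.format)
            else:
                df[col] = (df[col] * 100).map('{:.2f}'.format)

    print(df)  # Algebra -> "81.23", "45.67"; Chemistry -> "71.25", "45.60"

Note that this step converts the columns to strings, so any numeric sorting has to happen beforehand, which is why the sort by rank_col runs earlier in get_model_leaderboard_df.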