yzabc007 committed
Commit 74f7f31
1 Parent(s): 9f4c149

Update space

app.py CHANGED
@@ -102,7 +102,8 @@ def init_leaderboard(dataframe):
 # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
-model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+# model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
+model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
@@ -312,7 +313,6 @@ with demo:
 
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
            CURRENT_TEXT = """
-            # Coming soon!
            We are working on adding more tasks in scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
            We have diversely and aggressively collected recent science datasets, including but not limited to
            [GPQA](https://arxiv.org/abs/2311.12022),
@@ -323,7 +323,22 @@ with demo:
            [SciEval](https://arxiv.org/abs/2308.13149).
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
+            with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.rank_chemistry.name,
+                            AutoEvalColumn.model.name,
+                            AutoEvalColumn.score_chemistry.name,
+                            # AutoEvalColumn.sd_reason_social.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                        ],
+                        rank_col=[AutoEvalColumn.rank_chemistry.name],
+                    )
+                )
 
        with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
            CURRENT_TEXT = """
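For context, the new Chemistry subtab only renders rows for models whose results carry a "Chemistry" block, because `get_model_leaderboard_df` drops rows with missing benchmark columns when `rank_col` is set. A minimal sketch of that block, with key names taken from the read_evals.py change further down and placeholder values:

```python
# Hypothetical per-model entry; key names mirror read_evals.py below, numbers are placeholders.
example_model_result = {
    "Chemistry": {
        "Average Score": 52.31,      # surfaced as "Score (Chemistry)"
        "Standard Deviation": 1.24,  # surfaced as "Std dev (Chemistry)"
        "Rank": 4,                   # surfaced as "Rank (Chemistry)"
    },
    # other domains ("Logical", "Social", ...) keep their existing blocks
}
```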
src/display/utils.py CHANGED
@@ -63,7 +63,7 @@ auto_eval_column_dict.append(["score", ColumnContent, field(default_factory=lamb
 auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
 auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
 
-# fine-graine dimensions
+# fine-grained dimensions
 auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
 auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
 auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
@@ -85,6 +85,9 @@ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(defa
 auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
 
+auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
+auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
 
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
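For readers unfamiliar with the leaderboard template, each appended entry wraps a `ColumnContent` value. A rough sketch of that container as it typically appears in the standard template (field names assumed here, not part of this diff), showing what the `("Score (Chemistry)", "number", True)` arguments map to:

```python
from dataclasses import dataclass

# Sketch of the ColumnContent container assumed by the appends above (not part of this commit);
# the positional arguments map to name, type, displayed_by_default.
@dataclass
class ColumnContent:
    name: str                          # e.g. "Score (Chemistry)"
    type: str                          # Gradio column type, e.g. "number"
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False
```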
src/leaderboard/read_evals.py CHANGED
@@ -115,7 +115,7 @@ class ModelResult:
                new_v[kk] = vv
 
            new_results[k] = new_v
-
+
        # Extract results available in this file (some results are split in several files)
        # results = {}
        # for domain in Domains:
@@ -185,6 +185,10 @@ class ModelResult:
            AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
            AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),
 
+            AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
+            AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
+            AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
+
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.organization.name: self.org,
            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
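The three added lines repeat the same guarded two-level lookup. A hypothetical helper (not part of this commit) that expresses the pattern once, should the same guard later be wanted for the other domains:

```python
def get_domain_metric(results: dict, domain: str, metric: str):
    """Return results[domain][metric] if the domain block exists, else None."""
    block = results.get(domain)
    return block.get(metric, None) if block else None

# e.g. get_domain_metric(self.results, "Chemistry", "Average Score")
```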
src/populate.py CHANGED
@@ -24,6 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
+        # print(rank_col)
     else: # when rank_col is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1) # we'll skip NaN, instead of deleting the whole row
         df["Average Rank"] = avg_rank
@@ -37,7 +38,10 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         # print(col)
         # if 'Std dev' in col or 'Score' in col:
         if 'Std dev' in col or 'Score' in col:
-            df[col] = (df[col]*100).map('{:.2f}'.format)
+            if "Chemistry" in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+            else:
+                df[col] = (df[col]*100).map('{:.2f}'.format)
             # df[col] = df[col].round(decimals=2)
 
     # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
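The new branch implies that Chemistry scores arrive already on a 0–100 scale, while the other score columns remain on a 0–1 scale and are rescaled. A small self-contained sketch of the formatting behaviour, with invented values:

```python
import pandas as pd

# Invented example: "Score (Math Algebra)" is on a 0-1 scale and gets rescaled to a
# percentage; "Score (Chemistry)" is assumed to already be 0-100 and is only formatted.
df = pd.DataFrame({
    "Score (Math Algebra)": [0.8123, 0.7541],
    "Score (Chemistry)": [81.23, 75.41],
})
for col in df.columns:
    if 'Std dev' in col or 'Score' in col:
        if "Chemistry" in col:
            df[col] = df[col].map('{:.2f}'.format)
        else:
            df[col] = (df[col] * 100).map('{:.2f}'.format)
print(df)  # both columns print as two-decimal strings, e.g. "81.23"
```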