yzabc007 committed
Commit 5db0911
1 Parent(s): 190ad0c

Update space

Files changed (2):
  1. app.py +16 -22
  2. src/populate.py +5 -18
app.py CHANGED

@@ -119,26 +119,7 @@ def overall_leaderboard(dataframe):
         interactive=False,
     )
 
-
-
-def overview_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=None,
-        search_columns=SearchColumns(primary_column=AutoEvalColumn.model.name, secondary_columns=[],
-                                     placeholder="Search by the model name",
-                                     label="Searching"),
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=None,
-        interactive=False,
-    )
-
 
-
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -148,10 +129,23 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-            # leaderboard = overview_leaderboard(model_leaderboard_df)
+            leaderboard = overall_leaderboard(
+                get_model_leaderboard_df(
+                    model_result_path,
+                    benchmark_cols=[
+                        # AutoEvalColumn.rank_overall.name,
+                        AutoEvalColumn.model.name,
+                        AutoEvalColumn.rank_overall.name,
+                        AutoEvalColumn.rank_math_algebra.name,
+                        AutoEvalColumn.rank_math_geometry.name,
+                        AutoEvalColumn.rank_math_probability.name,
+                        AutoEvalColumn.rank_reason_logical.name,
+                        AutoEvalColumn.rank_reason_social.name,
+                    ],
+                    rank_col=[],
+                )
+            )
 
-
         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
 
             leaderboard = overall_leaderboard(
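The `overall_leaderboard` helper that the Overview tab now calls is only partially visible in these hunks, but the deleted `overview_leaderboard` was a near-identical wrapper, so it indicates the likely shape. Below is a sketch of that pattern for reference; the `gradio_leaderboard` imports are the ones this component family actually ships, while the `src.display.utils` path is an assumption based on the identifiers in the diff, not verified against the rest of the repo.

```python
# Sketch only: reconstructed from the deleted overview_leaderboard, on the
# assumption that overall_leaderboard follows the same pattern.
from gradio_leaderboard import Leaderboard, SearchColumns

from src.display.utils import AutoEvalColumn, fields  # assumed import path

def overall_leaderboard(dataframe):
    # Refuse to render an empty board rather than showing a blank table.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    return Leaderboard(
        value=dataframe,
        # One display type per column, driven by the AutoEvalColumn schema.
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=None,
        search_columns=SearchColumns(
            primary_column=AutoEvalColumn.model.name,
            secondary_columns=[],
            placeholder="Search by the model name",
            label="Searching",
        ),
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=None,
        interactive=False,  # read-only table
    )
```

Passing `select_columns=None` and `filter_columns=None` disables the column picker and filter widgets, so the resulting tab is a fixed, search-only table.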
src/populate.py CHANGED

@@ -9,24 +9,6 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
 
 
 
-# def get_overview_leaderboard_df(results_path: str) -> pd.DataFrame:
-#     """Creates a dataframe from all the individual experiment results"""
-#     raw_data = get_raw_eval_results(results_path, requests_path)
-#     all_data_json = [v.to_dict() for v in raw_data]
-
-#     df = pd.DataFrame.from_records(all_data_json)
-#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-#     for col in cols:
-#         if col not in df.columns:
-#             df[col] = None
-#         else:
-#             df[col] = df[col].round(decimals=2)
-
-#     # filter out if any of the benchmarks have not been produced
-#     df = df[has_no_nan_values(df, benchmark_cols)]
-#     return df
-
-
 
 def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[], rank_col: list=[]) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -40,6 +22,11 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[], rank_col: list=[]) -> pd.DataFrame:
 
     if rank_col:
         df = df.sort_values(by=[rank_col[0]], ascending=True)
+    else:  # when rank_col is empty, sort by averaging all the benchmark columns except the first one
+        avg_rank = df.iloc[:, 1:].mean(axis=1)
+        df["Average Rank"] = avg_rank
+        df = df.sort_values(by=["Average Rank"], ascending=True)
+
 
     # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
     # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")