Update space

Files changed:
- app.py (+16 -22)
- src/populate.py (+5 -18)
app.py CHANGED

@@ -119,26 +119,7 @@ def overall_leaderboard(dataframe):
         interactive=False,
     )
 
-
-
-def overview_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=None,
-        search_columns=SearchColumns(primary_column=AutoEvalColumn.model.name, secondary_columns=[],
-                                     placeholder="Search by the model name",
-                                     label="Searching"),
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=None,
-        interactive=False,
-    )
-
 
-
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -148,10 +129,23 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = …
-                …
+            leaderboard = overall_leaderboard(
+                get_model_leaderboard_df(
+                    model_result_path,
+                    benchmark_cols=[
+                        # AutoEvalColumn.rank_overall.name,
+                        AutoEvalColumn.model.name,
+                        AutoEvalColumn.rank_overall.name,
+                        AutoEvalColumn.rank_math_algebra.name,
+                        AutoEvalColumn.rank_math_geometry.name,
+                        AutoEvalColumn.rank_math_probability.name,
+                        AutoEvalColumn.rank_reason_logical.name,
+                        AutoEvalColumn.rank_reason_social.name,
+                    ],
+                    rank_col=[],
+                )
+            )
 
-
        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
 
             leaderboard = overall_leaderboard(
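With overview_leaderboard removed, the 🏅 Overview tab now goes through the same overall_leaderboard helper as the other tabs, passing rank_col=[] so the table is ordered by the average-rank fallback added in src/populate.py below. The body of overall_leaderboard is not shown in this diff; as a reference point, here is a minimal sketch of what it presumably looks like, reconstructed from the deleted overview_leaderboard above and the surviving "interactive=False, )" context lines. The import paths are assumptions based on the usual Hugging Face leaderboard template layout.

from dataclasses import fields
from gradio_leaderboard import Leaderboard, SearchColumns  # assumed import, as in the leaderboard template
from src.display.utils import AutoEvalColumn               # assumed import path

def overall_leaderboard(dataframe):
    # Guard against an empty result set before handing it to the component.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    return Leaderboard(
        value=dataframe,                                    # ranks table built by get_model_leaderboard_df
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=None,
        search_columns=SearchColumns(
            primary_column=AutoEvalColumn.model.name,       # search by model name only
            secondary_columns=[],
            placeholder="Search by the model name",
            label="Searching",
        ),
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=None,
        interactive=False,                                  # read-only table
    )

If the two helpers really were near-identical, as the deleted code suggests, dropping overview_leaderboard is a straightforward de-duplication; the only behavioural change for the Overview tab comes from the new rank_col=[] path in src/populate.py.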
src/populate.py CHANGED

@@ -9,24 +9,6 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_resul
 
 
 
-# def get_overview_leaderboard_df(results_path: str) -> pd.DataFrame:
-#     """Creates a dataframe from all the individual experiment results"""
-#     raw_data = get_raw_eval_results(results_path, requests_path)
-#     all_data_json = [v.to_dict() for v in raw_data]
-
-#     df = pd.DataFrame.from_records(all_data_json)
-#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-#     for col in cols:
-#         if col not in df.columns:
-#             df[col] = None
-#         else:
-#             df[col] = df[col].round(decimals=2)
-
-#     # filter out if any of the benchmarks have not been produced
-#     df = df[has_no_nan_values(df, benchmark_cols)]
-#     return df
-
-
 
 def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[], rank_col: list=[]) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -40,6 +22,11 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
 
     if rank_col:
         df = df.sort_values(by=[rank_col[0]], ascending=True)
+    else: # when rank_col is empty, sort by averaging all the benchmarks, except the first one
+        avg_rank = df.iloc[:, 1:].mean(axis=1)
+        df["Average Rank"] = avg_rank
+        df = df.sort_values(by=["Average Rank"], ascending=True)
+
 
     # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
     # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")