Spaces:
Running
Running
fix filter bug
Browse files- app.py +47 -43
- gen_table.py +44 -36
app.py
CHANGED
@@ -7,32 +7,32 @@ from meta_data import *
|
|
7 |
# import pandas as pd
|
8 |
# pd.set_option('display.max_colwidth', 0)
|
9 |
|
10 |
-
head_style = """
|
11 |
-
<style>
|
12 |
-
@media (min-width: 1536px)
|
13 |
-
{
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
}
|
18 |
-
|
19 |
-
/* 添加复选框样式 */
|
20 |
-
.gr-checkbox {
|
21 |
-
|
22 |
-
}
|
23 |
-
|
24 |
-
.gr-checkbox-group label input[type="checkbox"] {
|
25 |
-
|
26 |
-
}
|
27 |
-
|
28 |
-
.gr-checkbox-group input[type="checkbox"]:checked {
|
29 |
-
|
30 |
-
|
31 |
-
}
|
32 |
-
</style>
|
33 |
-
"""
|
34 |
-
|
35 |
-
with gr.Blocks(title="Open Agent Leaderboard"
|
36 |
struct = load_results(OVERALL_MATH_SCORE_FILE)
|
37 |
timestamp = struct['time']
|
38 |
EVAL_TIME = format_timestamp(timestamp)
|
@@ -51,7 +51,7 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
51 |
with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
|
52 |
gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
|
53 |
check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
|
54 |
-
|
55 |
|
56 |
type_map = check_box['type_map']
|
57 |
type_map['Rank'] = 'number'
|
@@ -63,12 +63,11 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
63 |
interactive=True,
|
64 |
)
|
65 |
|
66 |
-
# 修改这里:确保初始显示的列都存在于表格中
|
67 |
initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
|
68 |
-
available_headers = [h for h in initial_headers if h in
|
69 |
-
|
70 |
data_component = gr.components.DataFrame(
|
71 |
-
value=
|
72 |
type='pandas',
|
73 |
datatype=[type_map[x] for x in available_headers],
|
74 |
interactive=False,
|
@@ -77,17 +76,21 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
77 |
|
78 |
def filter_df(fields, *args):
|
79 |
headers = ['Rank'] + check_box['essential'] + fields
|
80 |
-
df =
|
81 |
-
|
82 |
# 确保所有请求的列都存在
|
83 |
-
available_headers = [h for h in headers if h in
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# 如果没有可用的列,返回一个带有基本列的空DataFrame
|
86 |
if not available_headers:
|
87 |
available_headers = ['Rank'] + check_box['essential']
|
88 |
|
89 |
comp = gr.components.DataFrame(
|
90 |
-
value=
|
91 |
type='pandas',
|
92 |
datatype=[type_map[x] for x in available_headers],
|
93 |
interactive=False,
|
@@ -95,7 +98,6 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
95 |
visible=True)
|
96 |
return comp
|
97 |
|
98 |
-
# checkbox_group的change事件只需要传入checkbox_group
|
99 |
checkbox_group.change(
|
100 |
fn=filter_df,
|
101 |
inputs=[checkbox_group],
|
@@ -111,7 +113,6 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
111 |
|
112 |
table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
|
113 |
# table = generate_table_detail(results_detail, DEFAULT_MATH_BENCH)
|
114 |
-
|
115 |
type_map = check_box['type_map']
|
116 |
type_map['Rank'] = 'number'
|
117 |
|
@@ -154,8 +155,8 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
154 |
wrap=True,
|
155 |
visible=True)
|
156 |
|
157 |
-
def
|
158 |
-
headers =
|
159 |
df = table.copy()
|
160 |
|
161 |
# 过滤数据
|
@@ -176,6 +177,9 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
176 |
# 确保排名为整数
|
177 |
df['Rank'] = df['Rank'].astype(int)
|
178 |
|
|
|
|
|
|
|
179 |
comp = gr.components.DataFrame(
|
180 |
value=df[headers],
|
181 |
type='pandas',
|
@@ -187,25 +191,25 @@ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
|
|
187 |
|
188 |
# 为所有复选框组添加change事件
|
189 |
checkbox_group.change(
|
190 |
-
fn=
|
191 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
192 |
outputs=data_component
|
193 |
)
|
194 |
|
195 |
algo_name.change(
|
196 |
-
fn=
|
197 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
198 |
outputs=data_component
|
199 |
)
|
200 |
|
201 |
dataset_name.change(
|
202 |
-
fn=
|
203 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
204 |
outputs=data_component
|
205 |
)
|
206 |
|
207 |
llm_name.change(
|
208 |
-
fn=
|
209 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
210 |
outputs=data_component
|
211 |
)
|
|
|
7 |
# import pandas as pd
|
8 |
# pd.set_option('display.max_colwidth', 0)
|
9 |
|
10 |
+
# head_style = """
|
11 |
+
# <style>
|
12 |
+
# @media (min-width: 1536px)
|
13 |
+
# {
|
14 |
+
# .gradio-container {
|
15 |
+
# min-width: var(--size-full) !important;
|
16 |
+
# }
|
17 |
+
# }
|
18 |
+
|
19 |
+
# /* 添加复选框样式 */
|
20 |
+
# .gr-checkbox {
|
21 |
+
# accent-color: rgb(59, 130, 246) !important; /* 蓝色 */
|
22 |
+
# }
|
23 |
+
|
24 |
+
# .gr-checkbox-group label input[type="checkbox"] {
|
25 |
+
# accent-color: rgb(59, 130, 246) !important;
|
26 |
+
# }
|
27 |
+
|
28 |
+
# .gr-checkbox-group input[type="checkbox"]:checked {
|
29 |
+
# background-color: rgb(59, 130, 246) !important;
|
30 |
+
# border-color: rgb(59, 130, 246) !important;
|
31 |
+
# }
|
32 |
+
# </style>
|
33 |
+
# """
|
34 |
+
|
35 |
+
with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
36 |
struct = load_results(OVERALL_MATH_SCORE_FILE)
|
37 |
timestamp = struct['time']
|
38 |
EVAL_TIME = format_timestamp(timestamp)
|
|
|
51 |
with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
|
52 |
gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
|
53 |
check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
|
54 |
+
overall_table = generate_table(results, DEFAULT_MATH_BENCH)
|
55 |
|
56 |
type_map = check_box['type_map']
|
57 |
type_map['Rank'] = 'number'
|
|
|
63 |
interactive=True,
|
64 |
)
|
65 |
|
|
|
66 |
initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
|
67 |
+
available_headers = [h for h in initial_headers if h in overall_table.columns]
|
68 |
+
|
69 |
data_component = gr.components.DataFrame(
|
70 |
+
value=overall_table[available_headers],
|
71 |
type='pandas',
|
72 |
datatype=[type_map[x] for x in available_headers],
|
73 |
interactive=False,
|
|
|
76 |
|
77 |
def filter_df(fields, *args):
|
78 |
headers = ['Rank'] + check_box['essential'] + fields
|
79 |
+
# df = overall_table.copy()
|
80 |
+
|
81 |
# 确保所有请求的列都存在
|
82 |
+
available_headers = [h for h in headers if h in overall_table.columns]
|
83 |
+
|
84 |
+
original_columns = overall_table.columns.tolist()
|
85 |
+
available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
|
86 |
+
|
87 |
|
88 |
# 如果没有可用的列,返回一个带有基本列的空DataFrame
|
89 |
if not available_headers:
|
90 |
available_headers = ['Rank'] + check_box['essential']
|
91 |
|
92 |
comp = gr.components.DataFrame(
|
93 |
+
value=overall_table[available_headers],
|
94 |
type='pandas',
|
95 |
datatype=[type_map[x] for x in available_headers],
|
96 |
interactive=False,
|
|
|
98 |
visible=True)
|
99 |
return comp
|
100 |
|
|
|
101 |
checkbox_group.change(
|
102 |
fn=filter_df,
|
103 |
inputs=[checkbox_group],
|
|
|
113 |
|
114 |
table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
|
115 |
# table = generate_table_detail(results_detail, DEFAULT_MATH_BENCH)
|
|
|
116 |
type_map = check_box['type_map']
|
117 |
type_map['Rank'] = 'number'
|
118 |
|
|
|
155 |
wrap=True,
|
156 |
visible=True)
|
157 |
|
158 |
+
def filter_df2(fields, algos, datasets, llms):
|
159 |
+
headers = ['Rank'] + fields
|
160 |
df = table.copy()
|
161 |
|
162 |
# 过滤数据
|
|
|
177 |
# 确保排名为整数
|
178 |
df['Rank'] = df['Rank'].astype(int)
|
179 |
|
180 |
+
|
181 |
+
original_columns = df.columns.tolist()
|
182 |
+
headers = sorted(headers, key=lambda x: original_columns.index(x))
|
183 |
comp = gr.components.DataFrame(
|
184 |
value=df[headers],
|
185 |
type='pandas',
|
|
|
191 |
|
192 |
# 为所有复选框组添加change事件
|
193 |
checkbox_group.change(
|
194 |
+
fn=filter_df2,
|
195 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
196 |
outputs=data_component
|
197 |
)
|
198 |
|
199 |
algo_name.change(
|
200 |
+
fn=filter_df2,
|
201 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
202 |
outputs=data_component
|
203 |
)
|
204 |
|
205 |
dataset_name.change(
|
206 |
+
fn=filter_df2,
|
207 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
208 |
outputs=data_component
|
209 |
)
|
210 |
|
211 |
llm_name.change(
|
212 |
+
fn=filter_df2,
|
213 |
inputs=[checkbox_group, algo_name, dataset_name, llm_name],
|
214 |
outputs=data_component
|
215 |
)
|
gen_table.py
CHANGED
@@ -33,15 +33,33 @@ def nth_large(val, vals):
|
|
33 |
def BUILD_L1_DF(results, fields):
|
34 |
check_box = {}
|
35 |
check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
type_map = defaultdict(lambda: 'number')
|
42 |
type_map['Algorithm'] = 'html'
|
43 |
type_map['LLM'] = type_map['Vision Model'] = 'html'
|
44 |
type_map['Eval Date'] = 'str'
|
|
|
|
|
|
|
|
|
|
|
45 |
check_box['type_map'] = type_map
|
46 |
|
47 |
return check_box
|
@@ -115,58 +133,48 @@ def generate_table(results, fields):
|
|
115 |
for k in META_FIELDS:
|
116 |
res[k].append(meta[k])
|
117 |
scores, costs = [], []
|
|
|
|
|
118 |
for d in fields:
|
119 |
-
if d in item
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
124 |
else:
|
125 |
-
res[d
|
126 |
-
res[d
|
127 |
-
scores.append(None)
|
128 |
-
costs.append(None)
|
129 |
|
130 |
-
|
|
|
131 |
|
132 |
df = pd.DataFrame(res)
|
133 |
|
134 |
-
#
|
135 |
valid = df[~pd.isna(df['Avg Score'])].copy()
|
136 |
missing = df[pd.isna(df['Avg Score'])].copy()
|
137 |
|
138 |
-
# Assign rank to valid rows (using integer type)
|
139 |
valid = valid.sort_values('Avg Score', ascending=False)
|
140 |
-
valid['Rank'] =
|
141 |
|
142 |
-
# Assign last rank to missing rows (using integer type)
|
143 |
if not missing.empty:
|
144 |
-
missing['Rank'] =
|
145 |
|
146 |
-
# Merge and sort by Rank
|
147 |
df = pd.concat([valid, missing])
|
148 |
df = df.sort_values('Rank')
|
149 |
|
150 |
-
#
|
151 |
-
columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
|
152 |
for d in fields:
|
153 |
-
columns.extend([f"{d}-Score", f"{d}-Cost($)"])
|
154 |
|
155 |
-
# Ensure all columns exist and reorder
|
156 |
existing_columns = [col for col in columns if col in df.columns]
|
157 |
-
|
158 |
-
df = df[existing_columns + remaining_columns] # Reorder columns
|
159 |
-
|
160 |
-
# Sort by Score in descending order
|
161 |
-
df = df.sort_values(['Avg Score'], ascending=[False])
|
162 |
|
163 |
-
# Add rank for each dataset separately
|
164 |
-
df['Rank'] = range(1, len(df) + 1)
|
165 |
-
|
166 |
-
# Rearrange column order
|
167 |
-
columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
|
168 |
-
remaining_columns = [col for col in df.columns if col not in columns]
|
169 |
-
df = df[columns + remaining_columns]
|
170 |
return df
|
171 |
|
172 |
|
|
|
33 |
def BUILD_L1_DF(results, fields):
    """Build the checkbox/column metadata for the overall leaderboard table.

    Args:
        results: Mapping whose values are per-entry containers; a benchmark
            in ``fields`` is offered only if it is present in the first
            entry. May be empty.
        fields: Iterable of benchmark names to consider.

    Returns:
        dict with the keys:
            'essential': columns that are always displayed.
            'required':  columns selected by default.
            'all':       every selectable column.
            'type_map':  defaultdict mapping column name to the datatype
                         string used for the table ('number' by default).
    """
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']

    # Inspect the first entry to learn which benchmarks actually have data.
    # An empty `results` yields no benchmark columns instead of raising
    # StopIteration.
    # NOTE(review): only the first entry is sampled; a benchmark present
    # solely in later entries is not offered -- confirm this is intended.
    sample_data = next(iter(results.values()), {})
    available_fields = [field for field in fields if field in sample_data]

    # Column names use the "<benchmark>-Score" / "<benchmark>-Cost($)" format.
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]

    # Stable sort by benchmark name keeps each Score column immediately
    # before its Cost column. Split on the LAST hyphen so benchmark names
    # that themselves contain '-' (e.g. "MATH-500") stay grouped correctly.
    combined_columns_sorted = sorted(
        score_columns + cost_columns,
        key=lambda column: column.rsplit('-', 1)[0],
    )

    selectable_columns = ['Avg Score'] + combined_columns_sorted
    # Separate list objects so mutating one entry never affects the other.
    check_box['required'] = list(selectable_columns)
    check_box['all'] = list(selectable_columns)

    # Unknown columns default to 'number'; only non-numeric ones need
    # explicit overrides.
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    # Register every offered column explicitly rather than hard-coding a
    # fixed set of benchmark names.
    for column in selectable_columns:
        type_map[column] = 'number'
    check_box['type_map'] = type_map

    return check_box
|
|
|
133 |
for k in META_FIELDS:
|
134 |
res[k].append(meta[k])
|
135 |
scores, costs = [], []
|
136 |
+
|
137 |
+
# 确保列名格式与BUILD_L1_DF中的一致
|
138 |
for d in fields:
|
139 |
+
if d in item:
|
140 |
+
score = item[d].get("Score")
|
141 |
+
cost = item[d].get("Cost($)")
|
142 |
+
res[f"{d}-Score"].append(score)
|
143 |
+
res[f"{d}-Cost($)"].append(cost)
|
144 |
+
if score is not None:
|
145 |
+
scores.append(score)
|
146 |
+
if cost is not None:
|
147 |
+
costs.append(cost)
|
148 |
else:
|
149 |
+
res[f"{d}-Score"].append(None)
|
150 |
+
res[f"{d}-Cost($)"].append(None)
|
|
|
|
|
151 |
|
152 |
+
# 计算平均分
|
153 |
+
res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
|
154 |
|
155 |
df = pd.DataFrame(res)
|
156 |
|
157 |
+
# 排序和排名逻辑保持不变
|
158 |
valid = df[~pd.isna(df['Avg Score'])].copy()
|
159 |
missing = df[pd.isna(df['Avg Score'])].copy()
|
160 |
|
|
|
161 |
valid = valid.sort_values('Avg Score', ascending=False)
|
162 |
+
valid['Rank'] = range(1, len(valid) + 1)
|
163 |
|
|
|
164 |
if not missing.empty:
|
165 |
+
missing['Rank'] = len(valid) + 1
|
166 |
|
|
|
167 |
df = pd.concat([valid, missing])
|
168 |
df = df.sort_values('Rank')
|
169 |
|
170 |
+
# 重新排列列顺序
|
171 |
+
columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
|
172 |
for d in fields:
|
173 |
+
columns.extend([f"{d}-Score", f"{d}-Cost($)"])
|
174 |
|
|
|
175 |
existing_columns = [col for col in columns if col in df.columns]
|
176 |
+
df = df[existing_columns]
|
|
|
|
|
|
|
|
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
return df
|
179 |
|
180 |
|