Commit: clear typo
Files changed:
- app.py (+8, -8)
- gen_table.py (+5, -5)
- meta_data.py (+7, -0)
- src/detail_math_score.json (+3, -3)
app.py (CHANGED)

@@ -16,9 +16,9 @@ from meta_data import *
 # }
 # }
 
-# /*
+# /* Add checkbox styles */
 # .gr-checkbox {
-#     accent-color: rgb(59, 130, 246) !important; /*
+#     accent-color: rgb(59, 130, 246) !important; /* blue */
 # }
 
 # .gr-checkbox-group label input[type="checkbox"] {
@@ -78,14 +78,14 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     headers = ['Rank'] + check_box['essential'] + fields
     # df = overall_table.copy()
 
-    #
+    # Ensure all requested columns exist
     available_headers = [h for h in headers if h in overall_table.columns]
 
     original_columns = overall_table.columns.tolist()
     available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
 
 
-    #
+    # If no columns are available, return an empty DataFrame with basic columns
     if not available_headers:
         available_headers = ['Rank'] + check_box['essential']
 
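For context, the header-filtering step touched by this hunk can be exercised on its own. The sketch below is illustrative only: the sample overall_table, check_box, and fields values are made up, and only the filtering pattern mirrors the diff.

```python
import pandas as pd

# Hypothetical table; the real overall_table is built elsewhere in the app.
overall_table = pd.DataFrame(columns=["Rank", "Algorithm", "LLM", "Eval Date", "Avg Score"])
check_box = {"essential": ["Algorithm", "LLM", "Eval Date"]}
fields = ["gsm8k", "AQuA"]

headers = ["Rank"] + check_box["essential"] + fields

# Keep only requested columns that actually exist in the table ...
available_headers = [h for h in headers if h in overall_table.columns]

# ... and preserve the table's original column order.
original_columns = overall_table.columns.tolist()
available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))

# Fall back to the essential columns if nothing matched.
if not available_headers:
    available_headers = ["Rank"] + check_box["essential"]

print(available_headers)  # ['Rank', 'Algorithm', 'LLM', 'Eval Date']
```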
@@ -159,7 +159,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     headers = ['Rank'] + fields
     df = table.copy()
 
-    #
+    # Filter data
     df['flag'] = df.apply(lambda row: (
         row['Algorithm'] in algos and
         row['Dataset'] in datasets and
@@ -169,12 +169,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     df = df[df['flag']].copy()
     df.pop('flag')
 
-    #
+    # Group by dataset and calculate ranking within each group based on Score
     if 'Score' in df.columns:
-        #
+        # Create a temporary ranking column
         df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
 
-        #
+        # Ensure ranking is integer
         df['Rank'] = df['Rank'].astype(int)
 
 
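The ranking hunk relies on pandas' grouped rank. A minimal standalone sketch with made-up rows (the real table comes from generate_table in gen_table.py):

```python
import pandas as pd

# Toy leaderboard rows; values are illustrative, not taken from the repository.
df = pd.DataFrame({
    "Algorithm": ["algo-a", "algo-b", "algo-c"],
    "Dataset":   ["gsm8k", "gsm8k", "AQuA"],
    "Score":     [78.7, 74.9, 38.9],
})

# Rank within each dataset by descending Score; method='first' breaks ties by
# row order, and the float result is cast back to int for display.
df["Rank"] = (
    df.groupby("Dataset")["Score"]
      .rank(method="first", ascending=False)
      .astype(int)
)
print(df.sort_values(["Dataset", "Rank"]))
```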
gen_table.py (CHANGED)

@@ -34,14 +34,14 @@ def BUILD_L1_DF(results, fields):
     check_box = {}
     check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
 
-    #
+    # First check which columns exist in the actual data structure
     sample_data = next(iter(results.values()))
     available_fields = []
     for field in fields:
         if field in sample_data:
             available_fields.append(field)
 
-    #
+    # Build column names, ensure they match exactly with those in generate_table function
     score_columns = [f"{field}-Score" for field in available_fields]
     cost_columns = [f"{field}-Cost($)" for field in available_fields]
 
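The new comment stresses that these column names must line up with the ones generate_table emits. A small illustrative sketch of how they are derived; the fields and sample_data values here are hypothetical:

```python
# Illustrative only: shows how per-benchmark column names are built so that
# BUILD_L1_DF and generate_table agree on the exact strings.
fields = ["gsm8k", "AQuA", "MATH-500"]      # hypothetical benchmark list
sample_data = {"gsm8k": {}, "AQuA": {}}     # hypothetical result entry

available_fields = [f for f in fields if f in sample_data]

score_columns = [f"{field}-Score" for field in available_fields]
cost_columns = [f"{field}-Cost($)" for field in available_fields]

print(score_columns)  # ['gsm8k-Score', 'AQuA-Score']
print(cost_columns)   # ['gsm8k-Cost($)', 'AQuA-Cost($)']
```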
@@ -134,7 +134,7 @@ def generate_table(results, fields):
             res[k].append(meta[k])
         scores, costs = [], []
 
-        #
+        # Ensure column names format matches with BUILD_L1_DF
         for d in fields:
             if d in item:
                 score = item[d].get("Score")
@@ -149,12 +149,12 @@ def generate_table(results, fields):
                 res[f"{d}-Score"].append(None)
                 res[f"{d}-Cost($)"].append(None)
 
-        #
+        # Calculate average score
        res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
 
     df = pd.DataFrame(res)
 
-    #
+    # Sorting and ranking logic remains unchanged
     valid = df[~pd.isna(df['Avg Score'])].copy()
     missing = df[pd.isna(df['Avg Score'])].copy()
 
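A brief sketch of the averaging and the valid/missing split referenced above, with made-up numbers. The only assumption is that rows lacking an Avg Score are meant to be handled separately from the ranked rows:

```python
import numpy as np
import pandas as pd

# Hypothetical per-benchmark scores collected for one algorithm/LLM pair.
scores = [80.5, 61.8]
avg = round(np.mean(scores), 2) if scores else None  # None when nothing was parsed

df = pd.DataFrame({
    "Algorithm": ["algo-a", "algo-b", "algo-c"],
    "Avg Score": [avg, None, 64.0],
})

# Split rows with and without an average so they can be treated separately.
valid = df[~pd.isna(df["Avg Score"])].copy()
missing = df[pd.isna(df["Avg Score"])].copy()

print(valid["Algorithm"].tolist())    # ['algo-a', 'algo-c']
print(missing["Algorithm"].tolist())  # ['algo-b']
```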
meta_data.py (CHANGED)

@@ -26,12 +26,19 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
 - Cost: The cost on each math Benchmarks (the lower the better).
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
+- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 
 LEADERBOARD_MD['MATH_DETAIL'] = f"""
 ## Math task detail Evaluation Results
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
+- Metrics:
+    - Score: The evaluation score on each math Benchmarks (the higher the better).
+    - Pass rate: The percentage of response that are valid, where a response is valid if it is neither empty nor null.
+    - Cost: The cost on each math Benchmarks (the lower the better).
+    - Rank: The rank on each math Benchmarks (the lower the better).
+
 - default parameters: temperature=0.0
 - LLM prices:
     - gpt-3.5-turbo:
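These LEADERBOARD_MD entries are Python f-strings, so the benchmark list is interpolated when meta_data.py is imported. A minimal sketch, using a stand-in DEFAULT_MATH_BENCH rather than the repository's actual list:

```python
# Hypothetical benchmark list; the real DEFAULT_MATH_BENCH is defined in meta_data.py.
DEFAULT_MATH_BENCH = ["gsm8k", "AQuA"]

LEADERBOARD_MD = {}
LEADERBOARD_MD['MATH_MAIN'] = f"""
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository.
"""

print(LEADERBOARD_MD['MATH_MAIN'])
# ... based on gsm8k, AQuA, sorted by the descending order of Avg Score.
```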
src/detail_math_score.json (CHANGED)

@@ -273,10 +273,10 @@
             }
         }
     },
-    "ReAct-Pro": {
+    "ReAct-Pro*": {
         "gpt-3.5-turbo": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -309,7 +309,7 @@
         },
         "Doubao-lite-32k": {
             "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
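A short sketch of reading back the renamed entry. It assumes "ReAct-Pro*" sits at the top level of the JSON, which the hunk suggests but does not confirm:

```python
import json

# Load the detail results and pull the metadata for the renamed entry.
# The nesting below follows the diff; adjust if "ReAct-Pro*" is nested deeper.
with open("src/detail_math_score.json") as f:
    results = json.load(f)

meta = results["ReAct-Pro*"]["gpt-3.5-turbo"]["META"]
print(meta["Algorithm"], meta["LLM"], meta["Eval Date"])
# ReAct-Pro* gpt-3.5-turbo 2025/01/07
```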