qq-hzlh committed
Commit a7d1809 · 1 Parent(s): e90e797

clear typo

Files changed (4)
  1. app.py +8 -8
  2. gen_table.py +5 -5
  3. meta_data.py +7 -0
  4. src/detail_math_score.json +3 -3
app.py CHANGED
@@ -16,9 +16,9 @@ from meta_data import *
 # }
 # }
 
-# /* 添加复选框样式 */
+# /* Add checkbox styles */
 # .gr-checkbox {
-#     accent-color: rgb(59, 130, 246) !important; /* 蓝色 */
+#     accent-color: rgb(59, 130, 246) !important; /* blue */
 # }
 
 # .gr-checkbox-group label input[type="checkbox"] {
@@ -78,14 +78,14 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     headers = ['Rank'] + check_box['essential'] + fields
     # df = overall_table.copy()
 
-    # 确保所有请求的列都存在
+    # Ensure all requested columns exist
     available_headers = [h for h in headers if h in overall_table.columns]
 
     original_columns = overall_table.columns.tolist()
     available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
 
 
-    # 如果没有可用的列,返回一个带有基本列的空DataFrame
+    # If no columns are available, return an empty DataFrame with basic columns
     if not available_headers:
         available_headers = ['Rank'] + check_box['essential']
 
@@ -159,7 +159,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     headers = ['Rank'] + fields
     df = table.copy()
 
-    # 过滤数据
+    # Filter data
     df['flag'] = df.apply(lambda row: (
         row['Algorithm'] in algos and
         row['Dataset'] in datasets and
@@ -169,12 +169,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
     df = df[df['flag']].copy()
     df.pop('flag')
 
-    # 按数据集分组,在每个组内根据Score排序并计算排名
+    # Group by dataset and calculate ranking within each group based on Score
     if 'Score' in df.columns:
-        # 创建一个临时的排名列
+        # Create a temporary ranking column
         df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
 
-        # 确保排名为整数
+        # Ensure ranking is integer
         df['Rank'] = df['Rank'].astype(int)
 
 
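Note on the ranking hunk above: a minimal, self-contained sketch of the pandas pattern it relies on, using toy rows (illustrative values only, not leaderboard data). rank(method='first', ascending=False) gives the best Score inside each Dataset group rank 1 and returns floats, which is why the astype(int) cast follows.

import pandas as pd

# Toy rows in the same shape as the filtered leaderboard table (values are made up).
df = pd.DataFrame({
    "Algorithm": ["CoT", "ReAct-Pro*", "CoT", "ReAct-Pro*"],
    "Dataset": ["gsm8k", "AQuA", "gsm8k", "AQuA"],
    "Score": [78.5, 60.1, 81.2, 64.3],
})

# Rank within each dataset: highest Score is rank 1; method="first" breaks ties by row
# order so ranks stay unique, and astype(int) converts the float ranks pandas returns.
df["Rank"] = df.groupby("Dataset")["Score"].rank(method="first", ascending=False).astype(int)

print(df.sort_values(["Dataset", "Rank"]))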
gen_table.py CHANGED
@@ -34,14 +34,14 @@ def BUILD_L1_DF(results, fields):
     check_box = {}
     check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
 
-    # 首先检查实际的数据结构中有哪些列
+    # First check which columns exist in the actual data structure
     sample_data = next(iter(results.values()))
     available_fields = []
     for field in fields:
         if field in sample_data:
             available_fields.append(field)
 
-    # 构建列名,确保与generate_table函数中的列名完全一致
+    # Build column names, ensuring they exactly match those in the generate_table function
     score_columns = [f"{field}-Score" for field in available_fields]
     cost_columns = [f"{field}-Cost($)" for field in available_fields]
 
@@ -134,7 +134,7 @@ def generate_table(results, fields):
             res[k].append(meta[k])
         scores, costs = [], []
 
-        # 确保列名格式与BUILD_L1_DF中的一致
+        # Ensure the column name format matches BUILD_L1_DF
         for d in fields:
             if d in item:
                 score = item[d].get("Score")
@@ -149,12 +149,12 @@ def generate_table(results, fields):
                 res[f"{d}-Score"].append(None)
                 res[f"{d}-Cost($)"].append(None)
 
-        # 计算平均分
+        # Calculate the average score
         res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
 
     df = pd.DataFrame(res)
 
-    # 排序和排名逻辑保持不变
+    # Sorting and ranking logic remains unchanged
     valid = df[~pd.isna(df['Avg Score'])].copy()
     missing = df[pd.isna(df['Avg Score'])].copy()
 
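Note on the column-name convention the gen_table.py comments refer to: BUILD_L1_DF and generate_table have to emit identical "<field>-Score" / "<field>-Cost($)" names. A minimal sketch under that assumption; `fields` and `results` below are made-up stand-ins, not the real leaderboard data.

import numpy as np
import pandas as pd

fields = ["gsm8k", "AQuA"]  # hypothetical benchmark fields
results = {  # hypothetical per-algorithm entry, mimicking the detail JSON's shape
    "CoT": {"gsm8k": {"Score": 78.5, "Cost($)": 0.41}, "AQuA": {"Score": 60.1, "Cost($)": 0.22}},
}

# Column names are built once here and must match the f-strings used when rows are filled.
res = {f"{d}-Score": [] for d in fields}
res.update({f"{d}-Cost($)": [] for d in fields})
res["Avg Score"] = []

for algo, item in results.items():
    scores = []
    for d in fields:
        score = item[d].get("Score") if d in item else None
        cost = item[d].get("Cost($)") if d in item else None
        res[f"{d}-Score"].append(score)
        res[f"{d}-Cost($)"].append(cost)
        if score is not None:
            scores.append(score)
    # Average score across the benchmarks that actually reported a Score.
    res["Avg Score"].append(round(np.mean(scores), 2) if scores else None)

df = pd.DataFrame(res)
print(df)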
meta_data.py CHANGED
@@ -26,12 +26,19 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
 - Cost: The cost on each math Benchmarks (the lower the better).
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
+- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 
 LEADERBOARD_MD['MATH_DETAIL'] = f"""
 ## Math task detail Evaluation Results
 
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
+- Metrics:
+    - Score: The evaluation score on each math Benchmarks (the higher the better).
+    - Pass rate: The percentage of responses that are valid, where a response is valid if it is neither empty nor null.
+    - Cost: The cost on each math Benchmarks (the lower the better).
+    - Rank: The rank on each math Benchmarks (the lower the better).
+
 - default parameters: temperature=0.0
 - LLM prices:
     - gpt-3.5-turbo:
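For reference, the LEADERBOARD_MD entries are f-strings, so the {', '.join(DEFAULT_MATH_BENCH)} placeholders render to a comma-separated benchmark list at import time. A tiny sketch with placeholder benchmark names (not necessarily the real DEFAULT_MATH_BENCH contents):

# Placeholder values; the real list lives in meta_data.py.
DEFAULT_MATH_BENCH = ["gsm8k", "AQuA", "MATH-500"]

LEADERBOARD_MD = {}
LEADERBOARD_MD['MATH_MAIN'] = f"""
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
"""
print(LEADERBOARD_MD['MATH_MAIN'])
# -> ... based on gsm8k, AQuA, MATH-500, sorted by the descending order of Avg Score.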
src/detail_math_score.json CHANGED
@@ -273,10 +273,10 @@
             }
         }
     },
-    "ReAct-Pro": {
+    "ReAct-Pro*": {
        "gpt-3.5-turbo": {
            "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                "LLM": "gpt-3.5-turbo",
                "Eval Date": "2025/01/07"
            },
@@ -309,7 +309,7 @@
        },
        "Doubao-lite-32k": {
            "META": {
-                "Algorithm": "ReAct-Pro",
+                "Algorithm": "ReAct-Pro*",
                "LLM": "Doubao-lite-32k",
                "Eval Date": "2025/01/07"
            },
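Shape of the renamed entries, for anyone consuming the file: algorithm name -> LLM name -> "META" (plus per-benchmark blocks not shown in these hunks). A minimal read sketch; the relative path is an assumption taken from the file list above.

import json

# Load the detail scores and read the metadata of the renamed "ReAct-Pro*" entry.
with open("src/detail_math_score.json") as f:
    detail = json.load(f)

meta = detail["ReAct-Pro*"]["gpt-3.5-turbo"]["META"]
print(meta["Algorithm"], meta["LLM"], meta["Eval Date"])  # ReAct-Pro* gpt-3.5-turbo 2025/01/07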