open-agent-leaderboard / gen_table.py
qq-hzlh's picture
fix filter bug
e90e797
raw
history blame
7.49 kB
import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen
import gradio as gr
import numpy as np
import pandas as pd
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
def listinstr(lst, s):
    """Return True when at least one element of ``lst`` is contained in ``s``.

    Args:
        lst: List of candidates; anything other than a ``list`` trips the
            assertion below.
        s: Container (typically a string) tested with the ``in`` operator.

    Returns:
        bool: True if any candidate is found in ``s``, otherwise False.
    """
    assert isinstance(lst, list)
    return any(candidate in s for candidate in lst)
def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    """Load the leaderboard results from a JSON file.

    Args:
        file_name: Path of the JSON file to read. Defaults to
            ``OVERALL_MATH_SCORE_FILE``.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle deterministically; the explicit
    # encoding keeps decoding independent of the platform's locale default.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
def format_timestamp(timestamp):
    """Reformat a timestamp string as ``<date> <hh>:<mm>:<ss>``.

    The pieces are taken purely by position: characters 0-9 form the date
    part, and characters 11-12, 14-15 and 17-18 form the three time parts.

    Args:
        timestamp: Timestamp string to slice.

    Returns:
        str: The date part, a space, and the three time parts joined by ':'.
    """
    day = timestamp[:10]
    clock_parts = [timestamp[11:13], timestamp[14:16], timestamp[17:19]]
    return ' '.join([day, ':'.join(clock_parts)])
def nth_large(val, vals):
    """Return the 1-based rank of ``val`` among ``vals`` in descending order.

    Args:
        val: Value to rank.
        vals: Iterable of values to compare against.

    Returns:
        int: One plus the number of elements of ``vals`` strictly greater
        than ``val``.
    """
    rank = 1
    for other in vals:
        if other > val:
            rank += 1
    return rank
def BUILD_L1_DF(results, fields):
    """Build the column-selection configuration for the overview table.

    Args:
        results: Mapping whose values are per-entry dicts keyed by dataset
            name. Only the first value is inspected to decide which
            datasets are available.
        fields: Iterable of candidate dataset names.

    Returns:
        dict: Configuration with the keys
            ``'essential'`` - columns that are always shown,
            ``'required'`` / ``'all'`` - ``'Avg Score'`` followed by the
            ``<dataset>-Score`` / ``<dataset>-Cost($)`` columns grouped by
            dataset,
            ``'type_map'`` - ``defaultdict`` from column name to display
            type, defaulting to ``'number'``.
    """
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']

    # Check which of the requested datasets exist in the actual data. An
    # empty ``results`` yields no dataset columns instead of StopIteration.
    sample_data = next(iter(results.values()), {})
    available_fields = [field for field in fields if field in sample_data]

    # Column names must match the ones produced by generate_table exactly.
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]
    combined_columns = score_columns + cost_columns
    # Group by dataset name. Split on the LAST hyphen so that dataset names
    # which themselves contain '-' keep their Score/Cost columns adjacent;
    # the stable sort keeps Score ahead of Cost within each dataset.
    combined_columns_sorted = sorted(
        combined_columns, key=lambda col: col.rsplit('-', 1)[0]
    )

    check_box['required'] = ['Avg Score'] + combined_columns_sorted
    check_box['all'] = ['Avg Score'] + combined_columns_sorted

    # Any column not listed explicitly is treated as numeric.
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Avg Score'] = 'number'
    type_map['gsm8k-Score'] = 'number'
    type_map['AQuA-Score'] = 'number'
    type_map['gsm8k-Cost($)'] = 'number'
    type_map['AQuA-Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return check_box
def BUILD_L2_DF(results, fields):
    """Build the per-dataset detail table and its checkbox configuration.

    Args:
        results: Nested mapping ``{algorithm: {model: model_data}}`` where
            each ``model_data`` holds a ``'META'`` dict plus one dict of
            metrics per dataset name.
        fields: Iterable of dataset names to include.

    Returns:
        tuple: ``(df, check_box)``. ``df`` has one row per
        (algorithm, model, dataset) combination present in ``results``,
        sorted by dataset and descending score, with a per-dataset ``Rank``
        column. ``check_box`` is the column-selection configuration
        (``'essential'``, ``'required'``, ``'all'``, ``'type_map'``).

    Raises:
        KeyError: If a model entry has no ``'META'`` key.
    """
    leading_columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']

    # Collect one dict per record. Building rows (rather than appending to
    # per-column lists) keeps values aligned when records do not all carry
    # the same keys; absent values become NaN in the DataFrame.
    rows = []
    for algo_data in results.values():
        for model_data in algo_data.values():
            meta = model_data['META']
            for dataset in fields:
                if dataset not in model_data:
                    continue
                row = dict(meta)
                row['Dataset'] = dataset
                row.update(model_data[dataset])
                rows.append(row)

    df = pd.DataFrame(rows)

    # Put the leading columns first and guarantee they exist (filled with
    # NaN when absent), so sorting and selection cannot raise KeyError.
    remaining_columns = [col for col in df.columns if col not in leading_columns]
    df = df.reindex(columns=leading_columns + remaining_columns)

    if not df.empty:
        # Sort by dataset, then by descending score, and rank per dataset.
        df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
        df['Rank'] = df.groupby('Dataset').cumcount() + 1

    # Checkbox configuration.
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']

    # Any column not listed explicitly is treated as numeric.
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box
def generate_table(results, fields):
    """Build the overview table with one row per entry of ``results``.

    Args:
        results: Mapping from entry name to a dict holding a ``'META'``
            dict plus one metrics dict per dataset name.
        fields: Iterable of dataset names; each contributes a
            ``<dataset>-Score`` and a ``<dataset>-Cost($)`` column.

    Returns:
        pandas.DataFrame: Rows sorted by ``Rank``. Entries with at least
        one score are ranked 1..n by descending ``Avg Score``; entries
        without any score all share rank n + 1 and come last. Columns are
        ``Rank``, ``Algorithm``, ``LLM``, ``Eval Date``, ``Avg Score`` and
        the per-dataset columns (restricted to those present in the data).

    Raises:
        KeyError: If an entry lacks ``'META'`` or one of ``META_FIELDS``.
    """
    # Display order; column names must match the ones used by BUILD_L1_DF.
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])

    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])

        scores = []
        for d in fields:
            if d in item:
                score = item[d].get("Score")
                cost = item[d].get("Cost($)")
            else:
                # Dataset not evaluated for this entry: keep columns aligned.
                score = cost = None
            res[f"{d}-Score"].append(score)
            res[f"{d}-Cost($)"].append(cost)
            if score is not None:
                scores.append(score)

        # Average over the datasets that actually have a score.
        res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)

    if not res:
        # No entries at all: return an empty table instead of failing on
        # the missing 'Avg Score' column.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(res)

    # A single stable sort puts scored entries first (best score on top) and
    # unscored entries last, keeping the input order among ties.
    df = df.sort_values('Avg Score', ascending=False, na_position='last', kind='stable')
    n_valid = int(df['Avg Score'].notna().sum())
    # Scored rows get 1..n_valid; every unscored row shares n_valid + 1.
    df['Rank'] = [min(position, n_valid + 1) for position in range(1, len(df) + 1)]

    # Rearrange the columns, keeping only those that exist.
    existing_columns = [col for col in columns if col in df.columns]
    df = df[existing_columns]
    return df
def generate_table_detail(results, fields):
    """Build the per-dataset detail table.

    Args:
        results: Nested mapping ``{algorithm: {model: model_data}}`` where
            each ``model_data`` holds a ``'META'`` dict plus one dict of
            metrics per dataset name.
        fields: Iterable of dataset names to include.

    Returns:
        pandas.DataFrame: One row per (algorithm, model, dataset)
        combination present in ``results``, sorted by dataset and
        descending score, with a per-dataset ``Rank`` column. The leading
        columns are ``Rank``, ``Dataset``, ``Algorithm``, ``LLM``,
        ``Eval Date``, ``Score``, ``Pass rate``, ``X-shot``; any other
        columns follow in order of first appearance.

    Raises:
        KeyError: If a model entry has no ``'META'`` key.
    """
    leading_columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']

    # Collect one dict per record. Building rows (rather than appending to
    # per-column lists) keeps values aligned when records do not all carry
    # the same keys; absent values become NaN in the DataFrame.
    rows = []
    for algo_data in results.values():
        for model_data in algo_data.values():
            meta = model_data['META']
            for dataset in fields:
                if dataset not in model_data:
                    continue
                row = dict(meta)
                row['Dataset'] = dataset
                row.update(model_data[dataset])
                rows.append(row)

    df = pd.DataFrame(rows)

    # Put the leading columns first and guarantee they exist (filled with
    # NaN when absent), so sorting and selection cannot raise KeyError.
    remaining_columns = [col for col in df.columns if col not in leading_columns]
    df = df.reindex(columns=leading_columns + remaining_columns)

    if not df.empty:
        # Sort by dataset, then by descending score, and rank per dataset.
        df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
        df['Rank'] = df.groupby('Dataset').cumcount() + 1

    return df