import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    with open(file_name, 'r') as f:
        return json.load(f)


def format_timestamp(timestamp):
    date = timestamp[:10]
    time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
    return date + ' ' + time


def nth_large(val, vals):
    return sum(1 for v in vals if v > val) + 1


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']

    # First check which columns exist in the actual data structure
    sample_data = next(iter(results.values()))
    available_fields = [field for field in fields if field in sample_data]

    # Build column names; they must match those produced in generate_table
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]
    combined_columns = score_columns + cost_columns
    combined_columns_sorted = sorted(combined_columns, key=lambda x: x.split('-')[0])

    check_box['required'] = ['Avg Score'] + combined_columns_sorted
    check_box['all'] = ['Avg Score'] + combined_columns_sorted

    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Avg Score'] = 'number'
    type_map['gsm8k-Score'] = 'number'
    type_map['AQuA-Score'] = 'number'
    type_map['gsm8k-Cost($)'] = 'number'
    type_map['AQuA-Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return check_box


def BUILD_L2_DF(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Add all fields of the dataset record
                dataset_data = model_data[dataset]
                for field, value in dataset_data.items():
                    res[field].append(value)

    df = pd.DataFrame(res)

    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])

    # Rank rows within each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1

    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]

    # Set checkbox configuration
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + [
        'Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)'
    ]
    check_box['all'] = [
        'Score', 'Pass rate', 'X-shot', 'Samples',
        'Total input tokens', 'Average input tokens',
        'Total output tokens', 'Average output tokens',
        'All tokens', 'Cost($)'
    ]
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box
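

# Illustrative only: a minimal sketch of the nested structure that BUILD_L2_DF
# (above) and generate_table_detail (below) iterate over. The algorithm,
# model, and dataset names and all numbers here are hypothetical examples,
# not values taken from the real score file.
#
# {
#     "<algorithm>": {
#         "<model>": {
#             "META": {"Algorithm": "...", "LLM": "...", "Eval Date": "..."},
#             "gsm8k": {"Score": 85.2, "Pass rate": 98.1, "X-shot": "8",
#                       "Samples": 1319, "All tokens": 1000000, "Cost($)": 0.52},
#             "AQuA": {...},
#         },
#     },
# }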


def generate_table(results, fields):
    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])

        scores, costs = [], []
        # Column name format must match BUILD_L1_DF
        for d in fields:
            if d in item:
                score = item[d].get('Score')
                cost = item[d].get('Cost($)')
                res[f"{d}-Score"].append(score)
                res[f"{d}-Cost($)"].append(cost)
                if score is not None:
                    scores.append(score)
                if cost is not None:
                    costs.append(cost)
            else:
                res[f"{d}-Score"].append(None)
                res[f"{d}-Cost($)"].append(None)

        # Calculate average score
        res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)

    df = pd.DataFrame(res)

    # Rank valid entries by Avg Score; entries without a score share the last rank
    valid = df[~pd.isna(df['Avg Score'])].copy()
    missing = df[pd.isna(df['Avg Score'])].copy()
    valid = valid.sort_values('Avg Score', ascending=False)
    valid['Rank'] = range(1, len(valid) + 1)
    if not missing.empty:
        missing['Rank'] = len(valid) + 1
    df = pd.concat([valid, missing])
    df = df.sort_values('Rank')

    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    existing_columns = [col for col in columns if col in df.columns]
    df = df[existing_columns]
    return df


def generate_table_detail(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Add all fields of the dataset record
                dataset_data = model_data[dataset]
                for field, value in dataset_data.items():
                    res[field].append(value)

    df = pd.DataFrame(res)

    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])

    # Rank rows within each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1

    # Rearrange column order
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    return df
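

if __name__ == '__main__':
    # Minimal smoke test: a sketch assuming OVERALL_MATH_SCORE_FILE exists
    # locally and holds the flat {name: {"META": ..., "<dataset>": ...}}
    # structure that generate_table expects, and that DEFAULT_MATH_BENCH
    # (imported above from meta_data) supplies the dataset list.
    overall_results = load_results()
    leaderboard = generate_table(overall_results, DEFAULT_MATH_BENCH)
    print(leaderboard.head())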