import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd
from decimal import Decimal, ROUND_HALF_UP

from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS


def listinstr(lst, s):
    # Return True if any item in `lst` occurs as a substring of `s`.
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    with open(file_name, 'r') as f:
        data = json.load(f)
    return data


def format_timestamp(timestamp):
    # Convert an ISO-like timestamp ('YYYY-MM-DDTHH:MM:SS...') to 'YYYY-MM-DD HH:MM:SS'.
    date = timestamp[:10]
    time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
    return date + ' ' + time


def nth_large(val, vals):
    # 1-based rank of `val` among `vals` (1 = largest).
    return sum([1 for v in vals if v > val]) + 1


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
    # First check which columns exist in the actual data structure
    sample_data = next(iter(results.values()))
    available_fields = []
    for field in fields:
        if field in sample_data:
            available_fields.append(field)
    # Build column names; they must match exactly the ones produced in generate_table
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]
    combined_columns = score_columns + cost_columns
    combined_columns_sorted = sorted(combined_columns, key=lambda x: x.split('-')[0])
    check_box['required'] = ['Avg Score'] + combined_columns_sorted
    check_box['all'] = ['Avg Score'] + combined_columns_sorted
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Avg Score'] = 'number'
    type_map['gsm8k-Score'] = 'number'
    type_map['AQuA-Score'] = 'number'
    type_map['gsm8k-Cost($)'] = 'number'
    type_map['AQuA-Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return check_box


def BUILD_L2_DF(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Get dataset data
                dataset_data = model_data[dataset]
                # Add all fields
                for field, value in dataset_data.items():
                    res[field].append(value)
    # Create DataFrame
    df = pd.DataFrame(res)
    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank within each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    # Set checkbox configuration
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens',
                        'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):
    res = defaultdict(list)
    for i, m in enumerate(results):
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])
        scores, costs = [], []
        # Ensure column name format matches BUILD_L1_DF
        for d in fields:
            if d in item:
                score = item[d].get("Score")
                cost = item[d].get("Cost($)")
                res[f"{d}-Score"].append(score)
                res[f"{d}-Cost($)"].append(cost)
                if score is not None:
                    scores.append(score)
                if cost is not None:
                    costs.append(cost)
            else:
                res[f"{d}-Score"].append(None)
                res[f"{d}-Cost($)"].append(None)
        # Calculate the average score with Decimal to avoid float rounding artifacts;
        # leave it as None when no benchmark scores are available
        if scores:
            decimal_numbers = [Decimal(str(num)) for num in scores]
            avg_score = sum(decimal_numbers) / len(decimal_numbers)
            formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
        else:
            formatted_average = None
        res['Avg Score'].append(formatted_average)
    df = pd.DataFrame(res)
    # Rank by average score; rows without an average score share the last rank
    valid = df[~pd.isna(df['Avg Score'])].copy()
    missing = df[pd.isna(df['Avg Score'])].copy()
    valid = valid.sort_values('Avg Score', ascending=False)
    valid['Rank'] = range(1, len(valid) + 1)
    if not missing.empty:
        missing['Rank'] = len(valid) + 1
    df = pd.concat([valid, missing])
    df = df.sort_values('Rank')
    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    existing_columns = [col for col in columns if col in df.columns]
    df = df[existing_columns]
    return df


def generate_table_detail(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Get dataset data
                dataset_data = model_data[dataset]
                # Add all fields
                for field, value in dataset_data.items():
                    res[field].append(value)
    # Create DataFrame
    df = pd.DataFrame(res)
    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank within each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Rearrange column order
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    return df
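

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the leaderboard app):
# it assumes the JSON file referenced by OVERALL_MATH_SCORE_FILE exists locally
# and that its top-level entries each contain a 'META' dict plus one dict per
# benchmark name listed in DEFAULT_MATH_BENCH, as generate_table expects.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    results = load_results()
    check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
    table = generate_table(results, DEFAULT_MATH_BENCH)
    print('Default leaderboard columns:', check_box['essential'] + check_box['required'])
    print(table.head())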