import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS


def listinstr(lst, s):
    """Return True if any element of *lst* occurs as a substring of *s*."""
    assert isinstance(lst, list)
    return any(item in s for item in lst)


def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    """Load and return the JSON results file.

    Uses a context manager so the file handle is always closed
    (the original left the handle dangling).
    """
    with open(file_name, "r") as f:
        return json.load(f)


def format_timestamp(timestamp):
    """Convert an ISO-like timestamp ('YYYY-MM-DDTHH:MM:SS...') to
    'YYYY-MM-DD HH:MM:SS'.

    Assumes the input has fixed-width ISO-8601 layout — TODO confirm
    against the producers of the META 'Eval Date' field.
    """
    date = timestamp[:10]
    time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
    return date + ' ' + time


def nth_large(val, vals):
    """Return the 1-based rank of *val* within *vals*
    (number of strictly larger values, plus one)."""
    return sum(1 for v in vals if v > val) + 1


def BUILD_L1_DF(results, fields):
    """Build the checkbox/type-map configuration for the L1 (overview) table.

    *fields* is the list of dataset names; each contributes a
    '<name>-Score' and '<name>-Cost($)' column pair.
    """
    # Per-dataset column pairs, computed once instead of twice.
    per_dataset = [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]

    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
    # revise there to set default dataset
    check_box['required'] = ['Avg Score'] + per_dataset
    check_box['avg'] = ['Avg Score']
    check_box['all'] = check_box['avg'] + per_dataset

    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map
    return check_box


def _flatten_results(results, fields):
    """Flatten nested {algorithm: {model: {'META': ..., dataset: ...}}} results
    into a long-format DataFrame with one row per (model, dataset).

    Rows are sorted by Dataset (ascending) and Score (descending), and a
    per-dataset 1-based 'Rank' column is added.  Shared by BUILD_L2_DF and
    generate_table_detail, which previously duplicated this logic.
    """
    res = defaultdict(list)
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            meta = model_data['META']
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # One row: metadata columns + dataset name + all metric fields.
                for k, v in meta.items():
                    res[k].append(v)
                res['Dataset'].append(dataset)
                for field, value in model_data[dataset].items():
                    res[field].append(value)

    df = pd.DataFrame(res)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Rank within each dataset group (groups are contiguous after the sort).
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    return df


def BUILD_L2_DF(results, fields):
    """Build the detailed (per-dataset) leaderboard DataFrame plus its
    checkbox/type-map configuration.

    Returns (df, check_box).
    """
    df = _flatten_results(results, fields)

    # Fixed leading columns; any extra metric columns follow in their
    # original order.
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date',
               'Score', 'Pass rate', 'X-shot', 'Parameters']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]

    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + [
        'Score', 'Pass rate', 'X-shot', 'Parameters',
        'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples',
                        'Total input tokens', 'Average input tokens',
                        'Total output tokens', 'Average output tokens',
                        'All tokens', 'Cost($)']

    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['Parameters'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):
    """Build the overview leaderboard DataFrame: one row per model with a
    per-dataset Score/Cost pair and an 'Avg Score' column.

    Models missing any dataset get Avg Score = None and sort to the bottom.
    """
    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])
        scores = []
        for d in fields:
            if d in item:
                res[d + "-Score"].append(item[d]["Score"])
                res[d + "-Cost($)"].append(item[d]["Cost($)"])
                scores.append(item[d]["Score"])
            else:
                res[d + "-Score"].append(None)
                res[d + "-Cost($)"].append(None)
                scores.append(None)
        # Average only when every dataset is present; otherwise leave missing.
        res['Avg Score'].append(round(np.mean(scores), 2) if None not in scores else None)

    df = pd.DataFrame(res)

    # Sort descending by Avg Score; rows with a missing average sink to the
    # bottom (pandas places NaN last by default), then assign sequential ranks.
    # NOTE(review): the original also built an intermediate reversed rank that
    # was both index-misaligned and immediately overwritten — removed as dead code.
    df = df.sort_values(['Avg Score'], ascending=[False])
    df['Rank'] = range(1, len(df) + 1)

    # Fixed leading columns, then the per-dataset pairs, then anything else.
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    existing_columns = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[existing_columns + remaining_columns]
    return df


def generate_table_detail(results, fields):
    """Build the detailed per-dataset table (no checkbox config).

    Same data as BUILD_L2_DF but with 'Dataset' placed before 'Algorithm'.
    """
    df = _flatten_results(results, fields)
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date',
               'Score', 'Pass rate', 'X-shot', 'Parameters']
    remaining_columns = [col for col in df.columns if col not in columns]
    return df[columns + remaining_columns]