# Spaces: Running
import copy as cp | |
import json | |
from collections import defaultdict | |
from urllib.request import urlopen | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS | |
def listinstr(lst, s):
    """Return True when at least one element of ``lst`` is contained in ``s``.

    ``lst`` must be a ``list`` (checked with an assertion); containment is
    evaluated with the ``in`` operator against ``s``.
    """
    assert isinstance(lst, list)
    return any(candidate in s for candidate in lst)
def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    """Read ``file_name`` and return its parsed JSON content.

    Args:
        file_name: Path of the JSON file to load; defaults to
            ``OVERALL_MATH_SCORE_FILE``.

    Returns:
        The object produced by decoding the file's JSON text.
    """
    # The context manager closes the handle deterministically; the previous
    # ``open(...).read()`` left that to the garbage collector.
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
def format_timestamp(timestamp):
    """Rebuild ``timestamp`` as ``'<first 10 chars> <hh>:<mm>:<ss>'``.

    The date part is the first ten characters; the three time parts are the
    two-character slices starting at offsets 11, 14 and 17. Whatever separator
    characters sit at offsets 10, 13 and 16 are discarded.
    """
    day = timestamp[:10]
    clock_parts = (timestamp[11:13], timestamp[14:16], timestamp[17:19])
    return ' '.join((day, ':'.join(clock_parts)))
def nth_large(val, vals):
    """Return the 1-based position of ``val`` among ``vals`` in descending order.

    The result is one plus the number of elements of ``vals`` that are
    strictly greater than ``val``, so equal values share the same position.
    """
    position = 1
    for candidate in vals:
        if candidate > val:
            position += 1
    return position
def BUILD_L1_DF(results, fields):
    """Build the checkbox configuration for the overview (level-1) table.

    Args:
        results: Not used by this function; kept for interface compatibility.
        fields: Dataset names. Each contributes a ``'<name>-Score'`` and a
            ``'<name>-Cost($)'`` column, in the given order.

    Returns:
        A dict with the keys ``'essential'``, ``'required'``, ``'avg'``,
        ``'all'`` (lists of column names) and ``'type_map'`` (a defaultdict
        mapping column name to display type, falling back to ``'number'``).
    """

    def dataset_columns():
        # Fresh list on every call so 'required' and 'all' never share state.
        names = []
        for dataset in fields:
            names.append(f'{dataset}-Score')
            names.append(f'{dataset}-Cost($)')
        return names

    # Edit 'required' below to change which dataset columns are on by default.
    required_columns = ['Avg Score'] + dataset_columns()
    avg_columns = ['Avg Score']
    all_columns = avg_columns + dataset_columns()

    column_types = defaultdict(
        lambda: 'number',
        {
            'Algorithm': 'html',
            'LLM': 'html',
            'Vision Model': 'html',
            'Eval Date': 'str',
        },
    )

    return {
        'essential': ['Algorithm', 'LLM', 'Eval Date'],
        'required': required_columns,
        'avg': avg_columns,
        'all': all_columns,
        'type_map': column_types,
    }
def BUILD_L2_DF(results, fields):
    """Build the per-dataset detail table and its checkbox configuration.

    Args:
        results: Nested mapping ``{algorithm: {model: record}}``. Each record
            holds a ``'META'`` dict plus one dict of metrics per dataset name.
        fields: Dataset names to include. A dataset that is absent from a
            record is skipped for that record.

    Returns:
        A ``(df, check_box)`` tuple. ``df`` has one row per
        (record, dataset) pair, sorted by ``'Dataset'`` ascending and
        ``'Score'`` descending, with a 1-based ``'Rank'`` restarted for each
        dataset. ``check_box`` maps ``'essential'``, ``'required'`` and
        ``'all'`` to column-name lists and ``'type_map'`` to a defaultdict of
        display types (default ``'number'``).

    Raises:
        KeyError: If a record has no ``'META'`` entry, or if rows exist but
            none of them supplies one of the leading columns.
    """
    # Checkbox configuration (independent of the data).
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['Parameters'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map

    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']

    # Collect one dict per row instead of one list per column: records whose
    # metric keys differ then yield NaN cells rather than unequal-length
    # columns, which made the DataFrame constructor raise ValueError.
    rows = []
    for algo_data in results.values():
        for model_data in algo_data.values():
            meta = model_data['META']
            for dataset in fields:
                if dataset not in model_data:
                    continue
                row = dict(meta)
                row['Dataset'] = dataset
                row.update(model_data[dataset])
                rows.append(row)

    if not rows:
        # Nothing to rank; return an empty table that still has the expected
        # leading columns instead of failing on the missing sort keys.
        return pd.DataFrame(columns=columns), check_box

    df = pd.DataFrame(rows)
    # Sort by Dataset and Score in descending order
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank for each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Leading columns first, everything else in its original order
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    return df, check_box
def generate_table(results, fields, meta_fields=None):
    """Build the overview table with one row per entry of ``results``.

    Args:
        results: Mapping whose values are records holding a ``'META'`` dict
            plus, per dataset name, a dict with ``'Score'`` and ``'Cost($)'``.
        fields: Dataset names. Each contributes a ``'<name>-Score'`` and a
            ``'<name>-Cost($)'`` column; a dataset missing from a record is
            stored as ``None``.
        meta_fields: Keys copied from each record's ``'META'`` dict. Defaults
            to the module-level ``META_FIELDS``.

    Returns:
        A DataFrame sorted by ``'Avg Score'`` in descending order. ``'Avg
        Score'`` is the mean of the dataset scores rounded to two decimals,
        or missing when any score is missing (or ``fields`` is empty). Rows
        without an average sort last. ``'Rank'`` numbers the rows 1..n in
        that order. Columns are ``Rank, Algorithm, LLM, Eval Date, Avg
        Score``, then the dataset columns, then any other metadata columns.

    Raises:
        KeyError: If a record lacks ``'META'`` or one of ``meta_fields``, or
            if ``'Algorithm'``, ``'LLM'`` or ``'Eval Date'`` is not among the
            collected metadata columns.
    """
    if meta_fields is None:
        meta_fields = META_FIELDS

    lead_columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    dataset_columns = []
    for d in fields:
        dataset_columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    ordered_columns = lead_columns + dataset_columns

    res = defaultdict(list)
    for item in results.values():
        meta = item['META']
        for k in meta_fields:
            res[k].append(meta[k])
        scores = []
        for d in fields:
            if d in item:
                score, cost = item[d]["Score"], item[d]["Cost($)"]
            else:
                score = cost = None
            res[d + "-Score"].append(score)
            res[d + "-Cost($)"].append(cost)
            scores.append(score)
        # Only average complete score sets; an empty list would make np.mean
        # warn and return NaN, so treat it as "no average" as well.
        has_all_scores = bool(scores) and None not in scores
        res['Avg Score'].append(round(np.mean(scores), 2) if has_all_scores else None)

    if not res:
        # No records: return an empty table with the expected columns rather
        # than failing on the missing 'Avg Score' column.
        extra = [k for k in meta_fields if k not in ordered_columns]
        return pd.DataFrame(columns=ordered_columns + extra)

    df = pd.DataFrame(res)
    # A stable sort keeps tied rows, and rows without an average, in input
    # order, so the ranking is deterministic. Missing averages sort last.
    df = df.sort_values('Avg Score', ascending=False, kind='mergesort')
    # Rank is positional. The earlier implementation first assigned ranks via
    # pd.Series objects carrying a fresh RangeIndex, which pandas aligns on
    # index labels (not row position); that result was wrong and was then
    # overwritten by this same positional ranking, so it has been removed.
    df['Rank'] = range(1, len(df) + 1)
    remaining_columns = [col for col in df.columns if col not in ordered_columns]
    return df[ordered_columns + remaining_columns]
def generate_table_detail(results, fields):
    """Build the per-dataset detail table.

    Args:
        results: Nested mapping ``{algorithm: {model: record}}``. Each record
            holds a ``'META'`` dict plus one dict of metrics per dataset name.
        fields: Dataset names to include. A dataset that is absent from a
            record is skipped for that record.

    Returns:
        A DataFrame with one row per (record, dataset) pair, sorted by
        ``'Dataset'`` ascending and ``'Score'`` descending, with a 1-based
        ``'Rank'`` restarted for each dataset. The leading columns are
        ``Rank, Dataset, Algorithm, LLM, Eval Date, Score, Pass rate, X-shot,
        Parameters``; any other columns follow in their original order.

    Raises:
        KeyError: If a record has no ``'META'`` entry, or if rows exist but
            none of them supplies one of the leading columns.
    """
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']

    # Collect one dict per row instead of one list per column: records whose
    # metric keys differ then yield NaN cells rather than unequal-length
    # columns, which made the DataFrame constructor raise ValueError.
    rows = []
    for algo_data in results.values():
        for model_data in algo_data.values():
            meta = model_data['META']
            for dataset in fields:
                if dataset not in model_data:
                    continue
                row = dict(meta)
                row['Dataset'] = dataset
                row.update(model_data[dataset])
                rows.append(row)

    if not rows:
        # Nothing to rank; return an empty table that still has the expected
        # leading columns instead of failing on the missing sort keys.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows)
    # Sort by Dataset and Score in descending order
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank for each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Leading columns first, everything else in its original order
    remaining_columns = [col for col in df.columns if col not in columns]
    return df[columns + remaining_columns]