import gradio as gr
import pandas as pd

all_results = pd.read_pickle("final_df.pkl")
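# `all_results` is expected to hold one row per (model, question) pair, with at
# least the columns used below: "model", "category", "difficulty_level", and
# "is_answer_correct".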
def get_accuracy_dataframe(df_mother, category):
    """Build a per-model accuracy leaderboard for one prompting category."""
    # Keep only the rows belonging to the requested category.
    df = df_mother[df_mother["category"] == category].copy()
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)

    # Overall accuracy per model.
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Accuracy per model and difficulty level, pivoted to one column per level.
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model", columns="difficulty_level", values="is_answer_correct"
    )

    # Merge overall and per-level accuracy into a single DataFrame.
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )
    # Keep only the short model name (drop any "org/" prefix).
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels are present.
    expected_levels = [1, 2, 3, 4]  # Adjust based on your data
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            model_accuracy_df[level] = None  # Fill missing levels with None

    # Rename the numeric level columns to readable names.
    level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
    model_accuracy_df.rename(columns=level_columns, inplace=True)

    # Convert fractions to percentages, rounded to one decimal place.
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )
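    # Note: DataFrame.applymap is deprecated since pandas 2.1 in favor of
    # DataFrame.map; applymap is kept here for compatibility with older pandas.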
    # Final display headers with icons.
    model_accuracy_df.columns = [
        "🤖 Model Name",
        "✅ Overall",
        "📗 Level 1",
        "📘 Level 2",
        "📙 Level 3",
        "🔬 Level 4",
    ]
    model_accuracy_df.sort_values(by="✅ Overall", ascending=False, inplace=True)
    return model_accuracy_df
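# Example: get_accuracy_dataframe(all_results, "CoT") yields one row per model,
# sorted by overall accuracy, with all scores shown as percentages.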
# Categories present in the data: "1shot", "CoT", "Textonly", "vision", "vision-CoT".
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
# Column headers with icons (must match the headers set in get_accuracy_dataframe).
headers_with_icons = [
    "🤖 Model Name",
    "✅ Overall",
    "📗 Level 1",
    "📘 Level 2",
    "📙 Level 3",
    "🔬 Level 4",
]
def make_heatmap_loader(category):
    """Return a select-handler that shows the clicked model's heatmap image."""

    def load_heatmap(evt: gr.SelectData):
        # evt.value holds the content of the clicked cell; clicking a model-name
        # cell selects that model's pre-rendered heatmap for this category.
        return gr.Image(f"./heatmaps/{evt.value}_{category}.jpg")

    return load_heatmap

load_heatmap_textonly = make_heatmap_loader("Textonly")
load_heatmap_cot = make_heatmap_loader("CoT")
load_heatmap_vision = make_heatmap_loader("vision")
load_heatmap_vision_cot = make_heatmap_loader("vision-CoT")
load_heatmap_1shot = make_heatmap_loader("1shot")
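# Gradio injects a gr.SelectData event into handlers annotated with that type,
# so the .select() bindings below need no explicit `inputs`.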
# Build the tabbed leaderboard UI and wire each table's select event to its heatmap loader.
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    # Text-only Benchmark
    with gr.Tab("Text-only Benchmark"):
        leader_board_textonly = gr.Dataframe(
            accuracy_df_textonly, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_textonly = gr.Image(label="", show_label=False)
        leader_board_textonly.select(
            fn=load_heatmap_textonly, outputs=[heatmap_image_textonly]
        )

    # CoT Benchmark
    with gr.Tab("CoT Benchmark"):
        leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_cot = gr.Image(label="", show_label=False)
        leader_board_cot.select(fn=load_heatmap_cot, outputs=[heatmap_image_cot])

    # Vision Benchmark
    with gr.Tab("Vision Benchmark"):
        leader_board_vision = gr.Dataframe(
            accuracy_df_vision, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_heatmap_vision, outputs=[heatmap_image_vision]
        )

    # Vision-CoT Benchmark
    with gr.Tab("Vision-CoT Benchmark"):
        leader_board_vision_cot = gr.Dataframe(
            accuracy_df_vision_cot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision_cot = gr.Image(label="", show_label=False)
        leader_board_vision_cot.select(
            fn=load_heatmap_vision_cot, outputs=[heatmap_image_vision_cot]
        )

    # 1shot Benchmark
    with gr.Tab("1shot Benchmark"):
        leader_board_1shot = gr.Dataframe(accuracy_df_1shot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_1shot = gr.Image(label="", show_label=False)
        leader_board_1shot.select(fn=load_heatmap_1shot, outputs=[heatmap_image_1shot])

demo.launch()
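# The heatmap images are assumed to be pre-rendered offline, one JPEG per
# (model, category) pair, e.g. ./heatmaps/<model>_Textonly.jpg. When hosted as a
# Hugging Face Space, this file runs as app.py and demo.launch() serves the UI.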