import os
from glob import glob

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import BoundaryNorm, ListedColormap

all_results = pd.read_pickle("all_results.pkl")


def get_accuracy_dataframe(df):
    # Calculate overall model accuracy
    df["parsed_judge_response"] = df["parsed_judge_response"].astype(float)
    model_accuracy = (
        df.groupby("model_name")["parsed_judge_response"].mean().reset_index()
    )

    # Calculate model accuracy per difficulty level
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model_name", "difficulty_level"])["parsed_judge_response"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model_name", columns="difficulty_level", values="parsed_judge_response"
    )

    # Merge overall accuracy and level-based accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(
        model_accuracy_per_level_df, on="model_name"
    )
    model_accuracy_df.rename(
        columns={1: "level_1", 2: "level_2", 3: "level_3", 4: "level_4", 5: "level_5"},
        inplace=True,
    )
    model_accuracy_df.rename(
        columns={"parsed_judge_response": "Accuracy"}, inplace=True
    )

    # Multiply by 100 and format to one decimal point
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Add headers with icons
    model_accuracy_df.columns = [
        "🤖 Model Name",
        "⭐ Overall",
        "📈 Level 1",
        "🔍 Level 2",
        "📘 Level 3",
        "🔬 Level 4",
    ]
    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    return model_accuracy_df


accuracy_df = get_accuracy_dataframe(all_results)

# Define the column names with icons
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


# Show the per-model heatmap image for the row selected in the leaderboard
def load_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


# # Function to process data
# def process_data(data):
#     data_for_df = []
#     for file, df in data.items():
#         overall_accuracy = round(calculate_accuracy(df), 2)
#         breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
#         model_name = file.split("/")[-1].replace(".pkl", "")
#         data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
#     return data_for_df


# # Function to finalize DataFrame
# def finalize_df(df):
#     df = df.round(1)  # Round to one decimal place
#     df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
#     df.columns = headers_with_icons
#     df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
#     # add a new column with the order (index)
#     df["#"] = range(1, len(df) + 1)
#     # bring rank to the first column
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     df = df[cols]
#     return df


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_qwen = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image_qwen])

    # with gr.Tab("Vision Benchmark", visible=False):
    #     gr.Markdown("# Vision Benchmark Leaderboard")
    #     leader_board_vision = gr.Dataframe(
    #         vision_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     heatmap_image_vision = gr.Image(label="", show_label=False)
    #     leader_board_vision.select(
    #         fn=load_vision_heatmap, outputs=[heatmap_image_vision]
    #     )

    # with gr.Tab("Text-only Benchmark (CoT)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard (CoT)")
    #     cot_leader_board_text = gr.Dataframe(
    #         cot_text_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     cot_heatmap_image_text = gr.Image(label="", show_label=False)
    #     cot_leader_board_text.select(
    #         fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
    #     )

    # with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
    #     gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)")
    #     included_models_cot = gr.CheckboxGroup(
    #         label="Models to include",
    #         choices=all_cot_text_only_models,
    #         value=all_cot_text_only_models,
    #         interactive=True,
    #     )
    #     with gr.Row():
    #         number_of_queries_cot = gr.Textbox(label="Number of included queries")
    #         number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
    #     constrained_leader_board_text_cot = gr.Dataframe()
    #     constrained_leader_board_plot_cot = gr.Plot()

    # with gr.Tab("Majority Vote (Subset 1)", visible=False):
    #     gr.Markdown("## Majority Vote (Subset 1)")
    #     intersection_leader_board = gr.Dataframe(
    #         intersection_df_acc, headers=headers_with_icons
    #     )
    #     heatmap_image = gr.Plot(label="Model Heatmap")

    # with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
    #     gr.Markdown("# Text-only Leaderboard")
    #     leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
    #     gr.Markdown("## Heatmap")
    #     heatmap_image = gr.Image(label="", show_label=False)
    #     leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    # # ============ Callbacks ============
    # included_models_cot.select(
    #     fn=calculate_order_by_first_substring_cot,
    #     inputs=[included_models_cot],
    #     outputs=[
    #         constrained_leader_board_text_cot,
    #         number_of_queries_cot,
    #         number_of_fsms_cot,
    #     ],
    #     queue=True,
    # )

    # constrained_leader_board_text.select(
    #     fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
    # )

    # constrained_leader_board_text_cot.select(
    #     fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
    # )

    # intersection_leader_board.select(
    #     fn=show_intersection_heatmap, outputs=[heatmap_image]
    # )

demo.launch()
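
# Illustrative sketch (not part of the app, kept commented out like the other
# optional blocks above): load_heatmap() serves a pre-rendered image per model
# at results/<model_name>.jpg. One possible way to regenerate those images from
# all_results.pkl, using the seaborn/matplotlib imports at the top of this
# file, is sketched below. The "fsm_id" column used to lay out the grid and the
# save_model_heatmaps name are assumptions, not things this script is known to
# define; substitute whatever identifies a single benchmark instance in your data.
#
# def save_model_heatmaps(df: pd.DataFrame, out_dir: str = "results") -> None:
#     os.makedirs(out_dir, exist_ok=True)
#     for model_name, model_df in df.groupby("model_name"):
#         # Rows = difficulty level, columns = benchmark instance,
#         # cell value = mean judge verdict in [0, 1].
#         grid = model_df.pivot_table(
#             index="difficulty_level",
#             columns="fsm_id",  # hypothetical identifier column
#             values="parsed_judge_response",
#             aggfunc="mean",
#         )
#         fig, ax = plt.subplots(figsize=(12, 4))
#         sns.heatmap(
#             grid,
#             cmap=ListedColormap(["#d73027", "#1a9850"]),  # red/green, split at 0.5
#             vmin=0.0,
#             vmax=1.0,
#             cbar=False,
#             ax=ax,
#         )
#         ax.set_title(model_name)
#         fig.savefig(os.path.join(out_dir, f"{model_name}.jpg"), bbox_inches="tight")
#         plt.close(fig)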