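# Page-header residue from the Hugging Face file viewer (not part of the program):
#   avatar: BreakLee's picture
#   commit: "SEED Benchmark Leaderboard Update" (c4d90ef)
#   viewer links: raw / history / blame
#   file size: 11.1 kB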
# Names exported when this module is star-imported.
# NOTE(review): 'make_clickable_model', 'make_clickable_user' and
# 'get_submissions' are not defined anywhere in the visible part of this file.
# Unless `from constants import *` happens to provide them, a star-import of
# this module would fail with AttributeError — confirm and prune the list.
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
import gradio as gr
import pandas as pd
import json
import pdb
import tempfile
from constants import *
from src.auto_leaderboard.model_metadata_type import ModelType
# NOTE(review): a `global` statement at module scope has no effect, because
# names assigned at this level are already module globals. `data_component` is
# assigned later at module level inside the `with block:` UI section, while
# `filter_component` is only ever assigned as a local variable inside
# `on_checkbox_group_change`, so this declaration does not make it global.
global data_component, filter_component
def upload_file(files):
    """Collect the ``name`` attribute of every uploaded file object.

    Args:
        files: Iterable of objects exposing a ``name`` attribute (used here
            as the file's path).

    Returns:
        A list with one ``name`` per input object, in input order.
    """
    collected_paths = []
    for uploaded in files:
        collected_paths.append(uploaded.name)
    return collected_paths
def prediction_analyse(prediction_content, ground_truth_path="./file/SEED-Bench.json"):
    """Score a JSON-lines prediction dump against the ground-truth answers.

    Each non-empty line of ``prediction_content`` is expected to be a JSON
    object with a ``"question_id"`` and a ``"prediction"`` key. The ground
    truth is read from ``ground_truth_path``, a JSON file whose top-level
    ``"questions"`` list holds objects with ``"question_id"``,
    ``"question_type_id"`` and ``"answer"``.

    Lines that cannot be scored (invalid JSON, not a JSON object, missing
    keys, or a ``question_id`` absent from the ground truth) are skipped with
    a printed warning instead of aborting the whole submission.

    Args:
        prediction_content: Text with one JSON prediction per line.
        ground_truth_path: Path of the ground-truth JSON file.

    Returns:
        dict mapping each question type id in 1..12 to
        ``{"correct": int, "total": int}``.

    Raises:
        OSError: If the ground-truth file cannot be opened.
        KeyError: If the ground-truth file itself is malformed (no
            ``"questions"`` list, entries missing required keys, or a
            ``question_type_id`` outside 1..12).
    """
    # Load the ground truth; explicit encoding keeps parsing independent of
    # the platform's default locale.
    with open(ground_truth_path, "r", encoding="utf-8") as file:
        ground_truth_data = json.load(file)["questions"]

    # Index the ground truth by question_id for O(1) lookup per prediction.
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # One counter pair per question type id (1..12).
    results = {i: {"correct": 0, "total": 0} for i in range(1, 13)}

    for line in prediction_content.split("\n"):
        line = line.strip()
        if not line:
            continue

        try:
            prediction = json.loads(line)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {line}")
            continue

        # A valid JSON value that is not an object (e.g. a list or number)
        # or that lacks the required keys cannot be scored.
        if (
            not isinstance(prediction, dict)
            or "question_id" not in prediction
            or "prediction" not in prediction
        ):
            print(f"Warning: Skipping malformed prediction in line: {line}")
            continue

        question_id = prediction["question_id"]
        try:
            gt_item = ground_truth[question_id]
        except (KeyError, TypeError):
            # KeyError: id not in the benchmark; TypeError: unhashable id.
            print(f"Warning: Skipping unknown question_id: {question_id!r}")
            continue

        counters = results[gt_item["question_type_id"]]
        if prediction["prediction"] == gt_item["answer"]:
            counters["correct"] += 1
        counters["total"] += 1

    return results
def _accuracy_percent(correct, total):
    """Return ``correct / total`` as a percentage rounded to 0.1, or 0 when ``total`` is 0."""
    if not total:
        return 0
    return round(correct / total * 100, 1)


def _plain_model_name(cell):
    """Return the display name of a 'Model' cell, unwrapping ``[name](url)`` if present."""
    text = str(cell)
    if text.startswith("[") and "](" in text:
        return text[1:text.index("](")]
    return text


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_type: str,
    model_link: str,
    LLM_type: str,
    LLM_name_textbox: str,
    Evaluation_dimension: str,
):
    """Score an uploaded prediction file and write the result into the leaderboard CSV.

    The upload is decoded as UTF-8, scored with ``prediction_analyse`` and
    turned into per-task accuracies (task ids 1..12; 1..9 are counted as
    image tasks, 10..12 as video tasks). The resulting row is appended to the
    CSV at ``CSV_DIR``, or overwrites the existing row whose model name equals
    ``revision_name_textbox``.

    Args:
        input_file: Raw bytes of the uploaded file, or ``None`` if nothing
            was uploaded.
        model_name_textbox: Model name used when no revision name is given.
        revision_name_textbox: If non-empty, used as the model name; when it
            matches an existing row, that row is replaced.
        model_type: Value stored in the first column of the row.
        model_link: Optional URL; when non-empty the name is stored as a
            markdown link ``[name](url)``.
        LLM_type: Chosen LLM, or ``'Other'`` to use ``LLM_name_textbox``.
        LLM_name_textbox: Free-text LLM name, used only for ``'Other'``.
        Evaluation_dimension: ``'All'``, ``'Image'`` or ``'Video'``; selects
            which tasks and averages are reported (the rest are stored as 0).

    Returns:
        ``"Error! Empty file!"`` when ``input_file`` is ``None``, else ``0``.
    """
    if input_file is None:
        return "Error! Empty file!"

    content = input_file.decode("utf-8")
    prediction = prediction_analyse(content)
    csv_data = pd.read_csv(CSV_DIR)

    # Half-open range [start, end) of task ids that are actually reported.
    start_dimension, end_dimension = 1, 13
    if Evaluation_dimension == 'Image':
        end_dimension = 10
    elif Evaluation_dimension == 'Video':
        start_dimension = 10

    # Tasks outside the evaluated range, and tasks with no scored
    # predictions, report 0 instead of dividing by zero.
    each_task_accuracy = {
        i: (
            _accuracy_percent(prediction[i]["correct"], prediction[i]["total"])
            if start_dimension <= i < end_dimension
            else 0
        )
        for i in range(1, 13)
    }

    # Aggregate counts for the image / video / overall averages.
    total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
    total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
    total_image = sum(prediction[i]["total"] for i in range(1, 10))
    total_video = sum(prediction[i]["total"] for i in range(10, 13))

    if Evaluation_dimension != 'Video':
        average_accuracy_image = _accuracy_percent(total_correct_image, total_image)
    else:
        average_accuracy_image = 0

    if Evaluation_dimension != 'Image':
        average_accuracy_video = _accuracy_percent(total_correct_video, total_video)
    else:
        average_accuracy_video = 0

    if Evaluation_dimension == 'All':
        overall_accuracy = _accuracy_percent(
            total_correct_image + total_correct_video,
            total_image + total_video,
        )
    else:
        overall_accuracy = 0

    LLM_name = LLM_name_textbox if LLM_type == 'Other' else LLM_type

    # Decide which row to write: append by default, overwrite when the
    # revision name matches an existing model.
    row_index = csv_data.shape[0]
    if revision_name_textbox == '':
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        # Compare against display names so rows stored with or without a
        # markdown link can both be matched.
        name_list = [_plain_model_name(name) for name in csv_data['Model']]
        if revision_name_textbox in name_list:
            row_index = name_list.index(revision_name_textbox)

    # NOTE(review): model_name / model_link are user-supplied and are stored
    # unescaped in a markdown-rendered cell; consider sanitising them.
    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    new_data = [
        model_type,
        model_name,
        LLM_name,
        overall_accuracy,
        average_accuracy_image,
        average_accuracy_video,
    ]
    new_data.extend(each_task_accuracy[i] for i in range(1, 13))

    csv_data.loc[row_index] = new_data
    csv_data.to_csv(CSV_DIR, index=False)
    return 0
def get_baseline_df():
    """Load the leaderboard CSV, ranked and trimmed to the default columns.

    Reads ``CSV_DIR``, orders rows by the ``"Avg. All"`` column from highest
    to lowest, and keeps only ``MODEL_INFO`` plus the columns held in
    ``checkbox_group.value``.

    Returns:
        The sorted, column-filtered ``pandas.DataFrame``.
    """
    ranked = pd.read_csv(CSV_DIR).sort_values(by="Avg. All", ascending=False)
    visible_columns = MODEL_INFO + checkbox_group.value
    return ranked[visible_columns]
def get_all_df():
    """Load the full leaderboard CSV ranked by ``"Avg. All"`` (highest first).

    Returns:
        The sorted ``pandas.DataFrame`` with every column from ``CSV_DIR``.
    """
    leaderboard = pd.read_csv(CSV_DIR)
    return leaderboard.sort_values(by="Avg. All", ascending=False)
# Gradio UI: builds the leaderboard page (table tab, about tab, submission
# tab, refresh button) and launches it.
# NOTE(review): the nesting below was reconstructed from syntax because the
# source had lost its indentation — confirm the layout against the running app.
block = gr.Blocks()
with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: the leaderboard table.
        with gr.TabItem("🏅 SEED Benchmark", elem_id="seed-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)
            gr.Markdown(
                TABLE_INTRODUCTION
            )
            # Column selector: choices come from TASK_INFO_v2, the initial
            # selection is AVG_INFO.
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO_v2,
                value=AVG_INFO,
                label="Select options",
                interactive=True,
            )
            # Create the dataframe component. The callable itself (not its
            # result) is passed as the value — presumably so Gradio evaluates
            # it on page load; confirm for the pinned Gradio version.
            data_component = gr.components.Dataframe(
                value=get_baseline_df,
                headers=COLUMN_NAMES,
                type="pandas",
                datatype=DATA_TITILE_TYPE,
                interactive=False,
                visible=True,
            )

            def on_checkbox_group_change(selected_columns):
                """Return table data restricted to MODEL_INFO plus the ticked columns.

                The selection is re-ordered to follow TASK_INFO_v2, and the
                per-column datatypes are looked up by position in COLUMN_NAMES.
                """
                selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = get_all_df()[present_columns]
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
                # A fresh Dataframe component is built only to read back its
                # `.value`; `filter_component` is a local name here.
                # NOTE(review): what `.value` contains depends on the Gradio
                # version — verify before simplifying this.
                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component.value

            # Wire the checkbox group to the handler; its result replaces the table.
            checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
        # Tab 2: about page (note: tab ids are 0, 2, 3 — there is no id=1).
        with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
        # Tab 3: submission form.
        with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
            with gr.Row():
                # Column 1: model identity.
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_type = gr.Dropdown(
                        choices=[
                            "LLM",
                            "ImageLLM",
                            "VideoLLM",
                            "Other",
                        ],
                        label="Model type",
                        multiselect=False,
                        value="ImageLLM",
                        interactive=True,
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                # Column 2: underlying LLM and evaluation scope.
                with gr.Column():
                    LLM_type = gr.Dropdown(
                        choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
                        label="LLM type",
                        multiselect=False,
                        value="LLaMA-7B",
                        interactive=True,
                    )
                    LLM_name_textbox = gr.Textbox(
                        label="LLM model (for Other)",
                        placeholder="LLaMA-13B"
                    )
                    Evaluation_dimension = gr.Dropdown(
                        choices=["All", "Image", "Video"],
                        label="Evaluation dimension",
                        multiselect=False,
                        value="All",
                        interactive=True,
                    )
                # Column 3: file upload and submit button.
                with gr.Column():
                    input_file = gr.inputs.File(label = "Click to Upload a json File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")
                    submission_result = gr.Markdown()
                    # The input order must match add_new_eval's parameter order.
                    # No outputs are wired, so add_new_eval's return value is
                    # not shown and `submission_result` stays unused.
                    submit_button.click(
                        add_new_eval,
                        inputs = [
                            input_file,
                            model_name_textbox,
                            revision_name_textbox,
                            model_type,
                            model_link,
                            LLM_type,
                            LLM_name_textbox,
                            Evaluation_dimension,
                        ],
                        # outputs = submission_result,
                    )
    # Refresh button: reloads the table through get_baseline_df.
    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            get_baseline_df, outputs=data_component
        )
    # block.load(get_baseline_df, outputs=data_title)
# Runs at module top level, i.e. the app starts whenever this file is executed
# or imported.
block.launch()