Spaces:

TIGER-Lab
/

MMEB

Running

App Files Files Community

MMEB / utils.py

wenhu

Fixed some errors (#2)

ba1590f verified 19 days ago

raw

history blame

8.49 kB

	import pandas as pd
	import gradio as gr
	import csv
	import json
	import os
	import requests
	import io
	import shutil
	from huggingface_hub import Repository

	# Hugging Face access token taken from the environment; None when unset.
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Meta-task categories; each name is also a score column of the leaderboard.
	TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]

	# Column names selected (in this order) by the functions in this module
	# that return df[COLUMN_NAMES].
	MODEL_INFO = [
	    "Models", "Model Size(B)", "Data Source",
	    "Overall", "IND", "OOD",
	    "Classification", "VQA", "Retrieval", "Grounding",
	]

	# Every MODEL_INFO column that is not a per-task score.
	BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]

	# One type label per MODEL_INFO entry, in the same order.
	# NOTE(review): presumably passed as the Gradio dataframe `datatype` - confirm at the call site.
	DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

	SUBMISSION_NAME = "MMEB"
	# Built with plain string formatting: a URL is not a filesystem path, so
	# os.path.join (which applies platform path rules) is the wrong tool for it.
	SUBMISSION_URL = f"https://huggingface.co/spaces/TIGER-Lab/{SUBMISSION_NAME}"
	FILE_NAME = "results.csv"
	CSV_DIR = "./results.csv"

	# Alias (same list object) used by the filtering helpers.
	COLUMN_NAMES = MODEL_INFO

	# Markdown-formatted introduction to the MMEB benchmark (title, summary and
	# paper link). Where it is rendered is not visible in this file.
	LEADERBOARD_INTRODUCTION = """
	# MMEB Leaderboard

	## Introduction
	We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
	which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
	and evaluating embedding models across various combinations of text and image modalities.
	All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
	or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
	training, and 16 out-of-distribution datasets, reserved for evaluation.

	The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160.
	"""

	# Text shown with the results table; currently an empty string.
	TABLE_INTRODUCTION = """"""

	# Markdown-formatted summary of the four meta-task categories and the
	# in-distribution (IND) / out-of-distribution (OOD) datasets in each.
	# The "\n" sequences are ordinary escapes (the literal is not a raw string),
	# so each one adds an extra line break to the text.
	LEADERBOARD_INFO = """
	## Dataset Summary
	MMEB is organized into four primary meta-task categories:
	- Classification: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
	consist of instructions and images, optionally accompanied by related text. Targets are class labels,
	and the number of class labels corresponds to the number of classes in the dataset. \n
	- IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
	- OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
	- Visual Question Answering: This category includes 6 in-distribution and 4 out-of-distribution
	datasets. The query consists of an instruction, an image, and a piece of text as the question, while
	the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
	- IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
	- OOD: ScienceQA, VizWiz, GQA, TextVQA \n
	- Information Retrieval: This category contains 8 in-distribution and 4 out-of-distribution datasets.
	Both the query and target sides can involve a combination of text, images, and instructions. Similar
	to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
	- IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
	- OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
	- Visual Grounding: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
	- IND: MSCOCO \n
	- OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
	"""

	# Label for the citation widget and the citation text itself (the text is
	# currently an empty string).
	CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
	CITATION_BUTTON_TEXT = """"""

	# Markdown-formatted instructions for submitting results.
	# NOTE(review): the JSON example below lists per-question fields
	# ("question_id", "pred", ...), whereas add_new_eval in this module reads the
	# keys "Model", "Overall", "Model Size(B)", "IND", "OOD" plus one key per
	# task. Confirm which format submitters are actually expected to upload.
	SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction

	## ⚠ Please note that you need to submit the JSON file with the following format:
	```json
	[
	{
	"question_id": 123,
	"question": "abc",
	"options": ["abc", "xyz", ...],
	"answer": "ABC",
	"answer_index": 1,
	"category": "abc",
	"pred": "B",
	"model_outputs": ""
	}, ...
	]
	```
	...
	"""

	def get_df():
	    """Fetch the leaderboard CSV from the Space and return it as a DataFrame.

	    The downloaded table is written to ``CSV_DIR`` (refreshing the local
	    copy), then the "Model Size(B)" column is normalised with
	    ``process_model_size`` and the rows are sorted by "Overall", highest
	    first.

	    Returns:
	        pandas.DataFrame: the leaderboard sorted by descending "Overall".

	    Raises:
	        SystemExit: if the server does not answer with HTTP 200.
	        requests.exceptions.RequestException: on connection failures or when
	            the request exceeds the 30 second timeout.
	    """
	    # fetch the leaderboard data
	    url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
	    # Only authenticate when a token is configured; otherwise the header
	    # would carry the literal text "Bearer None".
	    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
	    # Without a timeout, requests waits indefinitely on an unresponsive server.
	    response = requests.get(url, headers=headers, timeout=30)
	    if response.status_code != 200:
	        # Equivalent to sys.exit(message): callers see the same exception type.
	        raise SystemExit(f"Error: {response.status_code}")

	    df = pd.read_csv(io.StringIO(response.text))
	    df.to_csv(CSV_DIR, index=False)  # update local file
	    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
	    return df.sort_values(by=['Overall'], ascending=False)


	def add_new_eval(input_file):
	    """Validate an uploaded result and append it to the leaderboard CSV.

	    Args:
	        input_file: JSON text (anything ``json.loads`` accepts) describing
	            one model. It must contain "Model", "Overall", "Model Size(B)",
	            "IND", "OOD" and one entry per name in ``TASKS``.

	    Returns:
	        An error message string when the input is ``None`` or a required key
	        is missing; otherwise ``None`` (the outcome is only printed).
	    """
	    if input_file is None:
	        return "Error! Empty file!"

	    # Load the input json file
	    upload_data = json.loads(input_file)
	    print("upload_data:\n", upload_data)

	    # Validate the model name the same way as every other field instead of
	    # letting a missing key escape as a KeyError.
	    if "Model" not in upload_data:
	        return "Error! Missing Model column!"

	    # NOTE(review): the row is written as Model, Overall, Model Size(B), IND,
	    # OOD, <tasks> and carries no "Data Source" value, while COLUMN_NAMES
	    # lists the columns in a different order. Verify against the CSV header.
	    data_row = [f'{upload_data["Model"]}']
	    for col in ['Overall', 'Model Size(B)', 'IND', 'OOD'] + TASKS:
	        if col not in upload_data:
	            return f"Error! Missing {col} column!"
	        data_row.append(upload_data[col])
	    print("data_row:\n", data_row)

	    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
	                                 use_auth_token=HF_TOKEN, repo_type="space")
	    submission_repo.git_pull()

	    # NOTE(review): the repository is cloned into SUBMISSION_NAME, but the
	    # CSV is read from and appended to CSV_DIR. Confirm that CSV_DIR points
	    # inside the clone; otherwise push_to_hub has no change to publish.

	    # Track submitted models (first cell of every row, header included).
	    # Blank lines come back from csv.reader as [] and are skipped, since
	    # row[0] would raise IndexError on them.
	    with open(CSV_DIR, mode='r', newline='') as file:
	        already_submitted = {row[0] for row in csv.reader(file, delimiter=',') if row}

	    # if not in the existing models list, add it to the csv file
	    if data_row[0] not in already_submitted:
	        with open(CSV_DIR, mode='a', newline='') as file:
	            csv.writer(file).writerow(data_row)

	        submission_repo.push_to_hub()
	        print('Submission Successful')
	    else:
	        print('The model already exists in the leaderboard!')

	def refresh_data():
	    """Re-fetch the leaderboard via ``get_df`` and keep only ``COLUMN_NAMES``."""
	    return get_df()[COLUMN_NAMES]


	def search_and_filter_models(df, query, min_size, max_size):
	    """Filter the leaderboard by model name and by model size.

	    Args:
	        df: DataFrame containing at least the ``COLUMN_NAMES`` columns.
	        query: text matched case-insensitively against the "Models" column;
	            a falsy value disables the name filter.
	        min_size: lower bound (inclusive) for "Model Size(B)".
	        max_size: upper bound (inclusive) for "Model Size(B)".

	    Returns:
	        A new DataFrame with the rows passing both filters, restricted to
	        ``COLUMN_NAMES``. It is empty, but keeps those columns, when nothing
	        matches.
	    """
	    filtered_df = df.copy()

	    if query:
	        # NOTE(review): str.contains interprets `query` as a regular
	        # expression by default, so input such as "(" raises re.error.
	        # Pass regex=False if plain substring search is what is wanted.
	        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]

	    def size_in_range(size):
	        # Rows whose size is the 'unknown' placeholder are treated as 1000.0.
	        if size == 'unknown':
	            return min_size <= 1000.0 <= max_size
	        return min_size <= size <= max_size

	    # Force a boolean dtype. On an empty frame, apply() hands back an empty
	    # Series with the column's own (object/float) dtype; pandas would then
	    # read it as a list of column labels instead of a row mask, drop every
	    # column, and the COLUMN_NAMES selection below would raise KeyError.
	    size_mask = filtered_df['Model Size(B)'].apply(size_in_range).astype(bool)

	    filtered_df = filtered_df[size_mask]

	    return filtered_df[COLUMN_NAMES]


	# def search_and_filter_models(df, query, min_size, max_size):
	# filtered_df = df.copy()

	# if query:
	# filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]

	# def size_filter(x):
	# if isinstance(x, (int, float)):
	# return min_size <= x <= max_size
	# return True

	# filtered_df = filtered_df[filtered_df['Model Size(B)'].apply(size_filter)]

	# return filtered_df[COLUMN_NAMES]


	def search_models(df, query):
	    """Return the rows of ``df`` whose "Models" value matches ``query``.

	    Matching is case-insensitive and missing names never match. A falsy
	    ``query`` returns ``df`` itself, unfiltered.
	    """
	    if not query:
	        return df
	    name_matches = df['Models'].str.contains(query, case=False, na=False)
	    return df[name_matches]


	# def get_size_range(df):
	# numeric_sizes = df[df['Model Size(B)'].apply(lambda x: isinstance(x, (int, float)))]['Model Size(B)']
	# if len(numeric_sizes) > 0:
	# return float(numeric_sizes.min()), float(numeric_sizes.max())
	# return 0, 1000


	def get_size_range(df):
	    """Return ``(min, max)`` of the "Model Size(B)" column as floats.

	    The 'unknown' placeholder counts as 0.0. When every resulting value is
	    0.0 (which includes an empty column) the fixed range ``(0.0, 1000.0)``
	    is returned instead.
	    """
	    def _placeholder_to_zero(value):
	        return 0.0 if value == 'unknown' else value

	    numeric_sizes = df['Model Size(B)'].apply(_placeholder_to_zero)
	    nothing_but_zeros = (numeric_sizes == 0.0).all()
	    if nothing_but_zeros:
	        return 0.0, 1000.0

	    smallest = float(numeric_sizes.min())
	    largest = float(numeric_sizes.max())
	    return smallest, largest


	def process_model_size(size):
	    """Normalise one raw model-size cell.

	    Returns the value converted to ``float`` when that is possible, and the
	    string 'unknown' when the value is missing, equals 'unk', or cannot be
	    converted.
	    """
	    if pd.isna(size) or size == 'unk':
	        return 'unknown'

	    try:
	        numeric_size = float(size)
	    except (ValueError, TypeError):
	        return 'unknown'
	    return numeric_size


	def filter_columns_by_tasks(df, selected_tasks=None):
	    """Restrict ``df`` to the identifying columns plus the chosen task columns.

	    With no tasks selected (``None`` or empty) the full ``COLUMN_NAMES`` view
	    is returned. Otherwise the result holds 'Models', 'Model Size(B)',
	    'Data Source' and 'Overall' followed by the selected tasks, skipping any
	    name that is not a column of ``df``.
	    """
	    nothing_selected = selected_tasks is None or len(selected_tasks) == 0
	    if nothing_selected:
	        return df[COLUMN_NAMES]

	    wanted = ['Models', 'Model Size(B)', 'Data Source', 'Overall'] + selected_tasks
	    present = [name for name in wanted if name in df.columns]
	    return df[present]

	def get_task_choices():
	    """Return the module-level ``TASKS`` list of selectable task categories."""
	    return TASKS