Files changed (3)
  1. app.py +19 -20
  2. results.csv +1 -1
  3. utils.py +66 -29
app.py CHANGED
@@ -2,12 +2,11 @@ from utils import *
 
 global data_component
 
-def update_table(query, min_size, max_size, selected_subjects=None):
+def update_table(query, min_size, max_size, selected_tasks=None):
     df = get_df()
     filtered_df = search_and_filter_models(df, query, min_size, max_size)
-    if selected_subjects and len(selected_subjects) > 0:
-        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
-        selected_columns = base_columns + selected_subjects
+    if selected_tasks and len(selected_tasks) > 0:
+        selected_columns = BASE_COLS + selected_tasks
         filtered_df = filtered_df[selected_columns]
     return filtered_df
 
@@ -53,13 +52,13 @@ with gr.Blocks() as block:
         label="Maximum number of parameters (B)",
     )
 
-    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'Overall', 'IND', 'OOD']]
+    task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
     with gr.Row():
-        subjects_select = gr.CheckboxGroup(
-            choices=subject_choices,
-            value=subject_choices,
-            label="Select Subjects to Display",
-            elem_id="subjects-select"
+        tasks_select = gr.CheckboxGroup(
+            choices=task_choices,
+            value=task_choices,
+            label="Select tasks to Display",
+            elem_id="tasks-select"
         )
 
     data_component = gr.components.Dataframe(
@@ -73,27 +72,27 @@ with gr.Blocks() as block:
 
     refresh_button = gr.Button("Refresh")
 
-    def update_with_subjects(*args):
+    def update_with_tasks(*args):
        return update_table(*args)
 
     search_bar.change(
-        fn=update_with_subjects,
-        inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
+        fn=update_with_tasks,
+        inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
         outputs=data_component
     )
     min_size_slider.change(
-        fn=update_with_subjects,
-        inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
+        fn=update_with_tasks,
+        inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
         outputs=data_component
     )
     max_size_slider.change(
-        fn=update_with_subjects,
-        inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
+        fn=update_with_tasks,
+        inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
         outputs=data_component
     )
-    subjects_select.change(
-        fn=update_with_subjects,
-        inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
+    tasks_select.change(
+        fn=update_with_tasks,
+        inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
         outputs=data_component
     )
     refresh_button.click(fn=refresh_data, outputs=data_component)
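For reference, a minimal, self-contained sketch of the column slicing that `update_table` now delegates to `BASE_COLS`. The sample row, scores, and task selection below are invented; `BASE_COLS` mirrors the definition added to utils.py, and the Overall/IND/OOD columns are assumed from the old hard-coded exclusion list.

```python
# Sketch only: demonstrates the BASE_COLS + selected_tasks slicing that the
# new update_table() applies when some tasks are checked in the UI.
import pandas as pd

MODEL_INFO = ["Models", "Model Size(B)", "Data Source", "Overall", "IND", "OOD",
              "Classification", "VQA", "Retrieval", "Grounding"]   # assumed order
TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]        # as in utils.py

# One invented row standing in for the leaderboard dataframe.
df = pd.DataFrame([["Example-Model", 4.2, "demo", 61.3, 65.0, 56.7,
                    63.1, 58.4, 62.0, 71.5]], columns=MODEL_INFO)

selected_tasks = ["VQA", "Retrieval"]            # what tasks_select would pass in
if selected_tasks and len(selected_tasks) > 0:   # same guard as update_table()
    df = df[BASE_COLS + selected_tasks]

print(df.columns.tolist())
# ['Models', 'Model Size(B)', 'Data Source', 'Overall', 'IND', 'OOD', 'VQA', 'Retrieval']
```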
results.csv CHANGED
@@ -12,4 +12,4 @@ OpenCLIP-FFT,unk,unk,47.2,50.5,43.1,56.0,21.9,55.4,64.1
 VLM2Vec (Phi-3.5-V-FFT),unk,TIGER-Lab,55.9,62.8,47.4,52.8,50.3,57.8,72.3
 VLM2Vec (Phi-3.5-V-LoRA),unk,TIGER-Lab,60.1,66.5,52.0,54.8,54.9,62.3,79.5
 VLM2Vec (LLaVA-1.6-LoRA-LowRes),unk,TIGER-Lab,55.0,61.0,47.5,54.7,50.3,56.2,64.0
-VLM2Vec (LLaVA-1.6-LoRA-HighRes),unk,TIGER-Lab,62.9,67.5,57.1,61.2,49.9,67.4,86.1
+VLM2Vec (LLaVA-1.6-LoRA-HighRes),unk,TIGER-Lab,62.9,67.5,57.1,61.2,49.9,67.4,86.1
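The removed and re-added rows are identical, so this change is whitespace-only (most likely restoring the trailing newline on the final row). Every model size in these rows is `unk`, which is exactly the case the reworked `get_size_range` in utils.py guards against. Below is a minimal sketch of that interaction; `process_model_size` is assumed to map non-numeric sizes like `"unk"` to `'unknown'`, since only its final `return 'unknown'` appears in this diff.

```python
# Minimal sketch of how unknown sizes flow through the slider-range logic.
import pandas as pd

def process_model_size(size):
    # assumed behavior: numeric sizes pass through, anything else -> 'unknown'
    try:
        return float(size)
    except (TypeError, ValueError):
        return 'unknown'

def get_size_range(df):
    # mirrors the updated utils.get_size_range
    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
    if (sizes == 0.0).all():
        return 0.0, 1000.0          # fallback when every size is unknown
    return float(sizes.min()), float(sizes.max())

df = pd.DataFrame({'Model Size(B)': ['unk', 'unk', 7.8]})
df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
print(get_size_range(df))           # (0.0, 7.8)
```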
utils.py CHANGED
@@ -3,12 +3,14 @@ import gradio as gr
 import csv
 import json
 import os
+import requests
+import io
 import shutil
 from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-SUBJECTS = ["Classification", "VQA", "Retrieval", "Grounding"]
+TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
 
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
@@ -16,27 +18,54 @@ MODEL_INFO = [
     "Classification", "VQA", "Retrieval", "Grounding"
 ]
 
+BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
+
 DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 
-# TODO: submission process not implemented yet
-SUBMISSION_NAME = ""
-SUBMISSION_URL = ""
-CSV_DIR = "results.csv"  # TODO: Temporary file, to be updated with the actual file
+SUBMISSION_NAME = "MMEB"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/TIGER-Lab/", SUBMISSION_NAME)
+FILE_NAME = "results.csv"
+CSV_DIR = "./results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-LEADERBOARD_INTRODUCTION = """# MMEB Leaderboard
+LEADERBOARD_INTRODUCTION = """
+# MMEB Leaderboard
 
 ## Introduction
-We introduce MMEB, a benchmark for multimodal evaluation of models. The benchmark consists of four tasks: Classification, VQA, Retrieval, and Grounding. Models are evaluated based on 36 datasets.
-
-
+We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
+which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
+and evaluating embedding models across various combinations of text and image modalities.
+All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
+or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
+training, and 16 out-of-distribution datasets, reserved for evaluation.
+
+The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160.
 """
 
 TABLE_INTRODUCTION = """"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
+MMEB is organized into four primary meta-task categories:
+- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
+consist of instructions and images, optionally accompanied by related text. Targets are class labels,
+and the number of class labels corresponds to the number of classes in the dataset. \n
+  - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
+  - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
+- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
+datasets. The query consists of an instruction, an image, and a piece of text as the question, while
+the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
+  - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
+  - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
+- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
+Both the query and target sides can involve a combination of text, images, and instructions. Similar
+to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
+  - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
+  - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
+- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
+  - IND: MSCOCO \n
+  - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -63,46 +92,52 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 """
 
 def get_df():
-    # TODO: Update this after the hf dataset has been created!
-    # repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
-    # repo.git_pull()
-    df = pd.read_csv(CSV_DIR)
+    # fetch the leaderboard data
+    url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
+    response = requests.get(url, headers={"Authorization": f"Bearer {HF_TOKEN}"})
+    if response.status_code != 200:
+        import sys
+        sys.exit(f"Error: {response.status_code}")
+    df = pd.read_csv(io.StringIO(response.text))
+    df.to_csv(CSV_DIR, index=False)  # update local file
     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
     df = df.sort_values(by=['Overall'], ascending=False)
     return df
 
 
-def add_new_eval(
-    input_file,
-):
+def add_new_eval(input_file):
     if input_file is None:
         return "Error! Empty file!"
 
+    # Load the input json file
     upload_data = json.loads(input_file)
     print("upload_data:\n", upload_data)
-    data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
+    data_row = [f'{upload_data["Model"]}']
+    for col in ['Overall', 'Model Size(B)', 'IND', 'OOD'] + TASKS:
+        if not col in upload_data.keys():
+            return f"Error! Missing {col} column!"
+        data_row += [upload_data[col]]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
-                                 use_auth_token=HF_TOKEN, repo_type="dataset")
+                                 use_auth_token=HF_TOKEN, repo_type="space")
    submission_repo.git_pull()
 
+    # Track submitted models
    already_submitted = []
    with open(CSV_DIR, mode='r') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            already_submitted.append(row[0])
-
+    # if not in the existing models list, add it to the csv file
    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data_row)
-
+
        submission_repo.push_to_hub()
        print('Submission Successful')
    else:
-        print('The entry already exists')
+        print('The model already exists in the leaderboard!')
 
 def refresh_data():
     df = get_df()
@@ -154,7 +189,9 @@ def search_models(df, query):
 
 
 def get_size_range(df):
-    sizes = df['Model Size(B)'].apply(lambda x: 1000.0 if x == 'unknown' else x)
+    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
+    if (sizes == 0.0).all():
+        return 0.0, 1000.0
     return float(sizes.min()), float(sizes.max())
 
 
@@ -168,16 +205,16 @@ def process_model_size(size):
     return 'unknown'
 
 
-def filter_columns_by_subjects(df, selected_subjects=None):
-    if selected_subjects is None or len(selected_subjects) == 0:
+def filter_columns_by_tasks(df, selected_tasks=None):
+    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
 
    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
-    selected_columns = base_columns + selected_subjects
+    selected_columns = base_columns + selected_tasks
 
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
 
-def get_subject_choices():
-    return SUBJECTS
+def get_task_choices():
+    return TASKS
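To make the stricter validation in the new `add_new_eval` concrete, here is an illustrative submission payload. The model name and every score are invented; the field names simply mirror the columns the validation loop checks (`Overall`, `Model Size(B)`, `IND`, `OOD`, plus the four entries in `TASKS`).

```python
# Invented example of the JSON a submitter would upload: add_new_eval()
# json.loads() this string, checks that every required field is present,
# and appends the row to results.csv only if the model is not already listed.
import json

example_submission = {
    "Model": "My-VLM-Embedder",   # hypothetical model name
    "Overall": 61.3,
    "Model Size(B)": 4.2,
    "IND": 65.0,
    "OOD": 56.7,
    "Classification": 63.1,
    "VQA": 58.4,
    "Retrieval": 62.0,
    "Grounding": 71.5,
}

payload = json.dumps(example_submission)
# add_new_eval(payload)  # would clone/pull the Space repo and push the new row
```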