tttoaster commited on
Commit
54f3112
·
1 Parent(s): dd6990d

Upload 9 files

Browse files
app.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+ import json
7
+ import pdb
8
+ import tempfile
9
+
10
+ from constants import *
11
+ from src.auto_leaderboard.model_metadata_type import ModelType
12
+
13
+ global data_component, filter_component
14
+
15
+
16
def upload_file(files):
    """Collect the ``name`` of every uploaded file object.

    Args:
        files: Iterable of objects exposing a ``name`` attribute
            (presumably Gradio temp-file wrappers whose ``name`` is a
            path -- confirm against the caller). ``None`` or an empty
            iterable means nothing was uploaded.

    Returns:
        A list with each object's ``name``, in input order; ``[]`` when
        nothing was uploaded.
    """
    # Guard against the "no upload" case instead of failing on iteration.
    if not files:
        return []
    return [file.name for file in files]
19
+
20
def prediction_analyse(prediction_content, ground_truth_path="./file/SEED-Bench.json"):
    """Score a JSON-lines prediction dump against the ground-truth file.

    Args:
        prediction_content: Text holding one JSON object per line. Each
            object is expected to carry ``question_id`` and ``prediction``.
        ground_truth_path: Path of a JSON file whose ``"questions"`` list
            contains items with ``question_id``, ``question_type_id`` and
            ``answer``. Defaults to the bundled benchmark file.

    Returns:
        A dict mapping each question type id in 1..12 to
        ``{"correct": int, "total": int}``.

    Blank lines are ignored. Lines that are not valid JSON, lack the
    required fields, or reference a question / question type that is not
    known are skipped with a printed warning instead of aborting the run.
    """
    # Load the ground truth and index it by question_id.
    with open(ground_truth_path, "r", encoding="utf-8") as file:
        ground_truth_data = json.load(file)["questions"]
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # One counter pair per question type id.
    results = {i: {"correct": 0, "total": 0} for i in range(1, 13)}

    for line in prediction_content.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            prediction = json.loads(line)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {line}")
            continue

        try:
            answer = prediction["prediction"]
            gt_item = ground_truth[prediction["question_id"]]
            counters = results[gt_item["question_type_id"]]
        except (KeyError, TypeError):
            # KeyError: missing field, unknown question or question type.
            # TypeError: the line parsed to a non-object JSON value, or the
            # question_id is unhashable.
            print(f"Warning: Skipping prediction with missing fields or unknown question: {line}")
            continue

        if answer == gt_item["answer"]:
            counters["correct"] += 1
        counters["total"] += 1

    return results
55
+
56
def _accuracy(correct, total):
    """Return correct/total as a percentage rounded to one decimal; 0 when total is 0."""
    if not total:
        return 0
    return round(correct / total * 100, 1)


def _display_name(model_cell):
    """Return the visible name of a ``Model`` cell: ``[name](url)`` -> ``name``, bare names unchanged."""
    if model_cell.startswith('[') and ']' in model_cell:
        return model_cell[1:model_cell.index(']')]
    return model_cell


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_type: str,
    model_link: str,
    LLM_type: str,
    LLM_name_textbox: str,
    Evaluation_dimension: str,
):
    """Score an uploaded prediction file and write the row to the leaderboard CSV.

    Args:
        input_file: Raw bytes of the uploaded file (decoded as UTF-8), or
            ``None`` when nothing was uploaded.
        model_name_textbox: Name used when a new row is added.
        revision_name_textbox: When non-empty, the name of an existing row
            to overwrite; if no row carries that name a new row is appended.
        model_type: Value stored in the first column.
        model_link: Optional URL; when non-empty the stored model name
            becomes a markdown link ``[name](url)``.
        LLM_type: Language-model choice; ``'Other'`` selects
            ``LLM_name_textbox`` instead.
        LLM_name_textbox: Free-text language-model name.
        Evaluation_dimension: ``'All'``, ``'Image'`` or ``'Video'``. Task
            ids 1-9 count as image tasks and 10-12 as video tasks; tasks
            and averages outside the chosen dimension are stored as 0.

    Returns:
        ``"Error! Empty file!"`` when ``input_file`` is ``None``; otherwise
        ``0`` after the CSV at ``CSV_DIR`` has been rewritten.
    """
    if input_file is None:
        return "Error! Empty file!"

    content = input_file.decode("utf-8")
    prediction = prediction_analyse(content)
    csv_data = pd.read_csv(CSV_DIR)

    Start_dimension, End_dimension = 1, 13
    if Evaluation_dimension == 'Image':
        End_dimension = 10
    elif Evaluation_dimension == 'Video':
        Start_dimension = 10
    # _accuracy guards against dimensions with no scored predictions, which
    # previously raised ZeroDivisionError.
    each_task_accuracy = {
        i: _accuracy(prediction[i]["correct"], prediction[i]["total"])
        if Start_dimension <= i < End_dimension
        else 0
        for i in range(1, 13)
    }

    # Totals for the image / video / overall averages.
    total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
    total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
    total_image = sum(prediction[i]["total"] for i in range(1, 10))
    total_video = sum(prediction[i]["total"] for i in range(10, 13))

    if Evaluation_dimension != 'Video':
        average_accuracy_image = _accuracy(total_correct_image, total_image)
    else:
        average_accuracy_image = 0

    if Evaluation_dimension != 'Image':
        average_accuracy_video = _accuracy(total_correct_video, total_video)
    else:
        average_accuracy_video = 0

    if Evaluation_dimension == 'All':
        overall_accuracy = _accuracy(
            total_correct_image + total_correct_video, total_image + total_video
        )
    else:
        overall_accuracy = 0

    LLM_name = LLM_name_textbox if LLM_type == 'Other' else LLM_type

    # Choose the target row: append by default, overwrite when the revision
    # name matches an existing entry (first match wins).
    if revision_name_textbox == '':
        col = csv_data.shape[0]
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        name_list = [_display_name(name) for name in csv_data['Model']]
        if revision_name_textbox in name_list:
            col = name_list.index(revision_name_textbox)
        else:
            col = csv_data.shape[0]

    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # Row layout: three info columns, nine image tasks, image average,
    # three video tasks, video average, overall average.
    new_data = [
        model_type,
        model_name,
        LLM_name,
        *(each_task_accuracy[i] for i in range(1, 10)),
        average_accuracy_image,
        *(each_task_accuracy[i] for i in range(10, 13)),
        average_accuracy_video,
        overall_accuracy,
    ]
    csv_data.loc[col] = new_data
    csv_data.to_csv(CSV_DIR, index=False)
    return 0
148
+
149
def get_baseline_df():
    """Read the CSV at ``CSV_DIR`` and return it as a new DataFrame."""
    return pd.read_csv(CSV_DIR)
152
+
153
+ block = gr.Blocks()
154
+
155
+
156
+ with block:
157
+ gr.Markdown(
158
+ LEADERBORAD_INTRODUCTION
159
+ )
160
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
161
+ with gr.TabItem("🏅 SEED Benchmark", elem_id="seed-benchmark-tab-table", id=0):
162
+ with gr.Row():
163
+ with gr.Accordion("Citation", open=False):
164
+ citation_button = gr.Textbox(
165
+ value=CITATION_BUTTON_TEXT,
166
+ label=CITATION_BUTTON_LABEL,
167
+ elem_id="citation-button",
168
+ ).style(show_copy_button=True)
169
+
170
+ gr.Markdown(
171
+ TABLE_INTRODUCTION
172
+ )
173
+
174
+ # selection for column part:
175
+ checkbox_group = gr.CheckboxGroup(
176
+ choices=TASK_INFO,
177
+ value=TASK_INFO,
178
+ label="Select options",
179
+ interactive=True,
180
+ )
181
+
182
+ # Create the dataframe component
183
+ data_component = gr.components.Dataframe(
184
+ value=get_baseline_df,
185
+ headers=COLUMN_NAMES,
186
+ type="pandas",
187
+ datatype=DATA_TITILE_TYPE,
188
+ interactive=False,
189
+ visible=True,
190
+ )
191
+
192
def on_checkbox_group_change(selected_columns):
    """Return the table value restricted to the ticked task columns.

    Args:
        selected_columns: Task column names currently selected in the
            checkbox group.

    Returns:
        The ``value`` of a freshly built Dataframe component holding the
        ``MODEL_INFO`` columns followed by the selected task columns.
    """
    # Iterate TASK_INFO rather than the selection so the column order
    # follows TASK_INFO regardless of click order.
    chosen_tasks = [task for task in TASK_INFO if task in selected_columns]
    visible_headers = MODEL_INFO + chosen_tasks
    visible_rows = get_baseline_df()[visible_headers]
    visible_types = [
        DATA_TITILE_TYPE[COLUMN_NAMES.index(header)] for header in visible_headers
    ]

    # NOTE(review): a throwaway component is built only to read back its
    # ``value`` -- presumably so the data is processed with the per-column
    # datatypes; confirm before simplifying.
    filtered_table = gr.components.Dataframe(
        value=visible_rows,
        headers=visible_headers,
        type="pandas",
        datatype=visible_types,
        interactive=False,
        visible=True,
    )
    return filtered_table.value
211
+
212
+ # Bind the checkbox group to its change handler
213
+ checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
214
+
215
+ # table 2
216
+ with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
217
+ gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
218
+
219
+ # table 3
220
+ with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
221
+ gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
222
+
223
+ with gr.Row():
224
+ gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
225
+
226
+ with gr.Row():
227
+ gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
228
+
229
+ with gr.Row():
230
+ with gr.Column():
231
+ model_name_textbox = gr.Textbox(
232
+ label="Model name", placeholder="LLaMA-7B"
233
+ )
234
+ revision_name_textbox = gr.Textbox(
235
+ label="Revision Model Name", placeholder="LLaMA-7B"
236
+ )
237
+ model_type = gr.Dropdown(
238
+ choices=[
239
+ "LLM",
240
+ "ImageLLM",
241
+ "VideoLLM",
242
+ "Other",
243
+ ],
244
+ label="Model type",
245
+ multiselect=False,
246
+ value="ImageLLM",
247
+ interactive=True,
248
+ )
249
+ model_link = gr.Textbox(
250
+ label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
251
+ )
252
+
253
+ with gr.Column():
254
+
255
+ LLM_type = gr.Dropdown(
256
+ choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
257
+ label="LLM type",
258
+ multiselect=False,
259
+ value="LLaMA-7B",
260
+ interactive=True,
261
+ )
262
+ LLM_name_textbox = gr.Textbox(
263
+ label="LLM model (for Other)",
264
+ placeholder="LLaMA-13B"
265
+ )
266
+ Evaluation_dimension = gr.Dropdown(
267
+ choices=["All", "Image", "Video"],
268
+ label="Evaluation dimension",
269
+ multiselect=False,
270
+ value="All",
271
+ interactive=True,
272
+ )
273
+
274
+ with gr.Column():
275
+
276
+ input_file = gr.inputs.File(label = "Click to Upload a json File", file_count="single", type='binary')
277
+ submit_button = gr.Button("Submit Eval")
278
+
279
+ submission_result = gr.Markdown()
280
+ submit_button.click(
281
+ add_new_eval,
282
+ inputs = [
283
+ input_file,
284
+ model_name_textbox,
285
+ revision_name_textbox,
286
+ model_type,
287
+ model_link,
288
+ LLM_type,
289
+ LLM_name_textbox,
290
+ Evaluation_dimension,
291
+ ],
292
+ # outputs = submission_result,
293
+ )
294
+
295
+
296
+ with gr.Row():
297
+ data_run = gr.Button("Refresh")
298
+ data_run.click(
299
+ get_baseline_df, outputs=data_component
300
+ )
301
+
302
+ # block.load(get_baseline_df, outputs=data_title)
303
+
304
+ block.launch()
constants.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this is .py for store constants
2
+ MODEL_INFO = ["Model Type", "Model", "Language Model"]
3
+ TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"]
4
+ AVG_INFO = ["Avg. Img", "Avg. Video", "Avg. All"]
5
+ DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
6
+ CSV_DIR = "./file/result.csv"
7
+
8
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO
9
+ DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
10
+
11
+ UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\
12
+ LLM & Vicuna & Vicuna-7B &23.4 &30.7 &29.7 &30.9 &30.8 &28.6 &29.8 &18.5 &13.4 &27.3 &34.5 &23.8 \\
13
+ LLM & LLaMA & LLaMA-7B &26.3 &27.4 &26.2 &28.3 &25.1 &28.8 &19.2 &37.0 & 9.0 &33.0 &23.1 &26.2 \\
14
+ ImageLLM & BLIP-2 & Flan-T5-XL &59.1 &53.9 &49.2 &42.3 &43.2 &36.7 &55.7 &45.6 &25.9 &32.6 &47.5 &24.0 \\
15
+ ImageLLM & InstructBLIP & Flan-T5-XL &60.3 &58.5 &63.4 &40.6 &58.4 &38.7 &51.6 &45.9 &25.9 &33.1 &49.1 &27.1 \\
16
+ ImageLLM & InstructBLIP-Vicuna & Vicuna-7B &60.2 &58.9 &65.6 &43.6 &57.2 &40.3 &52.6 &47.7 &43.5 &34.5 &49.6 &23.1 \\
17
+ ImageLLM & LLaVA & LLaMA-7B &42.7 &34.9 &33.5 &28.4 &41.9 &30.8 &27.8 &46.8 &27.7 &29.7 &21.4 &19.1 \\
18
+ ImageLLM & MiniGPT-4 & Flan-T5-XL &56.3 &49.2 &45.8 &37.9 &45.3 &32.6 &47.4 &57.1 &11.8 &38.2 &24.5 &27.1 \\
19
+ ImageLLM & VPGTrans & LLaMA-7B &51.9 &44.1 &39.9 &36.1 &33.7 &36.4 &32.0 &53.2 &30.6 &39.5 &24.3 &31.9 \\
20
+ ImageLLM & MultiModal-GPT & LLaMA-7B &43.6 &37.9 &31.5 &30.8 &27.3 &30.1 &29.9 &51.4 &18.8 &36.9 &25.8 &24.0 \\
21
+ ImageLLM & Otter & LLaMA-7B &44.9 &38.6 &32.2 &30.9 &26.3 &31.8 &32.0 &51.4 &31.8 &37.9 &27.2 &24.8 \\
22
+ ImageLLM & OpenFlamingo & LLaMA-7B &43.9 &38.1 &31.3 &30.1 &27.3 &30.6 &29.9 &50.2 &20.0 &37.2 &25.4 &24.2 \\
23
+ ImageLLM & LLaMA-Adapter V2 & LLaMA-7B &45.2 &38.5 &29.3 &33.0 &29.7 &35.5 &39.2 &52.0 &24.7 &38.6 &18.5 &19.6 \\
24
+ ImageLLM & GVT & Vicuna-7B &41.7 &35.5 &31.8 &29.5 &36.2 &32.0 &32.0 &51.1 &27.1 &33.9 &25.4 &23.0 \\
25
+ ImageLLM & mPLUG-Owl & LLaMA-7B &49.7 &45.3 &32.5 &36.7 &27.3 &32.7 &44.3 &54.7 &28.8 &26.7 &17.9 &26.5 \\
26
+ VideoLLM & VideoChat & Vicuna-7B &47.1 &43.8 &34.9 &40.0 &32.8 &34.6 &42.3 &50.5 &17.7 &34.9 &36.4 &27.3 \\
27
+ VideoLLM & Video-ChatGPT & LLaMA-7B &37.2 &31.4 &33.2 &28.4 &35.5 &29.5 &23.7 &42.3 &25.9 &27.6 &21.3 &21.1 \\
28
+ VideoLLM & Valley & LLaMA-13B &39.3 &32.9 &31.6 &27.9 &24.2 &30.1 &27.8 &43.8 &11.8 &31.3 &23.2 &20.7 \\'''
29
+
30
+
31
+ LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
32
+
33
+ Welcome to the leaderboard of the SEED-Bench! 🏆
34
+ This is a community where participants create multimodal language models and action generation algorithms to generate API function calls based on goals described in natural language!
35
+ Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details.
36
+ """
37
+
38
+ SUBMIT_INTRODUCTION = """# Submit Precautions
39
+ 1. Obtain the json file from our [GitHub repository](https://github.com/AILab-CVC/SEED-Bench)
40
+ 2. If you want to revise an existing model entry, please ensure 'Revision Model Name' matches the name shown in the leaderboard.
41
+ 3. Please provide the correct link for each submission. Everyone can reach the model's repository through the model name in the leaderboard.
42
+ 4. If you don't evaluate all dimensions, the performance of each unevaluated dimension and its corresponding average will be set to 0.
43
+ """
44
+
45
+ TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
46
+ We use accuracy (%) as the primary evaluation metric for each task.
47
+ """
48
+
49
+ LEADERBORAD_INFO = """
50
+ Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
51
+ In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
52
+ SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
53
+ We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
54
+ Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
55
+ We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
56
+ By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
57
+ """
58
+
59
+
60
+
61
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
62
+ CITATION_BUTTON_TEXT = r"""@article{li2023seed,
63
+ title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
64
+ author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
65
+ journal={arXiv preprint arXiv:2307.16125},
66
+ year={2023}
67
+ }"""
file/SEED-Bench.json ADDED
The diff for this file is too large to render. See raw diff
 
file/result.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model Type,Model,Language Model,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Avg. Img,Action Recognition,Action Prediction,Procedure Understanding,Avg. Video,Avg. All
2
+ LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,27.3,23.2,34.9,25.4,28.6,27.7
3
+ LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,28.2,27.3,34.5,23.8,29.5,28.5
4
+ LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,26.6,33.0,23.1,26.2,27.3,26.8
5
+ ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,49.7,32.6,47.5,24.0,36.7,46.4
6
+ ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,57.8,33.1,49.1,27.1,38.3,52.7
7
+ ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,58.8,34.5,49.6,23.1,38.1,53.4
8
+ ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,37.0,29.7,21.4,19.1,23.8,33.5
9
+ ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Flan-T5-XL,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,47.4,38.2,24.5,27.1,29.9,42.8
10
+ ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,41.8,39.5,24.3,31.9,31.4,39.1
11
+ ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,34.5,36.9,25.8,24.0,29.2,33.2
12
+ ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,35.2,37.9,27.2,24.8,30.4,33.9
13
+ ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,42.9,36.8,29.2,23.8,30.6,39.7
14
+ ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,34.5,37.2,25.4,24.2,29.3,33.1
15
+ ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.7,42.9,34.7,26.9,35.7,40.9
16
+ ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,35.2,38.6,18.5,19.6,25.8,32.7
17
+ ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,35.5,33.9,25.4,23.0,27.8,33.5
18
+ ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,37.9,26.7,17.9,26.5,23.0,34.0
19
+ ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,54.4,41.3,40.4,27.0,37.5,50.0
20
+ VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,39.0,34.9,36.4,27.3,33.7,37.6
21
+ VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,33.9,27.6,21.3,21.1,23.5,31.2
22
+ VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,32.0,31.3,23.2,20.7,25.4,30.3
requirements.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ APScheduler==3.10.1
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ certifi==2022.12.7
10
+ charset-normalizer==3.1.0
11
+ click==8.1.3
12
+ contourpy==1.0.7
13
+ cycler==0.11.0
14
+ datasets==2.12.0
15
+ entrypoints==0.4
16
+ fastapi==0.95.1
17
+ ffmpy==0.3.0
18
+ filelock==3.11.0
19
+ fonttools==4.39.3
20
+ frozenlist==1.3.3
21
+ fsspec==2023.4.0
22
+ gradio==3.27.0
23
+ gradio_client==0.1.3
24
+ h11==0.14.0
25
+ httpcore==0.17.0
26
+ httpx==0.24.0
27
+ huggingface-hub==0.13.4
28
+ idna==3.4
29
+ Jinja2==3.1.2
30
+ jsonschema==4.17.3
31
+ kiwisolver==1.4.4
32
+ linkify-it-py==2.0.0
33
+ markdown-it-py==2.2.0
34
+ MarkupSafe==2.1.2
35
+ matplotlib==3.7.1
36
+ mdit-py-plugins==0.3.3
37
+ mdurl==0.1.2
38
+ multidict==6.0.4
39
+ numpy==1.24.2
40
+ orjson==3.8.10
41
+ packaging==23.1
42
+ pandas==2.0.0
43
+ Pillow==9.5.0
44
+ plotly==5.14.1
45
+ pyarrow==11.0.0
46
+ pydantic==1.10.7
47
+ pydub==0.25.1
48
+ pyparsing==3.0.9
49
+ pyrsistent==0.19.3
50
+ python-dateutil==2.8.2
51
+ python-multipart==0.0.6
52
+ pytz==2023.3
53
+ pytz-deprecation-shim==0.1.0.post0
54
+ PyYAML==6.0
55
+ requests==2.28.2
56
+ semantic-version==2.10.0
57
+ six==1.16.0
58
+ sniffio==1.3.0
59
+ starlette==0.26.1
60
+ toolz==0.12.0
61
+ tqdm==4.65.0
62
+ transformers==4.28.1
63
+ typing_extensions==4.5.0
64
+ tzdata==2023.3
65
+ tzlocal==4.3
66
+ uc-micro-py==1.0.1
67
+ urllib3==1.26.15
68
+ uvicorn==0.21.1
69
+ websockets==11.0.1
70
+ yarl==1.8.2
src/__pycache__/utils_display.cpython-38.pyc ADDED
Binary file (4.26 kB). View file
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc ADDED
Binary file (1.22 kB). View file
 
src/auto_leaderboard/model_metadata_type.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ import glob
4
+ import json
5
+ import os
6
+ from typing import Dict, List
7
+
8
+ from ..utils_display import AutoEvalColumn
9
+
10
@dataclass(frozen=True)
class ModelInfo:
    """Display metadata for one model category.

    Frozen so instances are immutable and hashable, which lets them serve
    as Enum values without falling back to linear value lookup.
    """

    name: str
    symbol: str  # emoji


# Emoji shown for each model category; the single source of truth that
# ModelType draws its symbols from.
model_type_symbols = {
    "LLM": "🟢",
    "ImageLLM": "🔶",
    "VideoLLM": "⭕",
    "Other": "🟦",
}


class ModelType(Enum):
    """Model categories, each carrying a ``ModelInfo`` value.

    NOTE(review): the member names (PT/FT/IFT/RL) do not describe the
    categories they hold; they are kept unchanged so existing references
    keep working.
    """

    PT = ModelInfo(name="LLM", symbol=model_type_symbols["LLM"])
    FT = ModelInfo(name="ImageLLM", symbol=model_type_symbols["ImageLLM"])
    IFT = ModelInfo(name="VideoLLM", symbol=model_type_symbols["VideoLLM"])
    RL = ModelInfo(name="Other", symbol=model_type_symbols["Other"])

    def to_str(self, separator = " "):
        """Return ``<symbol><separator><name>`` for this category."""
        return f"{self.value.symbol}{separator}{self.value.name}"
30
+
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ # These classes are for user facing column names, to avoid having to change them
4
+ # all around the code when a modif is needed
5
@dataclass
class ColumnContent:
    """Description of one user-facing table column."""

    name: str  # header text shown to the user
    type: str  # datatype label, e.g. "str", "number", "markdown"
    displayed_by_default: bool
    hidden: bool = False


def fields(raw_class):
    """Return the class-level attribute values of ``raw_class`` in definition order.

    Attributes whose name starts or ends with a double underscore are
    skipped, which filters out the dunder entries Python and ``dataclass``
    add to the class ``__dict__``.
    """
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Columns of the automatic-evaluation table (class attributes, not instance fields)."""

    model_type_symbol = ColumnContent("T", "str", True)
    model = ColumnContent("Model", "markdown", True)
    average = ColumnContent("Average ⬆️", "number", True)
    arc = ColumnContent("ARC", "number", True)
    hellaswag = ColumnContent("HellaSwag", "number", True)
    mmlu = ColumnContent("MMLU", "number", True)
    truthfulqa = ColumnContent("TruthfulQA", "number", True)
    model_type = ColumnContent("Type", "str", False)
    precision = ColumnContent("Precision", "str", False, True)
    license = ColumnContent("Hub License", "str", False)
    params = ColumnContent("#Params (B)", "number", False)
    likes = ColumnContent("Hub ❤️", "number", False)
    revision = ColumnContent("Model sha", "str", False, False)
    dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)


@dataclass(frozen=True)
class EloEvalColumn:  # Elo evals column
    """Columns of the Elo-rating table."""

    model = ColumnContent("Model", "markdown", True)
    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
    human_all = ColumnContent("Human (all)", "number", True)
    human_instruct = ColumnContent("Human (instruct)", "number", True)
    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)


@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns of the evaluation-queue table."""

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    # NOTE(review): was typed "bool"; changed to "str" on the assumption that
    # precision is a textual label, consistent with AutoEvalColumn.precision.
    precision = ColumnContent("precision", "str", True)
    # The third positional argument is the boolean displayed_by_default; the
    # original passed the string "Original" there (truthy, so display
    # behaviour is unchanged by using True).
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
49
+
50
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_13B_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
STABLE_VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
# VICUNA_LINK used to be assigned twice, so the second (stable-vicuna) URL
# always won and "vicuna-13b" pointed at the wrong repository. The two URLs
# now have their own names; VICUNA_LINK keeps the value importers have
# always seen.
VICUNA_LINK = STABLE_VICUNA_LINK

# Short model names that map to a fixed page instead of a Hub path.
_SHORT_NAME_LINKS = {
    "dolly-12b": DOLLY_LINK,
    "vicuna-13b": VICUNA_13B_LINK,
    "koala-13b": KOALA_LINK,
    "oasst-12b": OASST_LINK,
}


def model_hyperlink(link, model_name):
    """Return an HTML anchor opening ``link`` in a new tab with ``model_name`` as its text.

    Neither argument is HTML-escaped.
    """
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    """Return an HTML link for ``model_name``.

    The link defaults to ``https://huggingface.co/<model_name>``. Names in
    ``LLAMAS`` and two specific HuggingFaceH4 checkpoints get a dedicated
    URL and a shortened display name; a handful of short names get a
    dedicated URL and keep their name.
    """
    link = f"https://huggingface.co/{model_name}"

    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = STABLE_VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"

    # Checked after the renames above, exactly like the original second
    # if-chain; unknown names keep the link chosen so far.
    link = _SHORT_NAME_LINKS.get(model_name, link)

    return model_hyperlink(link, model_name)
91
+
92
def _styled_paragraph(color, text):
    """Wrap ``text`` in a centered 20px paragraph rendered in ``color``."""
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{text}</p>"


def styled_error(error):
    """Return ``error`` as a red, centered HTML paragraph."""
    return _styled_paragraph("red", error)


def styled_warning(warn):
    """Return ``warn`` as an orange, centered HTML paragraph."""
    return _styled_paragraph("orange", warn)


def styled_message(message):
    """Return ``message`` as a green, centered HTML paragraph."""
    return _styled_paragraph("green", message)