BreakLee committed
Commit 1edb956
1 Parent(s): a20c2ef

SEED Bench

app.py ADDED
@@ -0,0 +1,263 @@
+
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
+
+ import gradio as gr
+ import pandas as pd
+ import json
+ import pdb
+ import tempfile
+
+ from constants import *
+ from src.auto_leaderboard.model_metadata_type import ModelType
+
+ global data_component, filter_component
+
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
+ def prediction_analyse(prediction_content):
+     # pdb.set_trace()
+     predictions = prediction_content.split("\n")
+
+     # load the ground-truth JSON file
+     with open("./file/SEED-Bench.json", "r") as file:
+         ground_truth_data = json.load(file)["questions"]
+
+     # index the ground-truth items by question_id
+     ground_truth = {item["question_id"]: item for item in ground_truth_data}
+
+     # initialize the per-dimension result counters
+     results = {i: {"correct": 0, "total": 0} for i in range(1, 13)}
+
+     # walk through the predictions and count correct / total answers per question_type_id
+     for prediction in predictions:
+         prediction = json.loads(prediction)
+         question_id = prediction["question_id"]
+         gt_item = ground_truth[question_id]
+         question_type_id = gt_item["question_type_id"]
+
+         if prediction["prediction"] == gt_item["answer"]:
+             results[question_type_id]["correct"] += 1
+
+         results[question_type_id]["total"] += 1
+
+     return results
+
+ def add_new_eval(
+     input_file,
+     model_name_textbox: str,
+     revision_name_textbox: str,
+     model_type: str,
+     model_link: str,
+     LLM_type: str,
+     LLM_name_textbox: str,
+     Evaluation_dimension: str,
+ ):
+     if input_file is None:
+         return "Error! Empty file!"
+     else:
+         content = input_file.decode("utf-8")
+         prediction = prediction_analyse(content)
+         each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) for i in range(1, 13)}
+
+         # average accuracy for image / video / all dimensions
+         total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
+         total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
+
+         total_image = sum(prediction[i]["total"] for i in range(1, 10))
+         total_video = sum(prediction[i]["total"] for i in range(10, 13))
+
+         average_accuracy_image = round(total_correct_image / total_image * 100, 1)
+         average_accuracy_video = round(total_correct_video / total_video * 100, 1)
+         overall_accuracy = round((total_correct_image + total_correct_video) / (total_image + total_video) * 100, 1)
+
+         if LLM_type == 'other':
+             LLM_name = LLM_name_textbox
+         else:
+             LLM_name = LLM_type
+         # append the new row to the leaderboard
+         new_data = [
+             model_type,
+             model_name_textbox,
+             LLM_name,
+             each_task_accuracy[1],
+             each_task_accuracy[2],
+             each_task_accuracy[3],
+             each_task_accuracy[4],
+             each_task_accuracy[5],
+             each_task_accuracy[6],
+             each_task_accuracy[7],
+             each_task_accuracy[8],
+             each_task_accuracy[9],
+             average_accuracy_image,
+             each_task_accuracy[10],
+             each_task_accuracy[11],
+             each_task_accuracy[12],
+             average_accuracy_video,
+             overall_accuracy]
+         # pdb.set_trace()
+         csv_data = pd.read_csv(CSV_DIR)
+         col = csv_data.shape[0]
+         csv_data.loc[col] = new_data
+         csv_data.to_csv(CSV_DIR, index=False)
+         return 0
+
+ def get_baseline_df():
+     df = pd.read_csv(CSV_DIR)
+     return df
+
+ block = gr.Blocks()
+
+
+ with block:
+     gr.Markdown(
+         LEADERBORAD_INTRODUCTION
+     )
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 SEED Benchmark", elem_id="seed-benchmark-tab-table", id=0):
+             with gr.Row():
+                 with gr.Accordion("Citation", open=False):
+                     citation_button = gr.Textbox(
+                         value=CITATION_BUTTON_TEXT,
+                         label=CITATION_BUTTON_LABEL,
+                         elem_id="citation-button",
+                     ).style(show_copy_button=True)
+
+             gr.Markdown(
+                 TABLE_INTRODUCTION
+             )
+
+             # column selection
+             checkbox_group = gr.CheckboxGroup(
+                 choices=TASK_INFO,
+                 value=TASK_INFO,
+                 label="Select options",
+                 interactive=True,
+             )
+
+             # create the leaderboard dataframe component
+             data_component = gr.components.Dataframe(
+                 value=get_baseline_df,
+                 headers=COLUMN_NAMES,
+                 type="pandas",
+                 datatype=DATA_TITILE_TYPE,
+                 interactive=False,
+                 visible=True,
+             )
+
+             def on_checkbox_group_change(selected_columns):
+                 # pdb.set_trace()
+                 selected_columns = [item for item in TASK_INFO if item in selected_columns]
+                 present_columns = MODEL_INFO + selected_columns
+                 updated_data = get_baseline_df()[present_columns]
+                 updated_headers = present_columns
+                 update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                 # data_component.update(value=updated_data, headers=updated_headers, datatype=update_datatype)
+
+                 filter_component = gr.components.Dataframe(
+                     value=updated_data,
+                     headers=updated_headers,
+                     type="pandas",
+                     datatype=update_datatype,
+                     interactive=False,
+                     visible=True,
+                 )
+                 # pdb.set_trace()
+
+                 return filter_component.value
+
+             # wire the checkbox group to the handler above
+             checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
+
+         # table 2
+         with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
+             gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+         # table 3
+         with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
+             gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
+
+
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(
+                         label="Model name", placeholder="LLaMA-7B"
+                     )
+                     revision_name_textbox = gr.Textbox(
+                         label="Revision Model Name", placeholder="LLaMA-7B"
+                     )
+                     model_type = gr.Dropdown(
+                         choices=[
+                             ModelType.PT.to_str(" : "),
+                             ModelType.FT.to_str(" : "),
+                             ModelType.IFT.to_str(" : "),
+                             ModelType.RL.to_str(" : "),
+                         ],
+                         label="Model type",
+                         multiselect=False,
+                         value=ModelType.FT.to_str(" : "),  # default must be one of the choices above
+                         interactive=True,
+                     )
+                     model_link = gr.Textbox(
+                         label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
+                     )
+
+                 with gr.Column():
+
+                     LLM_type = gr.Dropdown(
+                         choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "other"],
+                         label="LLM type",
+                         multiselect=False,
+                         value="LLaMA-7B",
+                         interactive=True,
+                     )
+                     LLM_name_textbox = gr.Textbox(
+                         label="LLM model (for other)",
+                         value="LLaMA-13B"
+                     )
+                     Evaluation_dimension = gr.Dropdown(
+                         choices=["All", "Image", "Video"],
+                         label="Evaluation dimension",
+                         multiselect=False,
+                         value="All",
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+
+                     input_file = gr.inputs.File(label="Click to Upload a json File", file_count="single", type='binary')
+                     submit_button = gr.Button("Submit Eval")
+
+                     submission_result = gr.Markdown()
+                     submit_button.click(
+                         add_new_eval,
+                         inputs=[
+                             input_file,
+                             model_name_textbox,
+                             revision_name_textbox,
+                             model_type,
+                             model_link,
+                             LLM_type,
+                             LLM_name_textbox,
+                             Evaluation_dimension,
+                         ],
+                         # outputs = submission_result,
+                     )
+
+
+     with gr.Row():
+         data_run = gr.Button("Refresh")
+         data_run.click(
+             get_baseline_df, outputs=data_component
+         )
+
+     # block.load(get_baseline_df, outputs=data_title)
+
+ block.launch()
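
For reference, `add_new_eval` decodes the uploaded file as UTF-8 and `prediction_analyse` parses it line by line with `json.loads`, so a submission is expected to be one JSON object per line carrying a `question_id` and a `prediction` that is compared against the `answer` field in `./file/SEED-Bench.json`. A minimal sketch of producing such a file (the file name, ids, and option letters below are illustrative, not taken from this commit):

```python
import json

# Hypothetical model outputs: question_id -> predicted option letter (placeholder values).
example_predictions = {"1": "A", "2": "C"}

records = [
    json.dumps({"question_id": qid, "prediction": choice})
    for qid, choice in example_predictions.items()
]

# prediction_analyse() splits the uploaded text on "\n" and json-parses every piece,
# so the records are written newline-separated without a trailing newline.
with open("my_seed_bench_predictions.json", "w") as f:
    f.write("\n".join(records))
```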
constants.py ADDED
@@ -0,0 +1,60 @@
+ # this .py file stores the constants used by the leaderboard
+ MODEL_INFO = ["Model Type", "Model", "Language Model"]
+ TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"]
+ AVG_INFO = ["Avg. Img", "Avg. Video", "Avg. All"]
+ DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+ CSV_DIR = "./file/result.csv"
+
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO
+ DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
+
+ UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\
+ LLM & Vicuna & Vicuna-7B &23.4 &30.7 &29.7 &30.9 &30.8 &28.6 &29.8 &18.5 &13.4 &27.3 &34.5 &23.8 \\
+ LLM & LLaMA & LLaMA-7B &26.3 &27.4 &26.2 &28.3 &25.1 &28.8 &19.2 &37.0 & 9.0 &33.0 &23.1 &26.2 \\
+ ImageLLM & BLIP-2 & Flan-T5-XL &59.1 &53.9 &49.2 &42.3 &43.2 &36.7 &55.7 &45.6 &25.9 &32.6 &47.5 &24.0 \\
+ ImageLLM & InstructBLIP & Flan-T5-XL &60.3 &58.5 &63.4 &40.6 &58.4 &38.7 &51.6 &45.9 &25.9 &33.1 &49.1 &27.1 \\
+ ImageLLM & InstructBLIP-Vicuna & Vicuna-7B &60.2 &58.9 &65.6 &43.6 &57.2 &40.3 &52.6 &47.7 &43.5 &34.5 &49.6 &23.1 \\
+ ImageLLM & LLaVA & LLaMA-7B &42.7 &34.9 &33.5 &28.4 &41.9 &30.8 &27.8 &46.8 &27.7 &29.7 &21.4 &19.1 \\
+ ImageLLM & MiniGPT-4 & Flan-T5-XL &56.3 &49.2 &45.8 &37.9 &45.3 &32.6 &47.4 &57.1 &11.8 &38.2 &24.5 &27.1 \\
+ ImageLLM & VPGTrans & LLaMA-7B &51.9 &44.1 &39.9 &36.1 &33.7 &36.4 &32.0 &53.2 &30.6 &39.5 &24.3 &31.9 \\
+ ImageLLM & MultiModal-GPT & LLaMA-7B &43.6 &37.9 &31.5 &30.8 &27.3 &30.1 &29.9 &51.4 &18.8 &36.9 &25.8 &24.0 \\
+ ImageLLM & Otter & LLaMA-7B &44.9 &38.6 &32.2 &30.9 &26.3 &31.8 &32.0 &51.4 &31.8 &37.9 &27.2 &24.8 \\
+ ImageLLM & OpenFlamingo & LLaMA-7B &43.9 &38.1 &31.3 &30.1 &27.3 &30.6 &29.9 &50.2 &20.0 &37.2 &25.4 &24.2 \\
+ ImageLLM & LLaMA-Adapter V2 & LLaMA-7B &45.2 &38.5 &29.3 &33.0 &29.7 &35.5 &39.2 &52.0 &24.7 &38.6 &18.5 &19.6 \\
+ ImageLLM & GVT & Vicuna-7B &41.7 &35.5 &31.8 &29.5 &36.2 &32.0 &32.0 &51.1 &27.1 &33.9 &25.4 &23.0 \\
+ ImageLLM & mPLUG-Owl & LLaMA-7B &49.7 &45.3 &32.5 &36.7 &27.3 &32.7 &44.3 &54.7 &28.8 &26.7 &17.9 &26.5 \\
+ VideoLLM & VideoChat & Vicuna-7B &47.1 &43.8 &34.9 &40.0 &32.8 &34.6 &42.3 &50.5 &17.7 &34.9 &36.4 &27.3 \\
+ VideoLLM & Video-ChatGPT & LLaMA-7B &37.2 &31.4 &33.2 &28.4 &35.5 &29.5 &23.7 &42.3 &25.9 &27.6 &21.3 &21.1 \\
+ VideoLLM & Valley & LLaMA-13B &39.3 &32.9 &31.6 &27.9 &24.2 &30.1 &27.8 &43.8 &11.8 &31.3 &23.2 &20.7 \\'''
+
+
+ LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
+
+ Welcome to the leaderboard of SEED-Bench! 🏆
+ SEED-Bench evaluates Multimodal Large Language Models (MLLMs) on multiple-choice questions spanning 12 evaluation dimensions, covering both image and video understanding.
+ Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details.
+ """
+
+ TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all models.
+ We use accuracy (%) as the primary evaluation metric for most tasks.
+ """
+
+ LEADERBORAD_INFO = """
+ Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
+ In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
+ SEED-Bench consists of 19K multiple-choice questions with accurate human annotations ($\times$6 larger than existing benchmarks), which span 12 evaluation dimensions covering the comprehension of both the image and video modality.
+ We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
+ Multiple-choice questions with groundtruth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
+ We further evaluate the performance of 18 models across all 12 dimensions, covering both spatial and temporal understanding.
+ By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights that motivate future research.
+ """
+
+
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{li2023seed,
+   title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
+   author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+   journal={arXiv preprint arXiv:2307.16125},
+   year={2023}
+ }"""
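
As a quick sanity sketch of how these constants line up (assuming it is run from the repository root so that constants.py and file/result.csv resolve, as they do for app.py):

```python
from constants import MODEL_INFO, TASK_INFO, COLUMN_NAMES, DATA_TITILE_TYPE, CSV_DIR
import pandas as pd

# The leaderboard table uses 3 model-info columns followed by 15 task/average columns.
assert COLUMN_NAMES == MODEL_INFO + TASK_INFO
assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES)  # one display datatype per column

# file/result.csv shares the same header order, so the new_data rows appended in app.py line up.
assert list(pd.read_csv(CSV_DIR).columns) == COLUMN_NAMES
```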
file/SEED-Bench.json ADDED
The diff for this file is too large to render. See raw diff
 
file/result.csv ADDED
@@ -0,0 +1,21 @@
+ Model Type,Model,Language Model,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Avg. Img,Action Recognition,Action Prediction,Procedure Understanding,Avg. Video,Avg. All
+ LLM,Flan-T5,Flan-T5-XL,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,27.32,23.2,34.9,25.4,28.57,27.65
+ LLM,Vicuna,Vicuna-7B,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,28.16,27.3,34.5,23.8,29.47,28.5
+ LLM,LLaMA,LLaMA-7B,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,26.56,33.0,23.1,26.2,27.27,26.75
+ ImageLLM,BLIP-2,Flan-T5-XL,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,49.74,32.6,47.5,24.0,36.71,46.35
+ ImageLLM,InstructBLIP,Flan-T5-XL,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,57.8,33.1,49.1,27.1,38.31,52.73
+ ImageLLM,InstructBLIP-Vicuna,Vicuna-7B,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,58.76,34.5,49.6,23.1,38.05,53.37
+ ImageLLM,LLaVA,LLaMA-7B,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,36.96,29.7,21.4,19.1,23.76,33.52
+ ImageLLM,MiniGPT-4,Flan-T5-XL,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,47.4,38.2,24.5,27.1,29.89,42.84
+ ImageLLM,VPGTrans,LLaMA-7B,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,41.81,39.5,24.3,31.9,31.4,39.1
+ ImageLLM,MultiModal-GPT,LLaMA-7B,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,34.54,36.9,25.8,24.0,29.21,33.15
+ ImageLLM,Otter,LLaMA-7B,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,35.16,37.9,27.2,24.8,30.35,33.91
+ ImageLLM,OpenFlamingo,LLaMA-7B,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,34.51,37.2,25.4,24.2,29.25,33.14
+ ImageLLM,LLaMA-AdapterV2,LLaMA-7B,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,35.19,38.6,18.5,19.6,25.75,32.73
+ ImageLLM,GVT,Vicuna-7B,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,35.49,33.9,25.4,23.0,27.77,33.48
+ ImageLLM,mPLUG-Owl,LLaMA-7B,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,37.88,26.7,17.9,26.5,23.02,34.01
+ VideoLLM,VideoChat,Vicuna-7B,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,39.02,34.9,36.4,27.3,33.68,37.63
+ VideoLLM,Video-ChatGPT,LLaMA-7B,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,33.88,27.6,21.3,21.1,23.46,31.17
+ VideoLLM,Valley,LLaMA-13B,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,32.04,31.3,23.2,20.7,25.41,30.32
+ LLaMA-7B,test,LLaMA-7B,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.6,42.7,42.9,34.7,26.9,35.7,40.9
+ LLaMA-7B,test2,LLaMA-7B,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.6,42.7,42.9,34.7,26.9,35.7,40.9
requirements.txt ADDED
@@ -0,0 +1,70 @@
+ aiofiles==23.1.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==4.2.2
+ anyio==3.6.2
+ APScheduler==3.10.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ certifi==2022.12.7
+ charset-normalizer==3.1.0
+ click==8.1.3
+ contourpy==1.0.7
+ cycler==0.11.0
+ datasets==2.12.0
+ entrypoints==0.4
+ fastapi==0.95.1
+ ffmpy==0.3.0
+ filelock==3.11.0
+ fonttools==4.39.3
+ frozenlist==1.3.3
+ fsspec==2023.4.0
+ gradio==3.27.0
+ gradio_client==0.1.3
+ h11==0.14.0
+ httpcore==0.17.0
+ httpx==0.24.0
+ huggingface-hub==0.13.4
+ idna==3.4
+ Jinja2==3.1.2
+ jsonschema==4.17.3
+ kiwisolver==1.4.4
+ linkify-it-py==2.0.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ multidict==6.0.4
+ numpy==1.24.2
+ orjson==3.8.10
+ packaging==23.1
+ pandas==2.0.0
+ Pillow==9.5.0
+ plotly==5.14.1
+ pyarrow==11.0.0
+ pydantic==1.10.7
+ pydub==0.25.1
+ pyparsing==3.0.9
+ pyrsistent==0.19.3
+ python-dateutil==2.8.2
+ python-multipart==0.0.6
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ requests==2.28.2
+ semantic-version==2.10.0
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.26.1
+ toolz==0.12.0
+ tqdm==4.65.0
+ transformers==4.28.1
+ typing_extensions==4.5.0
+ tzdata==2023.3
+ tzlocal==4.3
+ uc-micro-py==1.0.1
+ urllib3==1.26.15
+ uvicorn==0.21.1
+ websockets==11.0.1
+ yarl==1.8.2
src/auto_leaderboard/model_metadata_type.py ADDED
@@ -0,0 +1,30 @@
+ from dataclasses import dataclass
+ from enum import Enum
+ import glob
+ import json
+ import os
+ from typing import Dict, List
+
+ from ..utils_display import AutoEvalColumn
+
+ @dataclass
+ class ModelInfo:
+     name: str
+     symbol: str  # emoji
+
+ model_type_symbols = {
+     "LLM": "🟢",
+     "ImageLLM": "🔶",
+     "VideoLLM": "⭕",
+     "Other": "🟦",
+ }
+
+ class ModelType(Enum):
+     PT = ModelInfo(name="LLM", symbol="🟢")
+     FT = ModelInfo(name="ImageLLM", symbol="🔶")
+     IFT = ModelInfo(name="VideoLLM", symbol="⭕")
+     RL = ModelInfo(name="Other", symbol="🟦")
+
+     def to_str(self, separator=" "):
+         return f"{self.value.symbol}{separator}{self.value.name}"
+
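
As a quick usage note (assuming the package imports resolve as they do for app.py), the dropdown strings that app.py builds with `to_str` look like this:

```python
from src.auto_leaderboard.model_metadata_type import ModelType

# to_str() concatenates the emoji symbol, the separator, and the type name.
assert ModelType.PT.to_str(" : ") == "🟢 : LLM"
assert ModelType.FT.to_str(" : ") == "🔶 : ImageLLM"
assert ModelType.IFT.to_str(" : ") == "⭕ : VideoLLM"
assert ModelType.RL.to_str(" : ") == "🟦 : Other"
```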
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
+ from dataclasses import dataclass
+
+ # These classes hold the user-facing column names, so they only need to be
+ # changed in one place when a modification is needed
+ @dataclass
+ class ColumnContent:
+     name: str
+     type: str
+     displayed_by_default: bool
+     hidden: bool = False
+
+ def fields(raw_class):
+     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+ @dataclass(frozen=True)
+ class AutoEvalColumn:  # Auto evals column
+     model_type_symbol = ColumnContent("T", "str", True)
+     model = ColumnContent("Model", "markdown", True)
+     average = ColumnContent("Average ⬆️", "number", True)
+     arc = ColumnContent("ARC", "number", True)
+     hellaswag = ColumnContent("HellaSwag", "number", True)
+     mmlu = ColumnContent("MMLU", "number", True)
+     truthfulqa = ColumnContent("TruthfulQA", "number", True)
+     model_type = ColumnContent("Type", "str", False)
+     precision = ColumnContent("Precision", "str", False, True)
+     license = ColumnContent("Hub License", "str", False)
+     params = ColumnContent("#Params (B)", "number", False)
+     likes = ColumnContent("Hub ❤️", "number", False)
+     revision = ColumnContent("Model sha", "str", False, False)
+     dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)
+
+ @dataclass(frozen=True)
+ class EloEvalColumn:  # Elo evals column
+     model = ColumnContent("Model", "markdown", True)
+     gpt4 = ColumnContent("GPT-4 (all)", "number", True)
+     human_all = ColumnContent("Human (all)", "number", True)
+     human_instruct = ColumnContent("Human (instruct)", "number", True)
+     human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
+
+
+ @dataclass(frozen=True)
+ class EvalQueueColumn:  # Queue column
+     model = ColumnContent("model", "markdown", True)
+     revision = ColumnContent("revision", "str", True)
+     private = ColumnContent("private", "bool", True)
+     precision = ColumnContent("precision", "bool", True)
+     weight_type = ColumnContent("weight_type", "str", "Original")
+     status = ColumnContent("status", "str", True)
+
+ LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+
+
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+ MODEL_PAGE = "https://huggingface.co/models"
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
+
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def make_clickable_model(model_name):
+     link = f"https://huggingface.co/{model_name}"
+
+     if model_name in LLAMAS:
+         link = LLAMA_LINK
+         model_name = model_name.split("/")[1]
+     elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
+         link = VICUNA_LINK
+         model_name = "stable-vicuna-13b"
+     elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
+         link = ALPACA_LINK
+         model_name = "alpaca-13b"
+     if model_name == "dolly-12b":
+         link = DOLLY_LINK
+     elif model_name == "vicuna-13b":
+         link = VICUNA_LINK
+     elif model_name == "koala-13b":
+         link = KOALA_LINK
+     elif model_name == "oasst-12b":
+         link = OASST_LINK
+     #else:
+     #    link = MODEL_PAGE
+
+     return model_hyperlink(link, model_name)
+
+ def styled_error(error):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+ def styled_warning(warn):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+ def styled_message(message):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
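
Finally, a small usage sketch for `make_clickable_model` (the model ids are only examples; the first one is the placeholder already used in app.py):

```python
from src.utils_display import make_clickable_model

# An ordinary model id is linked straight to its Hugging Face page.
print(make_clickable_model("decapoda-research/llama-7b-hf"))
# <a target="_blank" href="https://huggingface.co/decapoda-research/llama-7b-hf" ...>decapoda-research/llama-7b-hf</a>

# Ids listed in LLAMAS are rewritten to LLAMA_LINK and shortened to the bare name.
print(make_clickable_model("huggingface/llama-7b"))
# <a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" ...>llama-7b</a>
```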