BreakLee committed on
Commit
ff6b794
·
verified ·
1 Parent(s): 3d0a4bf

Upload 11 files

Browse files
README.md CHANGED
@@ -1,14 +1,13 @@
1
- ---
2
- title: AV Odyssey Bench Leaderboard
3
- emoji: 👁
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: This is the leaderboard for AV-Odyssey Bench
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: AV-Odyssey Bench Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.40.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc-by-4.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Public names of this module. Every entry must be defined here, otherwise
# `from app import *` raises AttributeError; `block` is the only listed name
# this module actually defines.
__all__ = ['block']
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import json
5
+ import pdb
6
+ import tempfile
7
+ import re
8
+ from constants import *
9
+ from src.auto_leaderboard.model_metadata_type import ModelType
10
+
11
+
12
+ global data_component, filter_component
13
+
14
def validate_model_size(s):
    """Return *s* if it is a well-formed model-size label, else ``'-'``.

    A well-formed label is either one or more digits followed by an
    upper-case ``B`` (for example ``'7B'``) or the placeholder ``'-'``.

    Args:
        s: Candidate label. Anything that is not a string is rejected.

    Returns:
        ``s`` unchanged when it is well-formed, otherwise ``'-'``.
    """
    # fullmatch is used instead of match with a trailing "$" because "$" also
    # matches just before a final newline, which let labels such as "7B\n"
    # through unchanged.
    if isinstance(s, str) and re.fullmatch(r'\d+B|-', s):
        return s
    return '-'
20
+
21
def upload_file(files):
    """Return the ``name`` attribute of every object in *files*, in order."""
    names = []
    for uploaded in files:
        names.append(uploaded.name)
    return names
24
+
25
def prediction_analyse(prediction_content, ground_truth_path="./file/SEED-Bench-1.json"):
    """Score a JSON-lines prediction text against the ground-truth answers.

    Args:
        prediction_content: Text holding one JSON object per line. Each object
            is expected to carry ``question_id`` and ``prediction`` keys.
        ground_truth_path: JSON file whose ``"questions"`` list holds items
            with ``question_id``, ``question_type_id`` and ``answer`` keys.

    Returns:
        Dict mapping a ``question_type_id`` to
        ``{"correct": int, "total": int}``. Ids 1-12 are always present; any
        other id found in the ground truth is added on first use.

    Raises:
        OSError: If the ground-truth file cannot be opened.
    """
    # NOTE(review): the default path still names a SEED-Bench file, while the
    # rest of the app is about AV-Odyssey -- confirm the file exists and is the
    # intended ground truth.
    with open(ground_truth_path, "r", encoding="utf-8") as file:
        ground_truth_data = json.load(file)["questions"]

    # Index the ground truth by question_id for O(1) lookup per prediction.
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # Per-question-type tallies.
    results = {i: {"correct": 0, "total": 0} for i in range(1, 13)}

    for line in prediction_content.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            prediction = json.loads(line)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {line}")
            continue
        # A line can be valid JSON without being a usable record (a bare
        # number, a list, an object without question_id); skip it rather than
        # crash the whole submission.
        if not isinstance(prediction, dict) or "question_id" not in prediction:
            print(f"Warning: Skipping record without question_id in line: {line}")
            continue

        gt_item = ground_truth.get(prediction["question_id"])
        if gt_item is None:
            continue

        # setdefault keeps type ids outside 1-12 from raising KeyError.
        tally = results.setdefault(gt_item["question_type_id"], {"correct": 0, "total": 0})
        # A record with no "prediction" key counts as a wrong answer.
        if prediction.get("prediction") == gt_item["answer"]:
            tally["correct"] += 1
        tally["total"] += 1

    return results
62
+
63
+
64
def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
):
    """Score an uploaded prediction file and write the result row to the CSV.

    Args:
        input_file: Uploaded file content; ``.decode("utf-8")`` is called on
            it, so it is expected to be ``bytes`` (or ``None`` when nothing
            was uploaded).
        model_name_textbox: Name used for a new leaderboard entry.
        revision_name_textbox: When non-empty, the name of an existing entry
            to overwrite (a new row is appended if no entry has that name).
        model_link: Optional URL; when non-empty the name is stored as a
            markdown link ``[name](url)``.

    Returns:
        The string ``"Error! Empty file!"`` when ``input_file`` is ``None``,
        otherwise ``0``.

    NOTE(review): as written this function does not look runnable. It reads
    ``model_size``, ``Evaluation_dimension``, ``LLM_type``,
    ``LLM_name_textbox``, ``csv_task_data`` and ``CSV_TASK_DIR``, none of
    which are parameters or defined in the visible code, and it builds an
    18-value row while the leaderboard CSV header appears to have 35
    columns. It looks carried over from a 12-task image/video benchmark --
    confirm and rework before relying on submissions.
    """
    if input_file is None:
        return "Error! Empty file!"
    else:
        # NOTE(review): `model_size` is assigned in this function, so it is a
        # local variable read before assignment here (UnboundLocalError).
        model_size = validate_model_size(model_size)
        # v1 evaluation
        content = input_file.decode("utf-8")
        prediction = prediction_analyse(content)
        csv_data = pd.read_csv(CSV_DIR)

        # Task ids in [Start_dimension, End_dimension) get a real accuracy;
        # all other ids in 1..12 are reported as 0.
        Start_dimension, End_dimension = 1, 13
        if Evaluation_dimension == 'Image':
            End_dimension = 10
        elif Evaluation_dimension == 'Video':
            Start_dimension = 10
        # NOTE(review): divides by prediction[i]["total"]; a task without any
        # scored prediction would raise ZeroDivisionError.
        each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 13)}

        # Totals for the averages: ids 1-9 are summed as "image", ids 10-12
        # as "video".
        total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
        total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))

        total_image = sum(prediction[i]["total"] for i in range(1, 10))
        total_video = sum(prediction[i]["total"] for i in range(10, 13))

        if Evaluation_dimension != 'Video':
            average_accuracy_image = round(total_correct_image / total_image * 100, 1)
        else:
            average_accuracy_image = 0

        if Evaluation_dimension != 'Image':
            average_accuracy_video = round(total_correct_video / total_video * 100, 1)
        else:
            average_accuracy_video = 0

        # The overall accuracy is only computed for a full ('All') submission.
        if Evaluation_dimension == 'All':
            overall_accuracy = round((total_correct_image + total_correct_video) / (total_image + total_video) * 100, 1)
        else:
            overall_accuracy = 0

        if LLM_type == 'Other':
            LLM_name = LLM_name_textbox
        else:
            LLM_name = LLM_type

        # `col` is the row label to write: the current row count (append a
        # new row) or the position of the entry being revised.
        if revision_name_textbox == '':
            col = csv_data.shape[0]
            model_name = model_name_textbox
        else:
            model_name = revision_name_textbox
            model_name_list = csv_data['Model']
            # Stored names look like "[name](url)"; keep the text between
            # the leading "[" and the first "]".
            name_list = [name.split(']')[0][1:] for name in model_name_list]
            if revision_name_textbox not in name_list:
                col = csv_data.shape[0]
            else:
                col = name_list.index(revision_name_textbox)

        if model_link == '':
            model_name = model_name  # no url
        else:
            model_name = '[' + model_name + '](' + model_link + ')'

        # add new data
        new_data = [
            model_name,
            LLM_name,
            model_size,
            overall_accuracy,
            average_accuracy_image,
            average_accuracy_video,
            each_task_accuracy[1],
            each_task_accuracy[2],
            each_task_accuracy[3],
            each_task_accuracy[4],
            each_task_accuracy[5],
            each_task_accuracy[6],
            each_task_accuracy[7],
            each_task_accuracy[8],
            each_task_accuracy[9],
            each_task_accuracy[10],
            each_task_accuracy[11],
            each_task_accuracy[12],
        ]
        csv_data.loc[col] = new_data
        # to_csv returns None when given a path, so csv_data is None after this.
        csv_data = csv_data.to_csv(CSV_DIR, index=False)

        # NOTE(review): `csv_task_data` is assigned below, so it is a local
        # variable read before assignment here (UnboundLocalError).
        csv_task_data.loc[col] = new_data
        csv_task_data = csv_task_data.to_csv(CSV_TASK_DIR, index=False)
    return 0
157
+
158
def get_baseline_df():
    """Load the leaderboard CSV for the default table view.

    Rows are ordered by "Avg. All", highest first, and the columns are limited
    to MODEL_INFO plus whatever ``checkbox_group.value`` currently holds.
    """
    ranked = pd.read_csv(CSV_DIR).sort_values(by="Avg. All", ascending=False)
    wanted_columns = MODEL_INFO + checkbox_group.value
    return ranked[wanted_columns]
164
+
165
def get_all_df():
    """Load the full leaderboard CSV, ordered by "Avg. All", highest first."""
    return pd.read_csv(CSV_DIR).sort_values(by="Avg. All", ascending=False)
169
+
170
+
171
def switch_version(version):
    """Return the "current version" banner text for *version*."""
    return "当前版本: {}".format(version)
173
+
174
# Root container of the leaderboard page; everything below is laid out in it.
block = gr.Blocks()


with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: the leaderboard table.
        with gr.TabItem("🏅 AV-Odyssey Benchmark", elem_id="av-odyssey-tab-table", id=1):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)

            gr.Markdown(
                TABLE_INTRODUCTION
            )

            # Column selector: every task column can be ticked, the averages
            # are ticked initially.
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )

            # get_baseline_df reads checkbox_group.value, so it has to be
            # called after the checkbox group exists.
            baseline_value = get_baseline_df()
            baseline_header = MODEL_INFO + checkbox_group.value
            baseline_datatype = ['markdown'] * len(MODEL_INFO) + ['number'] * len(checkbox_group.value)
            # Build the dataframe component that shows the table.
            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_columns):
                """Rebuild the table for the ticked columns.

                Args:
                    selected_columns: Column names currently ticked in the
                        checkbox group (possibly empty).

                Returns:
                    The ``value`` of a freshly built Dataframe component that
                    holds the model columns plus the ticked columns, sorted
                    by the first ticked column (descending).
                """
                updated_data = get_all_df()

                # Keep the ticked columns in TASK_INFO order, whatever order
                # they were ticked in.
                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                # With every box cleared there is no column to sort by;
                # indexing selected_columns[0] would raise IndexError, so the
                # "Avg. All" order from get_all_df is kept instead.
                if selected_columns:
                    updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                # A throwaway component is built so the value is prepared
                # with the headers and datatypes of this exact selection.
                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )

                return filter_component.value

            def on_average_type_change(average_type):
                """Return the default table; ``average_type`` is ignored."""
                return get_baseline_df()

            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)

        # Tab 2: background information.
        with gr.TabItem("📝 About", elem_id="av-odyssey-tab-table", id=2):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # Tab 3: submission form.
        with gr.TabItem("🚀 Submit here! ", elem_id="av-odyssey-tab-table", id=3):
            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="VideoLLaMA2"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="VideoLLaMA2"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F"
                    )

                with gr.Column():

                    input_file = gr.inputs.File(label = "Click to Upload a json File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")

                    # NOTE(review): submission_result is created but the click
                    # handler below has no outputs, so the value returned by
                    # add_new_eval is never shown -- confirm this is intended.
                    submission_result = gr.Markdown()
                    submit_button.click(
                        add_new_eval,
                        inputs = [
                            input_file,
                            model_name_textbox,
                            revision_name_textbox,
                            model_link
                        ],
                    )

    def refresh_data():
        """Reload the default table from the CSV."""
        value1 = get_baseline_df()

        return value1

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=data_component
        )


block.launch()
constants.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Constants shared by the leaderboard app.

# Columns that identify a model (always shown).
MODEL_INFO = ["Model"]
# Every score column of the leaderboard CSV: the averages first, then the
# individual tasks.
TASK_INFO = [
    "Avg. All", "Avg. Timbre", "Avg. Tone", "Avg. Melody", "Avg. Space", "Avg. Time", "Avg. Hallucination", "Avg. Intricacy",
    "Instrument Recognition", "Singer Recognition", "Gunshot Recognition", "Bird Recognition", "Animal Recognition",
    "Transportation Recognition", "Material Recognition", "Scene Recognition", "Hazard Recognition", "Action Recognition",
    "Eating Sound Recognition", "Speech Sentiment Analysis", "Meme Understanding", "Music Sentiment Analysis", "Music Genre Classification",
    "Dance and Music Matching", "Film and Music Matching", "Music Score Matching", "Audio 3D Angle Estimation", "Audio Distance Estimation",
    "Audio Time Estimation", "Audio-Visual Synchronization", "Action Sequencing", "Hallucination Evaluation",
    "Action Prediction", "Action Tracing"]

# The average columns (the first eight entries of TASK_INFO).
AVG_INFO = ["Avg. All", "Avg. Timbre", "Avg. Tone", "Avg. Melody", "Avg. Space", "Avg. Time", "Avg. Hallucination", "Avg. Intricacy"]
# Datatype label per column, aligned with COLUMN_NAMES.
DATA_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_INFO)
CSV_DIR = "./file/AV-Odyssey_performance.csv"

COLUMN_NAMES = MODEL_INFO + TASK_INFO

# NOTE(review): presumably the number of questions for each of the 26 tasks --
# confirm. The values sum to 4,455 while the texts below say 4,555 problems.
DATA_NUM = [200, 200, 200, 200, 200, 200, 200, 200, 108, 196, 200, 200, 20, 97, 200, 200, 200, 200, 20, 20, 200, 200, 200, 200, 199, 195]

# NOTE(review): arXiv 2307.16125 looks like a leftover link from the project
# this app was adapted from -- confirm it is the AV-Odyssey paper.
LEADERBORAD_INTRODUCTION = """# AV-Odyssey Bench Leaderboard
Welcome to the leaderboard of the AV-Odyssey Bench! 🏆

AV-Odyssey Bench, a comprehensive audio-visual benchmark designed to assess whether those MLLMs can truly understand the audio-visual information. This benchmark encompasses 4,555 carefully crafted problems, each incorporating text, visual, and audio components. To successfully infer answers, models must effectively leverage clues from both visual and audio inputs.
Please refer to [AV-Odyssey paper](https://arxiv.org/abs/2307.16125) for more details.

"""


SUBMIT_INTRODUCTION = """# Submit on AV-Odyssey Bench Introduction


Note: The format of the submitted json file is a dict for each line. This dict contains two keys: question_id and prediction. Specific examples are as follows:
```shell
{"question_id": "5_0", "prediction": "B"}
{"question_id": "3_0", "prediction": "B"}
```

## Submit Example


## If you have any questions, please contact [libohao1998@gmail.com](mailto:libohao1998@gmail.com).
"""

TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
We use accuracy (%) as the primary evaluation metric for each task.

Performance Average Type is All Average means that calculates the overall accuracy by dividing the total number of correct QA answers by the total number of QA questions.

If you have any questions, please feel free to contact us.
"""

LEADERBORAD_INFO = """
Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini 1.5 Pro, and Reka Core, have expanded their capabilities to include vision and audio modalities. While these models demonstrate impressive performance across a wide range of audio-visual applications, our proposed DeafTest reveals that MLLMs often struggle with simple tasks humans find trivial:
1) determining which of two sounds is louder, and
2) determining which of two sounds has a higher pitch.
Motivated by these observations, we introduce AV-Odyssey Bench. This benchmark encompasses 26 different tasks and 4,555 carefully crafted problems, each incorporating text, visual, and audio components. All data are newly collected and annotated by humans, not from any existing audio-visual dataset. AV-Odyssey Bench demonstrates three major features:
1. Comprehensive Audio Attributes;
2. Extensive Domains;
3. Interleaved Text, Audio, and Visual components.
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""{

}"""
67
+
file/AV-Odyssey_performance.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Avg. All,Avg. Timbre,Avg. Tone,Avg. Melody,Avg. Space,Avg. Time,Avg. Hallucination,Avg. Intricacy,Instrument Recognition,Singer Recognition,Gunshot Recognition,Bird Recognition,Animal Recognition,Transportation Recognition,Material Recognition,Scene Recognition,Hazard Recognition,Action Recognition,Eating Sound Recognition,Speech Sentiment Analysis,Meme Understanding,Music Sentiment Analysis,Music Genre Classification,Dance and Music Matching,Film and Music Matching,Music Score Matching,Audio 3D Angle Estimation,Audio Distance Estimation,Audio Time Estimation,Audio-Visual Synchronization,Action Sequencing,Hallucination Evaluation,Action Prediction,Action Tracing
2
+ [Unified-IO-2 L](https://unified-io-2.allenai.org/),26.0,23.8,24.1,28.8,15.0,26.8,30.0,30.4,20.5,22.5,25.5,18.5,27.0,26.5,23.0,28.0,21.3,20.9,26.5,24.5,20.0,27.9,31.0,27.5,32.5,24.5,15.0,15.0,28.0,25.5,27.0,30.0,27.1,33.8
3
+ [Unified-IO-2 XL](https://unified-io-2.allenai.org/),26.3,24.3,23.2,27.8,22.5,25.3,31.5,34.8,20.0,23.5,24.0,20.5,27.5,26.0,27.5,30.0,19.4,19.9,26.5,23.0,25.0,26.9,30.5,27.0,31.5,22.5,30.0,15.0,26.5,25.5,24.0,31.5,35.7,33.8
4
+ [Unified-IO-2 XXL](https://unified-io-2.allenai.org/),27.2,26.3,22.7,26.4,32.5,26.8,24.5,33.8,29.5,24.0,23.5,29.0,23.5,25.5,30.5,26.5,23.1,27.0,25.5,23.0,20.0,23.9,31.5,27.5,24.5,23.5,50.0,15.0,28.0,25.0,27.5,24.5,33.2,34.4
5
+ [OneLLM](https://github.com/csuhan/OneLLM),27.4,25.0,25.5,21.5,37.5,29.3,25.5,38.4,26.0,21.5,27.0,26.0,22.0,20.0,29.5,24.5,26.9,23.0,29.5,26.0,20.0,20.8,23.5,26.5,18.5,18.0,45.0,30.0,31.5,29.5,27.0,25.5,41.7,34.9
6
+ [PandaGPT](https://panda-gpt.github.io/),26.7,23.5,23.2,27.6,45.0,23.8,28.0,23.9,20.0,21.5,23.0,17.5,26.0,26.5,28.0,27.0,23.1,21.4,24.5,23.5,20.0,21.6,28.0,27.0,32.5,26.0,45.0,45.0,18.5,26.0,27.0,28.0,19.6,28.2
7
+ [Video-llama](https://github.com/DAMO-NLP-SG/Video-LLaMA),26.1,25.5,22.3,24.4,30.0,26.2,25.0,30.7,22.5,24.5,27.0,26.5,27.0,23.5,28.0,25.0,25.0,26.0,25.5,23.0,15.0,25.8,24.0,20.0,25.0,28.0,45.0,15.0,28.5,23.5,26.5,25.0,28.6,32.8
8
+ [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2),26.8,24.1,25.5,26.4,30.0,27.2,33.0,34.5,22.5,24.0,27.0,17.0,23.5,27.5,26.5,26.5,19.4,23.0,25.5,26.0,20.0,26.8,29.0,25.5,30.5,20.5,45.0,15.0,28.5,26.5,26.5,33.0,28.6,40.5
9
+ [AnyGPT](https://junzhan2000.github.io/AnyGPT.github.io/),26.1,24.6,25.0,26.4,27.5,29.2,29.0,25.7,22.5,28.5,28.0,17.5,24.0,25.5,23.0,28.0,25.9,20.4,27.5,25.5,20.0,23.4,29.5,25.5,26.0,26.0,40.0,15.0,30.5,28.0,29.0,29.0,21.1,30.3
10
+ [NExT-GPT](https://next-gpt.github.io/),25.5,23.2,20.9,27.8,30.0,28.8,28.5,23.6,21.0,23.5,25.5,21.5,25.5,25.5,21.0,24.0,19.4,23.0,24.0,21.5,15.0,23.7,26.0,28.0,31.0,28.0,45.0,15.0,31.5,24.0,31.0,28.5,20.6,26.7
11
+ [VITA](https://vita-home.github.io/),26.4,24.1,26.4,27.8,22.5,26.3,31.0,36.8,22.0,20.5,24.5,21.5,27.5,25.0,23.5,28.5,21.3,19.4,29.5,24.5,45.0,26.9,26.0,27.5,33.5,24.5,25.0,20.0,26.5,25.5,27.0,31.0,34.2,39.5
12
+ [Gemini 1.5 Flash](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),27.8,27.2,25.0,28.8,30.0,25.3,28.5,31.2,24.5,24.0,23.5,17.0,32.5,26.0,22.5,29.5,34.3,48.0,21.5,23.5,40.0,21.3,31.0,27.5,32.5,28.0,30.0,30.0,27.5,23.5,25.0,28.5,27.6,34.9
13
+ [Gemini 1.5 Flash-8B](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),26.8,25.1,24.5,28.9,27.5,27.5,29.0,30.2,16.5,22.5,24.0,19.0,28.0,26.5,27.0,29.0,26.9,32.7,24.5,24.5,25.0,25.9,33.0,27.5,32.0,24.5,40.0,15.0,31.0,25.5,26.0,29.0,25.6,34.9
14
+ [Gemini 1.5 Pro](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),30.8,30.8,31.4,31.3,37.5,27.7,20.5,33.0,33.0,26.0,29.0,25.0,25.5,26.0,29.5,30.0,38.0,57.7,22.5,29.5,50.0,25.4,42.5,28.0,28.5,29.0,35.0,40.0,30.0,24.5,28.5,20.5,32.2,33.8
15
+ [Reka Core](https://arxiv.org/abs/2404.12387),26.9,26.7,27.7,26.4,22.5,26.5,24.0,34.3,32.5,20.0,26.5,25.0,24.0,27.0,30.0,27.0,25.0,34.2,21.5,28.5,20.0,22.8,24.5,27.5,30.0,25.5,25.0,20.0,30.0,25.5,24.0,24.0,33.7,34.9
16
+ [Reka Flash](https://arxiv.org/abs/2404.12387),26.3,25.5,24.1,27.2,30.0,27.5,31.5,24.1,20.0,22.5,26.5,26.0,28.5,26.5,26.5,29.0,28.7,22.4,25.0,24.5,20.0,30.5,29.5,27.5,25.5,24.5,45.0,15.0,30.0,25.5,27.0,31.5,19.1,29.2
17
+ [Reka Edge](https://arxiv.org/abs/2404.12387),25.0,23.8,20.5,26.3,22.5,25.5,22.5,36.8,21.5,24.0,30.5,20.0,19.5,22.5,20.5,25.5,25.9,23.5,29.0,20.5,20.0,24.9,24.5,27.5,30.0,24.0,30.0,15.0,30.0,25.5,21.0,22.5,38.2,35.4
18
+ [GPT-4o visual caption](https://openai.com/index/hello-gpt-4o/),32.3,37.4,28.6,32.3,27.5,25.5,23.0,28.9,33.0,30.5,24.0,26.5,43.0,42.0,32.5,39.0,49.1,67.3,30.5,26.0,55.0,24.4,48.0,27.0,34.5,23.5,25.0,30.0,21.5,22.5,32.5,23.0,32.2,25.6
19
+ [GPT-4o audio caption](https://openai.com/index/hello-gpt-4o/),34.5,38.6,31.8,33.6,32.5,27.5,25.0,26.1,40.0,38.0,27.5,26.5,45.0,42.0,27.0,41.0,42.6,62.2,35.5,28.0,70.0,24.4,56.5,27.5,32.5,22.5,30.0,35.0,23.5,25.5,33.5,25.0,30.2,22.0
requirements.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ APScheduler==3.10.1
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ certifi==2022.12.7
10
+ charset-normalizer==3.1.0
11
+ click==8.1.3
12
+ contourpy==1.0.7
13
+ cycler==0.11.0
14
+ datasets==2.12.0
15
+ entrypoints==0.4
16
+ fastapi==0.95.1
17
+ ffmpy==0.3.0
18
+ filelock==3.11.0
19
+ fonttools==4.39.3
20
+ frozenlist==1.3.3
21
+ fsspec==2023.4.0
22
+ gradio==3.27.0
23
+ gradio_client==0.1.3
24
+ h11==0.14.0
25
+ httpcore==0.17.0
26
+ httpx==0.24.0
27
+ huggingface-hub==0.13.4
28
+ idna==3.4
29
+ Jinja2==3.1.2
30
+ jsonschema==4.17.3
31
+ kiwisolver==1.4.4
32
+ linkify-it-py==2.0.0
33
+ markdown-it-py==2.2.0
34
+ MarkupSafe==2.1.2
35
+ matplotlib==3.7.1
36
+ mdit-py-plugins==0.3.3
37
+ mdurl==0.1.2
38
+ multidict==6.0.4
39
+ numpy==1.24.2
40
+ orjson==3.8.10
41
+ packaging==23.1
42
+ pandas==2.0.0
43
+ Pillow==9.5.0
44
+ plotly==5.14.1
45
+ pyarrow==11.0.0
46
+ pydantic==1.10.7
47
+ pydub==0.25.1
48
+ pyparsing==3.0.9
49
+ pyrsistent==0.19.3
50
+ python-dateutil==2.8.2
51
+ python-multipart==0.0.6
52
+ pytz==2023.3
53
+ pytz-deprecation-shim==0.1.0.post0
54
+ PyYAML==6.0
55
+ requests==2.28.2
56
+ semantic-version==2.10.0
57
+ six==1.16.0
58
+ sniffio==1.3.0
59
+ starlette==0.26.1
60
+ toolz==0.12.0
61
+ tqdm==4.65.0
62
+ transformers==4.28.1
63
+ typing_extensions==4.5.0
64
+ tzdata==2023.3
65
+ tzlocal==4.3
66
+ uc-micro-py==1.0.1
67
+ urllib3==1.26.15
68
+ uvicorn==0.21.1
69
+ websockets==11.0.1
70
+ yarl==1.8.2
src/__pycache__/utils_display.cpython-311.pyc ADDED
Binary file (6.25 kB). View file
 
src/__pycache__/utils_display.cpython-38.pyc ADDED
Binary file (4.29 kB). View file
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-311.pyc ADDED
Binary file (1.72 kB). View file
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc ADDED
Binary file (1.25 kB). View file
 
src/auto_leaderboard/model_metadata_type.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ import glob
4
+ import json
5
+ import os
6
+ from typing import Dict, List
7
+
8
+ from ..utils_display import AutoEvalColumn
9
+
10
@dataclass
class ModelInfo:
    """Display name and emoji marker of one model category."""

    name: str
    symbol: str  # emoji


# Emoji marker per category, keyed by the category's display name.
model_type_symbols = dict(
    LLM="🟢",
    ImageLLM="🔶",
    VideoLLM="⭕",
    Other="🟦",
)


class ModelType(Enum):
    """Model categories; the value of each member is a ``ModelInfo``."""

    PT = ModelInfo("LLM", "🟢")
    FT = ModelInfo("ImageLLM", "🔶")
    IFT = ModelInfo("VideoLLM", "⭕")
    RL = ModelInfo("Other", "🟦")

    def to_str(self, separator = " "):
        """Return ``symbol + separator + name`` for this member."""
        info = self.value
        return "{}{}{}".format(info.symbol, separator, info.name)
30
+
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ # These classes are for user facing column names, to avoid having to change them
4
+ # all around the code when a modif is needed
5
@dataclass
class ColumnContent:
    """Descriptor of one table column, as used by the *Column classes below."""

    name: str  # header text of the column
    type: str  # datatype label; this file uses "str", "number", "markdown" and "bool"
    displayed_by_default: bool  # presumably whether the column is shown initially -- not read in this file
    hidden: bool = False  # presumably hides the column entirely -- not read in this file
11
+
12
def fields(raw_class):
    """List the attribute values stored directly on *raw_class*.

    Entries of ``raw_class.__dict__`` whose name starts or ends with a double
    underscore are left out; the rest are returned in definition order.
    """
    values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        values.append(attr_value)
    return values
14
+
15
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Column descriptors of the automatic-evaluation table."""

    model_type_symbol = ColumnContent(name="T", type="str", displayed_by_default=True)
    model = ColumnContent(name="Model", type="markdown", displayed_by_default=True)
    average = ColumnContent(name="Average ⬆️", type="number", displayed_by_default=True)
    arc = ColumnContent(name="ARC", type="number", displayed_by_default=True)
    hellaswag = ColumnContent(name="HellaSwag", type="number", displayed_by_default=True)
    mmlu = ColumnContent(name="MMLU", type="number", displayed_by_default=True)
    truthfulqa = ColumnContent(name="TruthfulQA", type="number", displayed_by_default=True)
    model_type = ColumnContent(name="Type", type="str", displayed_by_default=False)
    precision = ColumnContent(name="Precision", type="str", displayed_by_default=False, hidden=True)
    license = ColumnContent(name="Hub License", type="str", displayed_by_default=False)
    params = ColumnContent(name="#Params (B)", type="number", displayed_by_default=False)
    likes = ColumnContent(name="Hub ❤️", type="number", displayed_by_default=False)
    revision = ColumnContent(name="Model sha", type="str", displayed_by_default=False, hidden=False)
    # dummy col to implement search bar (hidden by custom CSS)
    dummy = ColumnContent(name="model_name_for_query", type="str", displayed_by_default=True)
31
+
32
@dataclass(frozen=True)
class EloEvalColumn:  # Elo evals column
    """Column descriptors of the Elo-rating table."""

    model = ColumnContent(name="Model", type="markdown", displayed_by_default=True)
    gpt4 = ColumnContent(name="GPT-4 (all)", type="number", displayed_by_default=True)
    human_all = ColumnContent(name="Human (all)", type="number", displayed_by_default=True)
    human_instruct = ColumnContent(name="Human (instruct)", type="number", displayed_by_default=True)
    human_code_instruct = ColumnContent(name="Human (code-instruct)", type="number", displayed_by_default=True)
39
+
40
+
41
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptors of the evaluation-queue table."""

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    # NOTE(review): typed "bool" here but "str" in AutoEvalColumn.precision --
    # confirm which one the queue table needs.
    precision = ColumnContent("precision", "bool", True)
    # The third argument is the boolean displayed_by_default flag; the string
    # "Original" used to be passed in that position.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
49
+
50
# Hub ids that are linked to the LLaMA announcement instead of a hub page.
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
# Stable-vicuna has its own constant: it used to be assigned to VICUNA_LINK a
# second time, which silently replaced the vicuna-13b link above.
STABLE_VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"


def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*.

    Neither argument is escaped, so both must be trusted text.
    """
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    """Return an HTML link for *model_name*.

    Args:
        model_name: Hub id (``org/name``) or one of the short names handled
            below.

    Returns:
        The anchor built by ``model_hyperlink``. The target is
        ``https://huggingface.co/<model_name>`` unless the name has a
        dedicated link; some hub ids are also shown under a shorter label.
    """
    link = f"https://huggingface.co/{model_name}"

    # Hub ids that get both a dedicated link and a shorter display name.
    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = STABLE_VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"

    # Short names that only need a dedicated link. Any other name keeps the
    # hub link built at the top.
    if model_name == "dolly-12b":
        link = DOLLY_LINK
    elif model_name == "vicuna-13b":
        link = VICUNA_LINK
    elif model_name == "koala-13b":
        link = KOALA_LINK
    elif model_name == "oasst-12b":
        link = OASST_LINK

    return model_hyperlink(link, model_name)
91
+
92
def styled_error(error):
    """Wrap *error* in a centred, red, 20px HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
94
+
95
def styled_warning(warn):
    """Wrap *warn* in a centred, orange, 20px HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
97
+
98
def styled_message(message):
    """Wrap *message* in a centred, green, 20px HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)