KennyUTC committed on
Commit ad8152e · 1 Parent(s): 76e34e0

update leaderboard
Files changed (4)
  1. README.md +0 -34
  2. app.py +165 -192
  3. gen_table.py +182 -0
  4. meta_data.py +26 -0
README.md CHANGED
@@ -9,37 +9,3 @@ pinned: true
  license: apache-2.0
  short_description: A Leaderboard that demonstrates LMM reasoning capabilities
  ---
-
- # Start the configuration
-
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
- Results files should have the following format and be stored as json files:
- ```json
- {
-     "config": {
-         "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-         "model_name": "path of the model on the hub: org/model",
-         "model_sha": "revision on the hub",
-     },
-     "results": {
-         "task_name": {
-             "metric_name": score,
-         },
-         "task_name2": {
-             "metric_name": score,
-         }
-     }
- }
- ```
-
- Request files are created automatically by this tool.
-
- If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results, and eval-results-bk folders.
-
- # Code logic for more complex edits
-
- You'll find:
- - the main table's column names and properties in `src/display/utils.py`
- - the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,204 +1,177 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
- import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )
-
-
- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
              with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )

-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
                      )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
                      )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )

      with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
+ import abc
+
  import gradio as gr

+ from gen_table import *
+ from meta_data import *
+
+ head_style = """
+ <style>
+ @media (min-width: 1536px)
+ {
+     .gradio-container {
+         min-width: var(--size-full) !important;
+     }
+ }
+ </style>
+ """
+
+ with gr.Blocks(title="Math Leaderboard", head=head_style) as demo:
+     results = load_results()['results']
+     N_MODEL = len(results)
+     DATASETS = []
+     for m in results:
+         DATASETS.extend(results[m].keys())
+     DATASETS = [d for d in set(DATASETS) if d != 'META']
+
+
+     N_DATA = len(DATASETS)
+     structs = [abc.abstractproperty() for _ in range(N_DATA)]
+
+     gr.Markdown(LEADERBORAD_INTRODUCTION)
+
+     with gr.Tabs(elem_classes='tab-buttons') as tabs:
+         with gr.TabItem('🏅 LMM Math Leaderboard', elem_id='main', id=0):
+             _, check_box = BUILD_L1_DF(results)
+
+             table = generate_table(results)
+             table['Rank'] = list(range(1, len(table) + 1))
+
+             type_map = check_box['type_map']
+             type_map['Rank'] = 'number'
+
+             checkbox_group = gr.CheckboxGroup(
+                 choices=check_box['all'],
+                 value=check_box['required'],
+                 label='Evaluation Dimension',
+                 interactive=True,
+             )
+
+             headers = ['Rank'] + check_box['essential'] + checkbox_group.value
              with gr.Row():
+                 model_size = gr.CheckboxGroup(
+                     choices=MODEL_SIZE,
+                     value=MODEL_SIZE,
+                     label='Model Size',
+                     interactive=True
+                 )
+                 model_type = gr.CheckboxGroup(
+                     choices=MODEL_TYPE,
+                     value=MODEL_TYPE,
+                     label='Model Type',
+                     interactive=True
+                 )
+             data_component = gr.components.DataFrame(
+                 value=table[headers],
+                 type='pandas',
+                 datatype=[type_map[x] for x in headers],
+                 interactive=False,
+                 visible=True)
+
+             def filter_df(fields, model_size, model_type):
+                 results = load_results()['results']
+                 headers = ['Rank'] + check_box['essential'] + fields
+
+                 df = generate_table(results)
+
+                 df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                 df = df[df['flag']]
+                 df.pop('flag')
+                 if len(df):
+                     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                     df = df[df['flag']]
+                     df.pop('flag')
+                 df['Rank'] = list(range(1, len(df) + 1))
+
+                 comp = gr.components.DataFrame(
+                     value=df[headers],
+                     type='pandas',
+                     datatype=[type_map[x] for x in headers],
+                     interactive=False,
+                     visible=True)
+                 return comp
+
+             for cbox in [checkbox_group, model_size, model_type]:
+                 cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
+
+         for i, dataset in enumerate(DATASETS):
+             tab_name_map = {
+                 'MathVista': 'MathVista (Test Mini)',
+                 'MathVerse': 'MathVerse (Vision Only)',
+             }
+
+             with gr.TabItem(
+                     f'📊 {dataset if dataset not in tab_name_map else tab_name_map[dataset]}', elem_id=dataset, id=i + 2):

+                 s = structs[i]
+                 s.table, s.check_box = BUILD_L2_DF(results, dataset)
+                 s.type_map = s.check_box['type_map']
+                 s.type_map['Rank'] = 'number'
+
+                 s.checkbox_group = gr.CheckboxGroup(
+                     choices=s.check_box['all'],
+                     value=s.check_box['required'],
+                     label=f'{dataset} CheckBoxes',
+                     interactive=True,
+                 )
+                 s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+                 s.table['Rank'] = list(range(1, len(s.table) + 1))

+                 with gr.Row():
+                     s.model_size = gr.CheckboxGroup(
+                         choices=MODEL_SIZE,
+                         value=MODEL_SIZE,
+                         label='Model Size',
+                         interactive=True
                      )
+                     s.model_type = gr.CheckboxGroup(
+                         choices=MODEL_TYPE,
+                         value=MODEL_TYPE,
+                         label='Model Type',
+                         interactive=True
                      )
+                 s.data_component = gr.components.DataFrame(
+                     value=s.table[s.headers],
+                     type='pandas',
+                     datatype=[s.type_map[x] for x in s.headers],
+                     interactive=False,
+                     visible=True)
+                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
+
+                 def filter_df_l2(dataset_name, fields, model_size, model_type):
+                     results = load_results()['results']
+                     s = structs[DATASETS.index(dataset_name)]
+                     headers = ['Rank'] + s.check_box['essential'] + fields
+                     df = cp.deepcopy(s.table)
+                     df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                     df = df[df['flag']]
+                     df.pop('flag')
+                     if len(df):
+                         df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                         df = df[df['flag']]
+                         df.pop('flag')
+                     df['Rank'] = list(range(1, len(df) + 1))
+
+                     comp = gr.components.DataFrame(
+                         value=df[headers],
+                         type='pandas',
+                         datatype=[s.type_map[x] for x in headers],
+                         interactive=False,
+                         visible=True)
+                     return comp
+
+                 for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+                     cbox.change(
+                         fn=filter_df_l2,
+                         inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+                         outputs=s.data_component)

      with gr.Row():
+         with gr.Accordion('Citation', open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
+                 elem_id='citation-button')

+ if __name__ == '__main__':
+     demo.launch(server_name='0.0.0.0')
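Editor's note: one line in the new `app.py` that may look odd is `structs = [abc.abstractproperty() for _ in range(N_DATA)]`. The objects are never used as properties; because `abc.abstractproperty` (deprecated since Python 3.3, but still importable) is a Python-level subclass of `property`, its instances carry a `__dict__` and simply act as per-dataset attribute bags for the widgets and tables built in each tab. A minimal sketch of the same idea with the more explicit `types.SimpleNamespace`; the attribute values below are placeholders, only the attribute names mirror `app.py`:

```python
from types import SimpleNamespace

# One mutable container per dataset tab; attributes get attached later,
# exactly as app.py does with s.table, s.check_box, s.dataset, etc.
s = SimpleNamespace()
s.dataset = 'MathVista'   # value of the hidden Textbox used to route callbacks
s.check_box = {}          # column metadata filled by BUILD_L2_DF
s.table = None            # per-dataset DataFrame filled by BUILD_L2_DF
print(s.dataset)          # -> MathVista
```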
gen_table.py ADDED
@@ -0,0 +1,182 @@
+ import copy as cp
+ import json
+ from collections import defaultdict
+ from urllib.request import urlopen
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+
+ from meta_data import META_FIELDS, URL, DATASETS_ALL, DATASETS_ESS
+
+
+ def listinstr(lst, s):
+     assert isinstance(lst, list)
+     for item in lst:
+         if item in s:
+             return True
+     return False
+
+
+ def upper_key(k):
+     if k == 'ocr':
+         return 'OCR'
+     elif '_' in k:
+         k = k.split('_')
+         k = [x[0].upper() + x[1:] for x in k]
+         k = ' '.join(k)
+         return k
+     else:
+         return k
+
+
+ def load_results():
+     data = json.loads(urlopen(URL).read())
+     return data
+
+
+ def nth_large(val, vals):
+     return sum([1 for v in vals if v > val]) + 1
+
+
+ def model_size_flag(sz, FIELDS):
+     if pd.isna(sz) and 'Unknown' in FIELDS:
+         return True
+     if pd.isna(sz):
+         return False
+     sz = int(sz)
+     if '<4B' in FIELDS and sz < 4:
+         return True
+     if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
+         return True
+     if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
+         return True
+     if '20B-40B' in FIELDS and sz >= 20 and sz < 40:
+         return True
+     if '>40B' in FIELDS and sz >= 40:
+         return True
+     return False
+
+
+ def model_type_flag(line, FIELDS):
+     if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
+         return True
+     if 'API' in FIELDS and line['OpenSource'] == 'No':
+         return True
+     return False
+
+
+ def BUILD_L1_DF(results):
+     check_box = {}
+     check_box['essential'] = ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model']
+     # revise here to set the default datasets
+     check_box['required'] = ['Overall'] + DATASETS_ESS
+     check_box['all'] = ['Overall'] + DATASETS_ALL
+     type_map = defaultdict(lambda: 'number')
+     type_map['Method'] = 'html'
+     type_map['Language Model'] = type_map['Vision Model'] = type_map['Org'] = 'html'
+     type_map['OpenSource'] = type_map['Verified'] = 'str'
+     check_box['type_map'] = type_map
+
+     df = generate_table(results)
+     return df, check_box
+
+
+ def BUILD_L2_DF(results, dataset):
+     res = defaultdict(list)
+     sub = [v for v in results.values() if dataset in v]
+     assert len(sub), dataset
+     fields = list(sub[0][dataset].keys())
+
+     non_overall_fields = [x for x in fields if 'Overall' not in x]
+     overall_fields = [x for x in fields if 'Overall' in x]
+
+     for m in results:
+         item = results[m]
+         if dataset not in item:
+             continue
+         for k in META_FIELDS:
+             if k == 'Param (B)':
+                 param = item['META']['Parameters']
+                 res[k].append(float(param.replace('B', '')) if param != '' else None)
+             elif k == 'Method':
+                 name, url = item['META']['Method']
+                 res[k].append(f'<a href="{url}">{name}</a>')
+             else:
+                 s = item['META'][k].replace('\n', '<br>')
+                 s = s.replace(' & ', '<br>')
+                 res[k].append(s)
+
+         for d in overall_fields:
+             res[d].append(float(item[dataset][d]))
+         for d in non_overall_fields:
+             res[d].append(float(item[dataset][d]))
+
+     df = pd.DataFrame(res)
+     all_fields = overall_fields + non_overall_fields
+     # Use the Overall fields as required fields; fall back to the first 5 non-overall fields
+     required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
+
+     df = df.sort_values('Overall')
+     df = df.iloc[::-1]
+
+     check_box = {}
+     check_box['essential'] = ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model']
+     check_box['required'] = required_fields
+     check_box['all'] = all_fields
+     type_map = defaultdict(lambda: 'number')
+     type_map['Method'] = 'html'
+     type_map['Language Model'] = type_map['Vision Model'] = type_map['Org'] = 'html'
+     type_map['OpenSource'] = type_map['Verified'] = 'str'
+     check_box['type_map'] = type_map
+     return df, check_box
+
+
+ def generate_table(results):
+
+     res = defaultdict(list)
+     for i, m in enumerate(results):
+         item = results[m]
+         avg = 0
+         for k in META_FIELDS:
+             if k == 'Param (B)':
+                 param = item['META']['Parameters']
+                 res[k].append(float(param.replace('B', '')) if param != '' else None)
+             elif k == 'Method':
+                 name, url = item['META']['Method']
+                 res[k].append(f'<a href="{url}">{name}</a>')
+             else:
+                 s = item['META'][k].replace('\n', '<br>')
+                 s = s.replace(' & ', '<br>')
+                 res[k].append(s)
+
+
+         for d in DATASETS_ALL:
+             key_name = 'Overall'
+             if d in item:
+                 val = float(item[d][key_name])
+                 val = float(f'{val:.1f}')
+                 res[d].append(val)
+             else:
+                 res[d].append(None)
+             if d in DATASETS_ESS:
+                 if d in item and avg is not None:
+                     avg += res[d][-1]
+                 else:
+                     avg = None
+
+         if avg is not None:
+             avg = float(f'{avg / len(DATASETS_ESS):.1f}')
+
+         res['Overall'].append(avg)
+
+     df = pd.DataFrame(res)
+     overall_isna = df[pd.isna(df['Overall'])]
+     overall_notna = df[~pd.isna(df['Overall'])]
+     overall_notna = overall_notna.sort_values('Overall')
+     overall_notna = overall_notna.iloc[::-1]
+     overall_isna = overall_isna.sort_values('MathVista')
+     overall_isna = overall_isna.iloc[::-1]
+     df = pd.concat([overall_notna, overall_isna])
+
+     return df
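Editor's note: `gen_table.py` assumes a particular shape for the JSON fetched from `URL`: a top-level `results` dict keyed by model name, where each entry holds a `META` block plus one dict per benchmark. The sketch below is inferred from the field accesses in `generate_table` and `BUILD_L2_DF`; the model name, links, and numbers are made up for illustration, not real leaderboard data:

```python
# Hypothetical entry illustrating the structure load_results()['results']
# is expected to return.
example_results = {
    'SomeModel-7B': {
        'META': {
            'Method': ['SomeModel-7B', 'https://example.com/somemodel'],  # [display name, link]
            'Parameters': '7B',            # empty string when unknown
            'Language Model': 'SomeLLM-7B',
            'Vision Model': 'SomeViT-L',
            'OpenSource': 'Yes',           # 'Yes' / 'No', drives the OpenSource vs. API filter
            'Verified': 'Yes',
            'Org': 'Some Org',
        },
        # One dict per benchmark; 'Overall' is required, extra keys become L2 columns.
        'MathVista': {'Overall': 45.6},
        'MathVision': {'Overall': 18.2},
    },
}
```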
meta_data.py ADDED
@@ -0,0 +1,26 @@
+ # CONSTANTS-URL
+ URL = "http://opencompass.openxlab.space/assets/MathLB.json"
+ # CONSTANTS-CITATION
+ CITATION_BUTTON_TEXT = r"""\
+ @inproceedings{duan2024vlmevalkit,
+   title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
+   author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
+   booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
+   pages={11198--11201},
+   year={2024}
+ }
+ """
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ # CONSTANTS-TEXT
+ LEADERBORAD_INTRODUCTION = """# Open LMM Reasoning Leaderboard
+
+ This leaderboard aims at providing a comprehensive evaluation of the reasoning capabilities of LMMs. \
+ Currently, it is a collection of evaluation results on multiple multi-modal mathematical reasoning benchmarks.
+ """
+
+ # CONSTANTS-FIELDS
+ DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'MMMath', 'OlympiadBench']
+ DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
+ META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
+ MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+ MODEL_TYPE = ['OpenSource', 'API']
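Editor's note: the `MODEL_SIZE` and `MODEL_TYPE` buckets defined here are exactly the strings that `model_size_flag` and `model_type_flag` in `gen_table.py` compare against. A quick sanity-check sketch, assuming the Space's dependencies (gradio, pandas, numpy) are installed; the input values are made up:

```python
from gen_table import model_size_flag, model_type_flag

print(model_size_flag(7.0, ['4B-10B']))                # True: a 7B model falls in the 4B-10B bucket
print(model_size_flag(float('nan'), ['Unknown']))      # True: a missing size counts as 'Unknown'
print(model_type_flag({'OpenSource': 'No'}, ['API']))  # True: closed models match the 'API' bucket
```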