Alina Lozovskaya committed on
Commit e82b8ef
1 Parent(s): c81dadf

New search logic

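This commit replaces the single model-name search with a combined model-name and license search: filter_queries() in app.py now splits the query on ";", runs plain terms through search_table() (model-name match on the dummy column) and "license:"-prefixed terms through the new search_license() helper, and the search-bar placeholder documents the syntax (e.g. "model_name; license: MIT"). Below is a minimal, standalone sketch of that behaviour, not the app code itself: the column names "model" and "license" are illustrative stand-ins for the AutoEvalColumn fields, and the edge-case handling in filter_queries() is simplified.

import pandas as pd


def apply_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Split the query on ";", match plain terms against the model name and
    # "license:" terms against the license column (both case-insensitive).
    if not query.strip():
        return df

    terms = [t.strip() for t in query.split(";") if t.strip()]
    name_terms = [t for t in terms if not t.startswith("license:")]
    license_terms = [t.removeprefix("license:").strip() for t in terms if t.startswith("license:")]

    result = df
    if name_terms:
        mask = pd.Series(False, index=result.index)
        for term in name_terms:
            mask |= result["model"].str.contains(term, case=False, na=False)
        result = result[mask]
    if license_terms:
        mask = pd.Series(False, index=result.index)
        for term in license_terms:
            mask |= result["license"].str.contains(term, case=False, na=False)
        result = result[mask]
    return result


if __name__ == "__main__":
    sample = pd.DataFrame(
        {"model": ["org/llama-ft", "org/mistral-dpo", "other/gemma"], "license": ["MIT", "apache-2.0", "MIT"]}
    )
    print(apply_search(sample, "mistral"))            # match by model name
    print(apply_search(sample, "license: MIT"))       # match by license only
    print(apply_search(sample, "org; license: MIT"))  # combine both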
.gitignore CHANGED
@@ -4,6 +4,7 @@ __pycache__/
4
  .ipynb_checkpoints
5
  *ipynb
6
  .vscode/
 
7
 
8
  eval-queue/
9
  eval-results/
 
4
  .ipynb_checkpoints
5
  *ipynb
6
  .vscode/
7
+ .DS_Store
8
 
9
  eval-queue/
10
  eval-results/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10.0
app.py CHANGED
@@ -1,16 +1,16 @@
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
- from gradio_space_ci import enable_space_ci
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
 
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
- FAQ_TEXT,
14
  TITLE,
15
  )
16
  from src.display.css_html_js import custom_css
@@ -23,23 +23,32 @@ from src.display.utils import (
23
  TYPES,
24
  AutoEvalColumn,
25
  ModelType,
26
- fields,
27
  WeightType,
28
- Precision
29
  )
30
- from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
31
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
32
- from src.submission.submit import add_new_eval
33
  from src.scripts.update_all_request_files import update_dynamic_files
 
34
  from src.tools.collections import update_collections
35
- from src.tools.plots import (
36
- create_metric_plot_obj,
37
- create_plot_df,
38
- create_scores_df,
39
- )
40
 
41
  # Start ephemeral Spaces on PRs (see config in README.md)
42
- #enable_space_ci()
 
43
 
44
  def restart_space():
45
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -50,32 +59,46 @@ def init_space(full_init: bool = True):
50
  try:
51
  print(EVAL_REQUESTS_PATH)
52
  snapshot_download(
53
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
54
  )
55
  except Exception:
56
  restart_space()
57
  try:
58
  print(DYNAMIC_INFO_PATH)
59
  snapshot_download(
60
- repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
61
  )
62
  except Exception:
63
  restart_space()
64
  try:
65
  print(EVAL_RESULTS_PATH)
66
  snapshot_download(
67
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
68
  )
69
  except Exception:
70
- restart_space()
71
-
72
 
73
  raw_data, original_df = get_leaderboard_df(
74
- results_path=EVAL_RESULTS_PATH,
75
- requests_path=EVAL_REQUESTS_PATH,
76
- dynamic_path=DYNAMIC_INFO_FILE_PATH,
77
- cols=COLS,
78
- benchmark_cols=BENCHMARK_COLS
79
  )
80
  update_collections(original_df.copy())
81
  leaderboard_df = original_df.copy()
@@ -90,7 +113,10 @@ def init_space(full_init: bool = True):
90
 
91
  return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
92
 
93
- leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
94
 
95
 
96
  # Searching and filtering
@@ -103,7 +129,13 @@ def update_table(
103
  hide_models: list,
104
  query: str,
105
  ):
106
- filtered_df = filter_models(df=hidden_df, type_query=type_query, size_query=size_query, precision_query=precision_query, hide_models=hide_models)
107
  filtered_df = filter_queries(query, filtered_df)
108
  df = select_columns(filtered_df, columns)
109
  return df
@@ -111,43 +143,84 @@ def update_table(
111
 
112
  def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
113
  query = request.query_params.get("query") or ""
114
- return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
115
 
116
 
117
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
118
- return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
119
 
120
 
121
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
122
  always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
123
  dummy_col = [AutoEvalColumn.dummy.name]
124
- #AutoEvalColumn.model_type_symbol.name,
125
- #AutoEvalColumn.model.name,
126
  # We use COLS to maintain sorting
127
- filtered_df = df[
128
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
129
- ]
130
  return filtered_df
131
 
132
 
133
- def filter_queries(query: str, filtered_df: pd.DataFrame):
134
- """Added by Abishek"""
135
- final_df = []
136
- if query != "":
137
- queries = [q.strip() for q in query.split(";")]
138
- for _q in queries:
139
- _q = _q.strip()
140
- if _q != "":
141
- temp_filtered_df = search_table(filtered_df, _q)
142
- if len(temp_filtered_df) > 0:
143
- final_df.append(temp_filtered_df)
144
- if len(final_df) > 0:
145
- filtered_df = pd.concat(final_df)
146
- filtered_df = filtered_df.drop_duplicates(
147
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
148
- )
149
 
150
- return filtered_df
151
 
152
 
153
  def filter_models(
@@ -179,12 +252,13 @@ def filter_models(
179
 
180
  return filtered_df
181
 
 
182
  leaderboard_df = filter_models(
183
- df=leaderboard_df,
184
- type_query=[t.to_str(" : ") for t in ModelType],
185
- size_query=list(NUMERIC_INTERVALS.keys()),
186
  precision_query=[i.value.name for i in Precision],
187
- hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs
188
  )
189
 
190
  demo = gr.Blocks(css=custom_css)
@@ -198,7 +272,7 @@ with demo:
198
  with gr.Column():
199
  with gr.Row():
200
  search_bar = gr.Textbox(
201
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
202
  show_label=False,
203
  elem_id="search-bar",
204
  )
@@ -221,12 +295,12 @@ with demo:
221
  with gr.Row():
222
  hide_models = gr.CheckboxGroup(
223
  label="Hide models",
224
- choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
225
  value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
226
- interactive=True
227
  )
228
  with gr.Column(min_width=320):
229
- #with gr.Box(elem_id="box-filter"):
230
  filter_columns_type = gr.CheckboxGroup(
231
  label="Model types",
232
  choices=[t.to_str() for t in ModelType],
@@ -260,7 +334,7 @@ with demo:
260
  elem_id="leaderboard-table",
261
  interactive=False,
262
  visible=True,
263
- #column_widths=["2%", "33%"]
264
  )
265
 
266
  # Dummy leaderboard for handling the case when the user uses backspace key
@@ -301,8 +375,14 @@ with demo:
301
  )
302
  # Check query parameter once at startup and update search bar + hidden component
303
  demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
304
-
305
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, hide_models]:
306
  selector.change(
307
  update_table,
308
  [
@@ -326,14 +406,14 @@ with demo:
326
  [AutoEvalColumn.average.name],
327
  title="Average of Top Scores and Human Baseline Over Time (from last update)",
328
  )
329
- gr.Plot(value=chart, min_width=500)
330
  with gr.Column():
331
  chart = create_metric_plot_obj(
332
  plot_df,
333
  BENCHMARK_COLS,
334
  title="Top Scores and Human Baseline Over Time (from last update)",
335
  )
336
- gr.Plot(value=chart, min_width=500)
337
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
338
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
339
 
@@ -441,8 +521,8 @@ with demo:
441
  )
442
 
443
  scheduler = BackgroundScheduler()
444
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
445
- scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
446
  scheduler.start()
447
 
448
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import os
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
 
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
+ FAQ_TEXT,
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
 
14
  TITLE,
15
  )
16
  from src.display.css_html_js import custom_css
 
23
  TYPES,
24
  AutoEvalColumn,
25
  ModelType,
26
+ Precision,
27
  WeightType,
28
+ fields,
29
+ )
30
+ from src.envs import (
31
+ API,
32
+ DYNAMIC_INFO_FILE_PATH,
33
+ DYNAMIC_INFO_PATH,
34
+ DYNAMIC_INFO_REPO,
35
+ EVAL_REQUESTS_PATH,
36
+ EVAL_RESULTS_PATH,
37
+ H4_TOKEN,
38
+ IS_PUBLIC,
39
+ QUEUE_REPO,
40
+ REPO_ID,
41
+ RESULTS_REPO,
42
  )
 
43
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
44
  from src.scripts.update_all_request_files import update_dynamic_files
45
+ from src.submission.submit import add_new_eval
46
  from src.tools.collections import update_collections
47
+ from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
48
 
49
  # Start ephemeral Spaces on PRs (see config in README.md)
50
+ # enable_space_ci()
51
+
52
 
53
  def restart_space():
54
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
59
  try:
60
  print(EVAL_REQUESTS_PATH)
61
  snapshot_download(
62
+ repo_id=QUEUE_REPO,
63
+ local_dir=EVAL_REQUESTS_PATH,
64
+ repo_type="dataset",
65
+ tqdm_class=None,
66
+ etag_timeout=30,
67
+ max_workers=8,
68
  )
69
  except Exception:
70
  restart_space()
71
  try:
72
  print(DYNAMIC_INFO_PATH)
73
  snapshot_download(
74
+ repo_id=DYNAMIC_INFO_REPO,
75
+ local_dir=DYNAMIC_INFO_PATH,
76
+ repo_type="dataset",
77
+ tqdm_class=None,
78
+ etag_timeout=30,
79
+ max_workers=8,
80
  )
81
  except Exception:
82
  restart_space()
83
  try:
84
  print(EVAL_RESULTS_PATH)
85
  snapshot_download(
86
+ repo_id=RESULTS_REPO,
87
+ local_dir=EVAL_RESULTS_PATH,
88
+ repo_type="dataset",
89
+ tqdm_class=None,
90
+ etag_timeout=30,
91
+ max_workers=8,
92
  )
93
  except Exception:
94
+ restart_space()
 
95
 
96
  raw_data, original_df = get_leaderboard_df(
97
+ results_path=EVAL_RESULTS_PATH,
98
+ requests_path=EVAL_REQUESTS_PATH,
99
+ dynamic_path=DYNAMIC_INFO_FILE_PATH,
100
+ cols=COLS,
101
+ benchmark_cols=BENCHMARK_COLS,
102
  )
103
  update_collections(original_df.copy())
104
  leaderboard_df = original_df.copy()
 
113
 
114
  return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
115
 
116
+
117
+ leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
118
+ init_space()
119
+ )
120
 
121
 
122
  # Searching and filtering
 
129
  hide_models: list,
130
  query: str,
131
  ):
132
+ filtered_df = filter_models(
133
+ df=hidden_df,
134
+ type_query=type_query,
135
+ size_query=size_query,
136
+ precision_query=precision_query,
137
+ hide_models=hide_models,
138
+ )
139
  filtered_df = filter_queries(query, filtered_df)
140
  df = select_columns(filtered_df, columns)
141
  return df
 
143
 
144
  def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
145
  query = request.query_params.get("query") or ""
146
+ return (
147
+ query,
148
+ query,
149
+ ) # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
150
 
151
 
152
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
153
+ return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False, na=False))]
154
+
155
+
156
+ def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
157
+ return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
158
 
159
 
160
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
161
  always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
162
  dummy_col = [AutoEvalColumn.dummy.name]
163
+ # AutoEvalColumn.model_type_symbol.name,
164
+ # AutoEvalColumn.model.name,
165
  # We use COLS to maintain sorting
166
+ filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
 
 
167
  return filtered_df
168
 
169
 
170
+ def filter_queries(query: str, df: pd.DataFrame):
171
+ """Added by Abishek + Updated by alozowski"""
172
+ tmp_result_df = []
173
+
174
+ # Empty query return the same df
175
+ if query == "":
176
+ return df
177
+
178
+ all_queries = [q.strip() for q in query.split(";")]
179
+ license_queries = []
180
+
181
+ # Handling model name search
182
+ for _q in all_queries:
183
+ if _q.startswith("license:"):
184
+ # Skipping license query here
185
+ license_queries.append(_q)
186
+ continue
187
+
188
+ if _q != "":
189
+ tmp_df = search_table(df, _q)
190
+ if len(tmp_df) > 0:
191
+ tmp_result_df.append(tmp_df)
192
+
193
+ if not tmp_result_df and not license_queries:
194
+ # Nothing is found, no license_queries -> return empty df
195
+ return pd.DataFrame(columns=df.columns)
196
+
197
+ if tmp_result_df:
198
+ df = pd.concat(tmp_result_df)
199
+ df = df.drop_duplicates(
200
+ subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
201
+ )
202
+
203
+ if not license_queries:
204
+ return df
205
+
206
+ tmp_result_df = []
207
+ for _q in license_queries:
208
+ _q = _q.replace("license:", "").strip()
209
+ if _q != "":
210
+ tmp_df = search_license(df, _q)
211
+ if len(tmp_df) > 0:
212
+ tmp_result_df.append(tmp_df)
213
+
214
+ if not tmp_result_df:
215
+ # Nothing is found, return empty df
216
+ return pd.DataFrame(columns=df.columns)
217
+
218
+ df = pd.concat(tmp_result_df)
219
+ df = df.drop_duplicates(
220
+ subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
221
+ )
222
 
223
+ return df
224
 
225
 
226
  def filter_models(
 
252
 
253
  return filtered_df
254
 
255
+
256
  leaderboard_df = filter_models(
257
+ df=leaderboard_df,
258
+ type_query=[t.to_str(" : ") for t in ModelType],
259
+ size_query=list(NUMERIC_INTERVALS.keys()),
260
  precision_query=[i.value.name for i in Precision],
261
+ hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs
262
  )
263
 
264
  demo = gr.Blocks(css=custom_css)
 
272
  with gr.Column():
273
  with gr.Row():
274
  search_bar = gr.Textbox(
275
+ placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
276
  show_label=False,
277
  elem_id="search-bar",
278
  )
 
295
  with gr.Row():
296
  hide_models = gr.CheckboxGroup(
297
  label="Hide models",
298
+ choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
299
  value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
300
+ interactive=True,
301
  )
302
  with gr.Column(min_width=320):
303
+ # with gr.Box(elem_id="box-filter"):
304
  filter_columns_type = gr.CheckboxGroup(
305
  label="Model types",
306
  choices=[t.to_str() for t in ModelType],
 
334
  elem_id="leaderboard-table",
335
  interactive=False,
336
  visible=True,
337
+ # column_widths=["2%", "33%"]
338
  )
339
 
340
  # Dummy leaderboard for handling the case when the user uses backspace key
 
375
  )
376
  # Check query parameter once at startup and update search bar + hidden component
377
  demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
378
+
379
+ for selector in [
380
+ shown_columns,
381
+ filter_columns_type,
382
+ filter_columns_precision,
383
+ filter_columns_size,
384
+ hide_models,
385
+ ]:
386
  selector.change(
387
  update_table,
388
  [
 
406
  [AutoEvalColumn.average.name],
407
  title="Average of Top Scores and Human Baseline Over Time (from last update)",
408
  )
409
+ gr.Plot(value=chart, min_width=500)
410
  with gr.Column():
411
  chart = create_metric_plot_obj(
412
  plot_df,
413
  BENCHMARK_COLS,
414
  title="Top Scores and Human Baseline Over Time (from last update)",
415
  )
416
+ gr.Plot(value=chart, min_width=500)
417
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
418
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
419
 
 
521
  )
522
 
523
  scheduler = BackgroundScheduler()
524
+ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
525
+ scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
526
  scheduler.start()
527
 
528
+ demo.queue(default_concurrency_limit=40).launch()
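Because load_query() reads the "query" URL parameter once at startup and writes it into the search bar, searches using the new syntax can be shared as plain links. A tiny illustrative helper follows; the helper itself is not part of this commit, and the Space URL is assumed from the discussion links elsewhere in the diff.

from urllib.parse import urlencode


def share_url(base: str, query: str) -> str:
    # Encode the search text as the "query" parameter that load_query() reads back.
    return f"{base}?{urlencode({'query': query})}"


print(share_url("https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard", "mistral; license: apache-2.0"))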
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,9 +1,9 @@
1
  [tool.ruff]
2
  # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
- select = ["E", "F"]
4
- ignore = ["E501"] # line too long (black is taking care of this)
5
  line-length = 119
6
- fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
 
8
  [tool.isort]
9
  profile = "black"
@@ -11,3 +11,36 @@ line_length = 119
11
 
12
  [tool.black]
13
  line-length = 119
1
  [tool.ruff]
2
  # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ lint.select = ["E", "F"]
4
+ lint.ignore = ["E501"] # line too long (black is taking care of this)
5
  line-length = 119
6
+ lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
 
8
  [tool.isort]
9
  profile = "black"
 
11
 
12
  [tool.black]
13
  line-length = 119
14
+
15
+ [tool.poetry]
16
+ name = "open-llm-leaderboard"
17
+ version = "0.1.0"
18
+ description = ""
19
+ authors = ["Alina Lozovskaia <alina.lozovskaia@huggingface.co>"]
20
+ readme = "README.md"
21
+
22
+ [tool.poetry.dependencies]
23
+ python = "3.10.0"
24
+ apscheduler = "3.10.1"
25
+ black = "23.11.0"
26
+ click = "8.1.3"
27
+ datasets = "2.14.5"
28
+ huggingface-hub = ">=0.18.0"
29
+ matplotlib = "3.7.1"
30
+ numpy = "1.24.2"
31
+ pandas = "2.0.0"
32
+ plotly = "5.14.1"
33
+ python-dateutil = "2.8.2"
34
+ requests = "2.28.2"
35
+ sentencepiece = "^0.2.0"
36
+ tqdm = "4.65.0"
37
+ transformers = "4.39.0"
38
+ tokenizers = ">=0.15.0"
39
+ gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.1"}
40
+ gradio = "4.9.0"
41
+ isort = "^5.13.2"
42
+ ruff = "^0.3.5"
43
+
44
+ [build-system]
45
+ requires = ["poetry-core"]
46
+ build-backend = "poetry.core.masonry.api"
src/display/about.py CHANGED
@@ -12,7 +12,7 @@ icons = f"""
12
  - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
13
  - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
14
  """
15
- LLM_BENCHMARKS_TEXT = f"""
16
  ## ABOUT
17
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
18
 
 
12
  - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
13
  - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
14
  """
15
+ LLM_BENCHMARKS_TEXT = """
16
  ## ABOUT
17
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
18
 
src/display/formatting.py CHANGED
@@ -1,12 +1,8 @@
1
- import os
2
- from datetime import datetime, timezone
3
-
4
  from huggingface_hub import HfApi
5
- from huggingface_hub.hf_api import ModelInfo
6
-
7
 
8
  API = HfApi()
9
 
 
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
1
  from huggingface_hub import HfApi
 
 
2
 
3
  API = HfApi()
4
 
5
+
6
  def model_hyperlink(link, model_name):
7
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
8
 
src/display/utils.py CHANGED
@@ -3,6 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
 
6
  def fields(raw_class):
7
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
8
 
@@ -13,6 +14,7 @@ class Task:
13
  metric: str
14
  col_name: str
15
 
 
16
  class Tasks(Enum):
17
  arc = Task("arc:challenge", "acc_norm", "ARC")
18
  hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
@@ -21,6 +23,7 @@ class Tasks(Enum):
21
  winogrande = Task("winogrande", "acc", "Winogrande")
22
  gsm8k = Task("gsm8k", "acc", "GSM8K")
23
 
 
24
  # These classes are for user facing column names,
25
  # to avoid having to change them all around the code
26
  # when a modif is needed
@@ -33,11 +36,12 @@ class ColumnContent:
33
  never_hidden: bool = False
34
  dummy: bool = False
35
 
 
36
  auto_eval_column_dict = []
37
  # Init
38
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
- #Scores
41
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
42
  for task in Tasks:
43
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
@@ -50,7 +54,9 @@ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "
50
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
51
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
52
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
53
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
 
 
54
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
55
  auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
56
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
@@ -60,6 +66,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
60
  # We use make dataclass to dynamically fill the scores from Tasks
61
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
62
 
 
63
  @dataclass(frozen=True)
64
  class EvalQueueColumn: # Queue column
65
  model = ColumnContent("model", "markdown", True)
@@ -112,10 +119,11 @@ human_baseline_row = {
112
  AutoEvalColumn.flagged.name: False,
113
  }
114
 
 
115
  @dataclass
116
  class ModelDetails:
117
  name: str
118
- symbol: str = "" # emoji, only for the model type
119
 
120
 
121
  class ModelType(Enum):
@@ -143,11 +151,13 @@ class ModelType(Enum):
143
  return ModelType.merges
144
  return ModelType.Unknown
145
 
 
146
  class WeightType(Enum):
147
  Adapter = ModelDetails("Adapter")
148
  Original = ModelDetails("Original")
149
  Delta = ModelDetails("Delta")
150
 
 
151
  class Precision(Enum):
152
  float16 = ModelDetails("float16")
153
  bfloat16 = ModelDetails("bfloat16")
@@ -168,8 +178,6 @@ class Precision(Enum):
168
  if precision in ["GPTQ", "None"]:
169
  return Precision.qt_GPTQ
170
  return Precision.Unknown
171
-
172
-
173
 
174
 
175
  # Column selection
 
3
 
4
  import pandas as pd
5
 
6
+
7
  def fields(raw_class):
8
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
9
 
 
14
  metric: str
15
  col_name: str
16
 
17
+
18
  class Tasks(Enum):
19
  arc = Task("arc:challenge", "acc_norm", "ARC")
20
  hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
 
23
  winogrande = Task("winogrande", "acc", "Winogrande")
24
  gsm8k = Task("gsm8k", "acc", "GSM8K")
25
 
26
+
27
  # These classes are for user facing column names,
28
  # to avoid having to change them all around the code
29
  # when a modif is needed
 
36
  never_hidden: bool = False
37
  dummy: bool = False
38
 
39
+
40
  auto_eval_column_dict = []
41
  # Init
42
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
43
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
44
+ # Scores
45
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
46
  for task in Tasks:
47
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
54
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
55
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
56
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
57
+ auto_eval_column_dict.append(
58
+ ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
59
+ )
60
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
61
  auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
62
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 
66
  # We use make dataclass to dynamically fill the scores from Tasks
67
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
68
 
69
+
70
  @dataclass(frozen=True)
71
  class EvalQueueColumn: # Queue column
72
  model = ColumnContent("model", "markdown", True)
 
119
  AutoEvalColumn.flagged.name: False,
120
  }
121
 
122
+
123
  @dataclass
124
  class ModelDetails:
125
  name: str
126
+ symbol: str = "" # emoji, only for the model type
127
 
128
 
129
  class ModelType(Enum):
 
151
  return ModelType.merges
152
  return ModelType.Unknown
153
 
154
+
155
  class WeightType(Enum):
156
  Adapter = ModelDetails("Adapter")
157
  Original = ModelDetails("Original")
158
  Delta = ModelDetails("Delta")
159
 
160
+
161
  class Precision(Enum):
162
  float16 = ModelDetails("float16")
163
  bfloat16 = ModelDetails("bfloat16")
 
178
  if precision in ["GPTQ", "None"]:
179
  return Precision.qt_GPTQ
180
  return Precision.Unknown
 
 
181
 
182
 
183
  # Column selection
src/envs.py CHANGED
@@ -15,7 +15,7 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
15
 
16
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
17
 
18
- CACHE_PATH=os.getenv("HF_HOME", ".")
19
 
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
15
 
16
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
17
 
18
+ CACHE_PATH = os.getenv("HF_HOME", ".")
19
 
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
src/leaderboard/filter_models.py CHANGED
@@ -29,7 +29,7 @@ FLAGGED_MODELS = {
29
  "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
30
  "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
31
  "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
32
- "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
33
  "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
34
  "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
35
  "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
@@ -43,7 +43,6 @@ FLAGGED_MODELS = {
43
  "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
44
  "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
45
  "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
46
- "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
47
  "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
48
  "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
49
  "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
@@ -101,19 +100,19 @@ FLAGGED_MODELS = {
101
  "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
102
  "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
103
  "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
104
- "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
105
  # MoErges
106
- "cloudyu/Yi-34Bx2-MoE-60B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
107
- "cloudyu/Mixtral_34Bx2_MoE_60B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
108
- "gagan3012/MetaModel_moe":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
109
- "macadeliccc/SOLAR-math-2x10.7b-v0.2":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
110
- "cloudyu/Mixtral_7Bx2_MoE":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
111
- "macadeliccc/SOLAR-math-2x10.7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
112
- "macadeliccc/Orca-SOLAR-4x10.7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
113
- "macadeliccc/piccolo-8x7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
114
- "cloudyu/Mixtral_7Bx4_MOE_24B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
115
- "macadeliccc/laser-dolphin-mixtral-2x7b-dpo":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
116
- "macadeliccc/polyglot-math-4x7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
117
  # Other - contamination mostly
118
  "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
119
  "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
@@ -124,16 +123,16 @@ FLAGGED_MODELS = {
124
  # Models which have been requested by orgs to not be submitted on the leaderboard
125
  DO_NOT_SUBMIT_MODELS = [
126
  "Voicelab/trurl-2-13b", # trained on MMLU
127
- "TigerResearch/tigerbot-70b-chat", # per authors request
128
- "TigerResearch/tigerbot-70b-chat-v2", # per authors request
129
- "TigerResearch/tigerbot-70b-chat-v4-4k", # per authors request
130
  ]
131
 
132
 
133
  def flag_models(leaderboard_data: list[dict]):
134
  for model_data in leaderboard_data:
135
  # Merges and moes are flagged automatically
136
- if model_data[AutoEvalColumn.flagged.name] == True:
137
  flag_key = "merged"
138
  else:
139
  flag_key = model_data["model_name_for_query"]
@@ -144,9 +143,9 @@ def flag_models(leaderboard_data: list[dict]):
144
  FLAGGED_MODELS[flag_key],
145
  f"See discussion #{issue_num}",
146
  )
147
- model_data[
148
- AutoEvalColumn.model.name
149
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
150
  model_data[AutoEvalColumn.flagged.name] = True
151
  else:
152
  model_data[AutoEvalColumn.flagged.name] = False
 
29
  "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
30
  "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
31
  "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
32
+ "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
33
  "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
34
  "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
35
  "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
 
43
  "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
44
  "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
45
  "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
 
46
  "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
47
  "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
48
  "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
 
100
  "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
101
  "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
102
  "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
103
+ "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
104
  # MoErges
105
+ "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
106
+ "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
107
+ "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
108
+ "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
109
+ "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
110
+ "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
111
+ "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
112
+ "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
113
+ "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
114
+ "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
115
+ "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
116
  # Other - contamination mostly
117
  "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
118
  "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
 
123
  # Models which have been requested by orgs to not be submitted on the leaderboard
124
  DO_NOT_SUBMIT_MODELS = [
125
  "Voicelab/trurl-2-13b", # trained on MMLU
126
+ "TigerResearch/tigerbot-70b-chat", # per authors request
127
+ "TigerResearch/tigerbot-70b-chat-v2", # per authors request
128
+ "TigerResearch/tigerbot-70b-chat-v4-4k", # per authors request
129
  ]
130
 
131
 
132
  def flag_models(leaderboard_data: list[dict]):
133
  for model_data in leaderboard_data:
134
  # Merges and moes are flagged automatically
135
+ if model_data[AutoEvalColumn.flagged.name]:
136
  flag_key = "merged"
137
  else:
138
  flag_key = model_data["model_name_for_query"]
 
143
  FLAGGED_MODELS[flag_key],
144
  f"See discussion #{issue_num}",
145
  )
146
+ model_data[AutoEvalColumn.model.name] = (
147
+ f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
148
+ )
149
  model_data[AutoEvalColumn.flagged.name] = True
150
  else:
151
  model_data[AutoEvalColumn.flagged.name] = False
src/leaderboard/read_evals.py CHANGED
@@ -7,29 +7,27 @@ from dataclasses import dataclass
7
  import dateutil
8
  import numpy as np
9
 
10
- from huggingface_hub import ModelCard
11
-
12
  from src.display.formatting import make_clickable_model
13
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
14
 
15
 
16
  @dataclass
17
  class EvalResult:
18
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
- revision: str # commit hash, "" if main
24
  results: dict
25
  precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown" # From config file
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
- date: str = "" # submission date of request file
33
  still_on_hub: bool = True
34
  is_merge: bool = False
35
  flagged: bool = False
@@ -96,8 +94,8 @@ class EvalResult:
96
  org=org,
97
  model=model,
98
  results=results,
99
- precision=precision,
100
- revision= config.get("model_sha", ""),
101
  )
102
 
103
  def update_with_request_file(self, requests_path):
@@ -113,7 +111,7 @@ class EvalResult:
113
  self.date = request.get("submitted_time", "")
114
  self.architecture = request.get("architectures", "Unknown")
115
  self.status = request.get("status", "FAILED")
116
- except Exception as e:
117
  self.status = "FAILED"
118
  print(f"Could not find request file for {self.org}/{self.model}")
119
 
@@ -123,7 +121,6 @@ class EvalResult:
123
  self.still_on_hub = file_dict["still_on_hub"]
124
  self.tags = file_dict.get("tags", [])
125
  self.flagged = any("flagged" in tag for tag in self.tags)
126
-
127
 
128
  def to_dict(self):
129
  """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -145,7 +142,7 @@ class EvalResult:
145
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
146
  AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
147
  AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
148
- AutoEvalColumn.flagged.name: self.flagged
149
  }
150
 
151
  for task in Tasks:
@@ -168,10 +165,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
168
  for tmp_request_file in request_files:
169
  with open(tmp_request_file, "r") as f:
170
  req_content = json.load(f)
171
- if (
172
- req_content["status"] in ["FINISHED"]
173
- and req_content["precision"] == precision.split(".")[-1]
174
- ):
175
  request_file = tmp_request_file
176
  return request_file
177
 
@@ -207,7 +201,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
207
  if eval_result.full_model in dynamic_data:
208
  eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
209
  # Hardcoding because of gating problem
210
- if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
211
  eval_result.still_on_hub = True
212
 
213
  # Store results of same eval together
@@ -221,7 +215,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
221
  for v in eval_results.values():
222
  try:
223
  if v.status == "FINISHED":
224
- v.to_dict() # we test if the dict version is complete
225
  results.append(v)
226
  except KeyError: # not all eval values present
227
  continue
 
7
  import dateutil
8
  import numpy as np
9
 
 
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
17
+ eval_name: str # org_model_precision (uid)
18
+ full_model: str # org/model (path on hub)
19
+ org: str
20
  model: str
21
+ revision: str # commit hash, "" if main
22
  results: dict
23
  precision: Precision = Precision.Unknown
24
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
25
+ weight_type: WeightType = WeightType.Original # Original or Adapter
26
+ architecture: str = "Unknown" # From config file
27
  license: str = "?"
28
  likes: int = 0
29
  num_params: int = 0
30
+ date: str = "" # submission date of request file
31
  still_on_hub: bool = True
32
  is_merge: bool = False
33
  flagged: bool = False
 
94
  org=org,
95
  model=model,
96
  results=results,
97
+ precision=precision,
98
+ revision=config.get("model_sha", ""),
99
  )
100
 
101
  def update_with_request_file(self, requests_path):
 
111
  self.date = request.get("submitted_time", "")
112
  self.architecture = request.get("architectures", "Unknown")
113
  self.status = request.get("status", "FAILED")
114
+ except Exception:
115
  self.status = "FAILED"
116
  print(f"Could not find request file for {self.org}/{self.model}")
117
 
 
121
  self.still_on_hub = file_dict["still_on_hub"]
122
  self.tags = file_dict.get("tags", [])
123
  self.flagged = any("flagged" in tag for tag in self.tags)
 
124
 
125
  def to_dict(self):
126
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
142
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
143
  AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
144
  AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
145
+ AutoEvalColumn.flagged.name: self.flagged,
146
  }
147
 
148
  for task in Tasks:
 
165
  for tmp_request_file in request_files:
166
  with open(tmp_request_file, "r") as f:
167
  req_content = json.load(f)
168
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
169
  request_file = tmp_request_file
170
  return request_file
171
 
 
201
  if eval_result.full_model in dynamic_data:
202
  eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
203
  # Hardcoding because of gating problem
204
+ if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
205
  eval_result.still_on_hub = True
206
 
207
  # Store results of same eval together
 
215
  for v in eval_results.values():
216
  try:
217
  if v.status == "FINISHED":
218
+ v.to_dict() # we test if the dict version is complete
219
  results.append(v)
220
  except KeyError: # not all eval values present
221
  continue
src/populate.py CHANGED
@@ -9,7 +9,9 @@ from src.leaderboard.filter_models import filter_models_flags
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
- def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
 
13
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
  all_data_json.append(baseline_row)
 
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
+ def get_leaderboard_df(
13
+ results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list
14
+ ) -> pd.DataFrame:
15
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
16
  all_data_json = [v.to_dict() for v in raw_data]
17
  all_data_json.append(baseline_row)
src/scripts/create_request_file.py CHANGED
@@ -7,8 +7,8 @@ import click
7
  from colorama import Fore
8
  from huggingface_hub import HfApi, snapshot_download
9
 
10
- from src.submission.check_validity import get_model_size
11
  from src.display.utils import ModelType, WeightType
 
12
 
13
  EVAL_REQUESTS_PATH = "eval-queue"
14
  QUEUE_REPO = "open-llm-leaderboard/requests"
 
7
  from colorama import Fore
8
  from huggingface_hub import HfApi, snapshot_download
9
 
 
10
  from src.display.utils import ModelType, WeightType
11
+ from src.submission.check_validity import get_model_size
12
 
13
  EVAL_REQUESTS_PATH = "eval-queue"
14
  QUEUE_REPO = "open-llm-leaderboard/requests"
src/scripts/update_all_request_files.py CHANGED
@@ -1,37 +1,41 @@
1
- from huggingface_hub import ModelFilter, snapshot_download
2
- from huggingface_hub import ModelCard
3
-
4
  import json
5
  import os
6
  import time
7
 
8
- from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
9
- from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
10
 
11
  def update_one_model(model_id, data, models_on_the_hub):
12
  # Model no longer on the hub at all
13
  if model_id not in models_on_the_hub:
14
- data['still_on_hub'] = False
15
- data['likes'] = 0
16
- data['downloads'] = 0
17
- data['created_at'] = ""
18
  data["tags"] = []
19
  return data
20
 
21
  # Grabbing model parameters
22
  model_cfg = models_on_the_hub[model_id]
23
- data['likes'] = model_cfg.likes
24
- data['downloads'] = model_cfg.downloads
25
- data['created_at'] = str(model_cfg.created_at)
26
- data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
27
 
28
  # Grabbing model details
29
  model_name = model_id
30
  if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
31
  if isinstance(model_cfg.card_data.base_model, str):
32
- model_name = model_cfg.card_data.base_model # for adapters, we look at the parent model
33
  still_on_hub, _, _ = is_model_on_hub(
34
- model_name=model_name, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
35
  )
36
  # If the model doesn't have a model card or a license, we consider it's deleted
37
  if still_on_hub:
@@ -42,13 +46,14 @@ def update_one_model(model_id, data, models_on_the_hub):
42
  except Exception:
43
  model_card = None
44
  still_on_hub = False
45
- data['still_on_hub'] = still_on_hub
46
 
47
  tags = get_model_tags(model_card, model_id) if still_on_hub else []
48
 
49
  data["tags"] = tags
50
  return data
51
 
 
52
  def update_models(file_path, models_on_the_hub):
53
  """
54
  Search through all JSON files in the specified root folder and its subfolders,
@@ -60,9 +65,7 @@ def update_models(file_path, models_on_the_hub):
60
  for model_id in model_infos.keys():
61
  seen_models.append(model_id)
62
  model_infos[model_id] = update_one_model(
63
- model_id = model_id,
64
- data=model_infos[model_id],
65
- models_on_the_hub=models_on_the_hub
66
  )
67
 
68
  # If new requests files have been created since we started all this
@@ -70,7 +73,8 @@ def update_models(file_path, models_on_the_hub):
70
  all_models = []
71
  try:
72
  for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
73
- if ix == 0: continue
 
74
  for file in files:
75
  if "eval_request" in file:
76
  path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
@@ -81,18 +85,14 @@ def update_models(file_path, models_on_the_hub):
81
 
82
  for model_id in all_models:
83
  if model_id not in seen_models:
84
- model_infos[model_id] = update_one_model(
85
- model_id = model_id,
86
- data={},
87
- models_on_the_hub=models_on_the_hub
88
- )
89
 
90
- with open(file_path, 'w') as f:
91
  json.dump(model_infos, f, indent=2)
92
 
 
93
  def update_dynamic_files():
94
- """ This will only update metadata for models already linked in the repo, not add missing ones.
95
- """
96
  snapshot_download(
97
  repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
98
  )
@@ -101,13 +101,15 @@ def update_dynamic_files():
101
  # Get models
102
  start = time.time()
103
 
104
- models = list(API.list_models(
105
- #filter=ModelFilter(task="text-generation"),
106
- full=False,
107
- cardData=True,
108
- fetch_config=True,
109
- ))
110
- id_to_model = {model.id : model for model in models}
 
 
111
 
112
  print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
113
 
@@ -122,7 +124,6 @@ def update_dynamic_files():
122
  path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
123
  repo_id=DYNAMIC_INFO_REPO,
124
  repo_type="dataset",
125
- commit_message=f"Daily request file update.",
126
  )
127
- print(f"UPDATE_DYNAMIC: pushed to hub")
128
-
1
  import json
2
  import os
3
  import time
4
 
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
8
+ from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
9
+
10
 
11
  def update_one_model(model_id, data, models_on_the_hub):
12
  # Model no longer on the hub at all
13
  if model_id not in models_on_the_hub:
14
+ data["still_on_hub"] = False
15
+ data["likes"] = 0
16
+ data["downloads"] = 0
17
+ data["created_at"] = ""
18
  data["tags"] = []
19
  return data
20
 
21
  # Grabbing model parameters
22
  model_cfg = models_on_the_hub[model_id]
23
+ data["likes"] = model_cfg.likes
24
+ data["downloads"] = model_cfg.downloads
25
+ data["created_at"] = str(model_cfg.created_at)
26
+ data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
27
 
28
  # Grabbing model details
29
  model_name = model_id
30
  if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
31
  if isinstance(model_cfg.card_data.base_model, str):
32
+ model_name = model_cfg.card_data.base_model # for adapters, we look at the parent model
33
  still_on_hub, _, _ = is_model_on_hub(
34
+ model_name=model_name,
35
+ revision=data.get("revision"),
36
+ trust_remote_code=True,
37
+ test_tokenizer=False,
38
+ token=H4_TOKEN,
39
  )
40
  # If the model doesn't have a model card or a license, we consider it's deleted
41
  if still_on_hub:
 
46
  except Exception:
47
  model_card = None
48
  still_on_hub = False
49
+ data["still_on_hub"] = still_on_hub
50
 
51
  tags = get_model_tags(model_card, model_id) if still_on_hub else []
52
 
53
  data["tags"] = tags
54
  return data
55
 
56
+
57
  def update_models(file_path, models_on_the_hub):
58
  """
59
  Search through all JSON files in the specified root folder and its subfolders,
 
65
  for model_id in model_infos.keys():
66
  seen_models.append(model_id)
67
  model_infos[model_id] = update_one_model(
68
+ model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
 
 
69
  )
70
 
71
  # If new requests files have been created since we started all this
 
73
  all_models = []
74
  try:
75
  for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
76
+ if ix == 0:
77
+ continue
78
  for file in files:
79
  if "eval_request" in file:
80
  path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
 
85
 
86
  for model_id in all_models:
87
  if model_id not in seen_models:
88
+ model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
89
 
90
+ with open(file_path, "w") as f:
91
  json.dump(model_infos, f, indent=2)
92
 
93
+
94
  def update_dynamic_files():
95
+ """This will only update metadata for models already linked in the repo, not add missing ones."""
 
96
  snapshot_download(
97
  repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
98
  )
 
101
  # Get models
102
  start = time.time()
103
 
104
+ models = list(
105
+ API.list_models(
106
+ # filter=ModelFilter(task="text-generation"),
107
+ full=False,
108
+ cardData=True,
109
+ fetch_config=True,
110
+ )
111
+ )
112
+ id_to_model = {model.id: model for model in models}
113
 
114
  print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
115
 
 
124
  path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
125
  repo_id=DYNAMIC_INFO_REPO,
126
  repo_type="dataset",
127
+ commit_message="Daily request file update.",
128
  )
129
+ print("UPDATE_DYNAMIC: pushed to hub")
 
src/submission/check_validity.py CHANGED
@@ -24,10 +24,14 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
24
  # Enforce license metadata
25
  if card.data.license is None:
26
  if not ("license_name" in card.data and "license_link" in card.data):
27
- return False, (
28
- "License not found. Please add a license to your model card using the `license` metadata or a"
29
- " `license_name`/`license_link` pair."
30
- ), None
31
 
32
  # Enforce card content
33
  if len(card.text) < 200:
@@ -36,27 +40,33 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
36
  return True, "", card
37
 
38
 
39
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
 
 
40
  try:
41
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
 
 
42
  if test_tokenizer:
43
  try:
44
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
 
45
  except ValueError as e:
 
 
46
  return (
47
  False,
48
- f"uses a tokenizer which is not in a transformers release: {e}",
49
- None
50
  )
51
- except Exception as e:
52
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
53
  return True, None, config
54
 
55
- except ValueError as e:
56
  return (
57
  False,
58
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
59
- None
60
  )
61
 
62
  except Exception as e:
@@ -64,6 +74,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
64
  return True, "uses a gated model.", None
65
  return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
66
 
 
67
  def get_model_size(model_info: ModelInfo, precision: str):
68
  size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
69
  safetensors = None
@@ -79,16 +90,18 @@ def get_model_size(model_info: ModelInfo, precision: str):
79
  size_match = re.search(size_pattern, model_info.id.lower())
80
  model_size = size_match.group(0)
81
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
82
- except AttributeError as e:
83
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
84
 
85
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
86
  model_size = size_factor * model_size
87
  return model_size
88
 
 
89
  def get_model_arch(model_info: ModelInfo):
90
  return model_info.config.get("architectures", "Unknown")
91
 
 
92
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
93
  if org_or_user not in users_to_submission_dates:
94
  return True, ""
@@ -135,6 +148,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
135
 
136
  return set(file_names), users_to_submission_dates
137
 
 
138
  def get_model_tags(model_card, model: str):
139
  is_merge_from_metadata = False
140
  is_moe_from_metadata = False
@@ -143,10 +157,14 @@ def get_model_tags(model_card, model: str):
143
  if model_card is None:
144
  return tags
145
  if model_card.data.tags:
146
- is_merge_from_metadata = any([tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]])
 
 
147
  is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
148
 
149
- is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"])
 
 
150
  if is_merge_from_model_card or is_merge_from_metadata:
151
  tags.append("merge")
152
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
 
24
  # Enforce license metadata
25
  if card.data.license is None:
26
  if not ("license_name" in card.data and "license_link" in card.data):
27
+ return (
28
+ False,
29
+ (
30
+ "License not found. Please add a license to your model card using the `license` metadata or a"
31
+ " `license_name`/`license_link` pair."
32
+ ),
33
+ None,
34
+ )
35
 
36
  # Enforce card content
37
  if len(card.text) < 200:
 
40
  return True, "", card
41
 
42
 
43
+ def is_model_on_hub(
44
+ model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
45
+ ) -> tuple[bool, str, AutoConfig]:
46
  try:
47
+ config = AutoConfig.from_pretrained(
48
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
+ ) # , force_download=True)
50
  if test_tokenizer:
51
  try:
52
+ tk = AutoTokenizer.from_pretrained(
53
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
+ )
55
  except ValueError as e:
56
+ return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
57
+ except Exception:
58
  return (
59
  False,
60
+ "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
61
+ None,
62
  )
 
 
63
  return True, None, config
64
 
65
+ except ValueError:
66
  return (
67
  False,
68
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
69
+ None,
70
  )
71
 
72
  except Exception as e:
 
74
  return True, "uses a gated model.", None
75
  return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
76
 
77
+
78
  def get_model_size(model_info: ModelInfo, precision: str):
79
  size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
80
  safetensors = None
 
90
  size_match = re.search(size_pattern, model_info.id.lower())
91
  model_size = size_match.group(0)
92
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
93
+ except AttributeError:
94
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
95
 
96
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
97
  model_size = size_factor * model_size
98
  return model_size
99
 
100
+
101
  def get_model_arch(model_info: ModelInfo):
102
  return model_info.config.get("architectures", "Unknown")
103
 
104
+
105
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
106
  if org_or_user not in users_to_submission_dates:
107
  return True, ""
 
148
 
149
  return set(file_names), users_to_submission_dates
150
 
151
+
152
  def get_model_tags(model_card, model: str):
153
  is_merge_from_metadata = False
154
  is_moe_from_metadata = False
 
157
  if model_card is None:
158
  return tags
159
  if model_card.data.tags:
160
+ is_merge_from_metadata = any(
161
+ [tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]]
162
+ )
163
  is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
164
 
165
+ is_merge_from_model_card = any(
166
+ keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
167
+ )
168
  if is_merge_from_model_card or is_merge_from_metadata:
169
  tags.append("merge")
170
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
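The card and hub checks above return a (success, message, payload) triple, with the message written so it can be appended directly after a model name. A short usage sketch, not code from this commit; the model id and precision are placeholders:

    from src.envs import API
    from src.submission.check_validity import check_model_card, get_model_size, is_model_on_hub

    model_id = "org/some-model"  # placeholder
    on_hub, error, config = is_model_on_hub(model_name=model_id, revision="main", test_tokenizer=True)
    if not on_hub:
        print(f'Model "{model_id}" {error}')  # e.g. '... needs to be launched with `trust_remote_code=True` ...'

    card_ok, card_error, card = check_model_card(model_id)
    size_b = get_model_size(API.model_info(model_id), precision="float16")  # billions of parameters, 0 if unknown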
src/submission/submit.py CHANGED
@@ -2,23 +2,34 @@ import json
  import os
  from datetime import datetime, timezone

- from huggingface_hub import ModelCard, snapshot_download
+ from huggingface_hub import snapshot_download

  from src.display.formatting import styled_error, styled_message, styled_warning
- from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+ from src.envs import (
+     API,
+     DYNAMIC_INFO_FILE_PATH,
+     DYNAMIC_INFO_PATH,
+     DYNAMIC_INFO_REPO,
+     EVAL_REQUESTS_PATH,
+     H4_TOKEN,
+     QUEUE_REPO,
+     RATE_LIMIT_PERIOD,
+     RATE_LIMIT_QUOTA,
+ )
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
  from src.submission.check_validity import (
      already_submitted_models,
      check_model_card,
      get_model_size,
+     get_model_tags,
      is_model_on_hub,
      user_submission_permission,
-     get_model_tags
  )

  REQUESTED_MODELS = None
  USERS_TO_SUBMISSION_DATES = None

+
  def add_new_eval(
      model: str,
      base_model: str,
@@ -58,7 +69,9 @@ def add_new_eval(
          return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

      if model == "CohereForAI/c4ai-command-r-plus":
-         return styled_warning("This model cannot be submitted manually on the leaderboard before the transformers release.")
+         return styled_warning(
+             "This model cannot be submitted manually on the leaderboard before the transformers release."
+         )

      # Does the model actually exist?
      if revision == "":
@@ -66,7 +79,9 @@ def add_new_eval(

      # Is the model on the hub?
      if weight_type in ["Delta", "Adapter"]:
-         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
+         base_model_on_hub, error, _ = is_model_on_hub(
+             model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
+         )
          if not base_model_on_hub:
              return styled_error(f'Base model "{base_model}" {error}')

@@ -81,10 +96,8 @@ def add_new_eval(
      architectures = getattr(model_config, "architectures", None)
      if architectures:
          architecture = ";".join(architectures)
-     downloads = getattr(model_config, 'downloads', 0)
-     created_at = getattr(model_config, 'created_at', '')
-
-
+     downloads = getattr(model_config, "downloads", 0)
+     created_at = getattr(model_config, "created_at", "")

      # Is the model info correctly filled?
      try:
@@ -103,7 +116,7 @@ def add_new_eval(
      modelcard_OK, error_msg, model_card = check_model_card(model)
      if not modelcard_OK:
          return styled_error(error_msg)
-
+
      tags = get_model_tags(model_card, model)

      # Seems good, creating the eval
@@ -130,8 +143,8 @@ def add_new_eval(
          "license": license,
          "still_on_hub": True,
          "tags": tags,
-         "downloads": downloads,
-         "created_at": created_at
+         "downloads": downloads,
+         "created_at": created_at,
      }

      # Check for duplicate submission
@@ -175,8 +188,6 @@ def add_new_eval(
          commit_message=f"Add {model} to dynamic info queue",
      )

-
-
      # Remove the local file
      os.remove(out_path)
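With this change, the request entry written by add_new_eval() also records a download count and creation date, read with getattr and defaulting to 0 and an empty string when absent. An illustrative fragment of such an entry, showing only the keys visible in this diff; the values are invented:

    eval_entry_fragment = {
        "license": "apache-2.0",               # example value
        "still_on_hub": True,
        "tags": ["merge"],                     # from get_model_tags(model_card, model)
        "downloads": 1234,                     # getattr(model_config, "downloads", 0)
        "created_at": "2024-03-01T00:00:00Z",  # getattr(model_config, "created_at", "")
    }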
 
src/tools/collections.py CHANGED
@@ -1,5 +1,3 @@
- import os
-
  import pandas as pd
  from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
  from huggingface_hub.utils._errors import HfHubHTTPError
src/tools/plots.py CHANGED
@@ -1,14 +1,14 @@
- import pandas as pd
  import numpy as np
+ import pandas as pd
  import plotly.express as px
  from plotly.graph_objs import Figure

+ from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE
  from src.leaderboard.filter_models import FLAGGED_MODELS
- from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
  from src.leaderboard.read_evals import EvalResult


-
  def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
      """
      Generates a DataFrame containing the maximum scores until each date.
@@ -18,7 +18,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
      """
      # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
      results_df = pd.DataFrame(raw_data)
-     #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+     # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
      results_df.sort_values(by="date", inplace=True)

      # Step 2: Initialize the scores dictionary
@@ -31,8 +31,13 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
          column = task.col_name
          for _, row in results_df.iterrows():
              current_model = row["full_model"]
-             # We ignore models that are flagged/no longer on the hub/not finished
-             to_ignore = not row["still_on_hub"] or row["flagged"] or current_model in FLAGGED_MODELS or row["status"] != "FINISHED"
+             # We ignore models that are flagged/no longer on the hub/not finished
+             to_ignore = (
+                 not row["still_on_hub"]
+                 or row["flagged"]
+                 or current_model in FLAGGED_MODELS
+                 or row["status"] != "FINISHED"
+             )
              if to_ignore:
                  continue

@@ -54,7 +59,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
      return {k: pd.DataFrame(v) for k, v in scores.items()}


- def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
+ def create_plot_df(scores_df: dict[str : pd.DataFrame]) -> pd.DataFrame:
      """
      Transforms the scores DataFrame into a new format suitable for plotting.

@@ -79,9 +84,7 @@ def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
      return concat_df


- def create_metric_plot_obj(
-     df: pd.DataFrame, metrics: list[str], title: str
- ) -> Figure:
+ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
      """
      Create a Plotly figure object with lines representing different metrics
      and horizontal dotted lines representing human baselines.
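The three helpers compose into one pipeline: raw results become per-metric score frames, those frames are reshaped for plotting, and the figure is built last. A rough sketch of that flow; the metric name is a placeholder and load_results_somehow() is a hypothetical stand-in for the results loader:

    from src.leaderboard.read_evals import EvalResult
    from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df

    raw_data: list[EvalResult] = load_results_somehow()  # hypothetical loader
    scores_by_metric = create_scores_df(raw_data)        # dict of per-metric DataFrames
    plot_df = create_plot_df(scores_by_metric)           # reshaped for plotting
    figure = create_metric_plot_obj(plot_df, ["Average"], title="Top scores over time")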
update_dynamic.py CHANGED
@@ -1,4 +1,4 @@
  from src.scripts.update_all_request_files import update_dynamic_files

  if __name__ == "__main__":
-     update_dynamic_files()
+     update_dynamic_files()