felix committed on
Commit
97453a2
·
2 Parent(s): 9c1a0d4 a6f1b1f

sync with upstream

Browse files
.gitignore CHANGED
@@ -1,4 +1,3 @@
1
- auto_evals/
2
  venv/
3
  __pycache__/
4
  .env
@@ -6,10 +5,8 @@ __pycache__/
6
  *ipynb
7
  .vscode/
8
 
9
- gpt_4_evals/
10
- human_evals/
11
  eval-queue/
12
  eval-results/
13
- auto_evals/
14
 
15
  src/assets/model_counts.html
 
 
1
  venv/
2
  __pycache__/
3
  .env
 
5
  *ipynb
6
  .vscode/
7
 
 
 
8
  eval-queue/
9
  eval-results/
10
+ dynamic-info/
11
 
12
  src/assets/model_counts.html
README.md CHANGED
@@ -4,11 +4,17 @@ emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.8.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
 
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.9.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
12
+ fullWidth: true
13
+ space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
14
+ private: true
15
+ secrets:
16
+ - HF_TOKEN
17
+ - H4_TOKEN
18
  ---
19
 
20
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -4,7 +4,6 @@ import os
4
  from datetime import datetime, timezone
5
 
6
  import pandas as pd
7
- from apscheduler.schedulers.background import BackgroundScheduler
8
  from huggingface_hub import snapshot_download
9
 
10
  from src.display.about import (
@@ -30,7 +29,7 @@ from src.display.utils import (
30
  WeightType,
31
  Precision
32
  )
33
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
34
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
35
  from src.submission.submit import add_new_eval
36
  from src.tools.collections import update_collections
@@ -44,33 +43,52 @@ from src.tools.plots import (
44
  def restart_space():
45
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
46
 
47
- try:
48
- print(EVAL_REQUESTS_PATH)
49
- snapshot_download(
50
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
51
- )
52
- except Exception:
53
- restart_space()
54
- try:
55
- print(EVAL_RESULTS_PATH)
56
- snapshot_download(
57
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  )
59
- except Exception:
60
- restart_space()
61
 
 
62
 
63
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
64
- update_collections(original_df.copy())
65
- leaderboard_df = original_df.copy()
 
 
66
 
67
- plot_df = create_plot_df(create_scores_df(raw_data))
68
 
69
- (
70
- finished_eval_queue_df,
71
- running_eval_queue_df,
72
- pending_eval_queue_df,
73
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
74
 
75
 
76
  # Searching and filtering
@@ -81,10 +99,12 @@ def update_table(
81
  precision_query: str,
82
  size_query: list,
83
  show_deleted: bool,
 
 
84
  show_flagged: bool,
85
  query: str,
86
  ):
87
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
88
  filtered_df = filter_queries(query, filtered_df)
89
  df = select_columns(filtered_df, columns)
90
  return df
@@ -100,13 +120,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
100
 
101
 
102
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
103
- always_here_cols = [
104
- AutoEvalColumn.model_type_symbol.name,
105
- AutoEvalColumn.model.name,
106
- ]
107
  # We use COLS to maintain sorting
108
  filtered_df = df[
109
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
110
  ]
111
  return filtered_df
112
 
@@ -132,7 +152,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
132
 
133
 
134
  def filter_models(
135
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
136
  ) -> pd.DataFrame:
137
  # Show all models
138
  if show_deleted:
@@ -140,6 +160,12 @@ def filter_models(
140
  else: # Show only still on the hub models
141
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
142
 
 
 
 
 
 
 
143
  if not show_flagged:
144
  filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
145
 
@@ -154,7 +180,16 @@ def filter_models(
154
 
155
  return filtered_df
156
 
157
- leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, True)
 
 
 
 
 
 
 
 
 
158
 
159
  import unicodedata
160
 
 
4
  from datetime import datetime, timezone
5
 
6
  import pandas as pd
 
7
  from huggingface_hub import snapshot_download
8
 
9
  from src.display.about import (
 
29
  WeightType,
30
  Precision
31
  )
32
+ from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
33
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
34
  from src.submission.submit import add_new_eval
35
  from src.tools.collections import update_collections
 
43
  def restart_space():
44
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
45
 
46
+
47
+ def init_space():
48
+ try:
49
+ print(EVAL_REQUESTS_PATH)
50
+ snapshot_download(
51
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
52
+ )
53
+ except Exception:
54
+ restart_space()
55
+ try:
56
+ print(DYNAMIC_INFO_PATH)
57
+ snapshot_download(
58
+ repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
59
+ )
60
+ except Exception:
61
+ restart_space()
62
+ try:
63
+ print(EVAL_RESULTS_PATH)
64
+ snapshot_download(
65
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
66
+ )
67
+ except Exception:
68
+ restart_space()
69
+
70
+
71
+ raw_data, original_df = get_leaderboard_df(
72
+ results_path=EVAL_RESULTS_PATH,
73
+ requests_path=EVAL_REQUESTS_PATH,
74
+ dynamic_path=DYNAMIC_INFO_FILE_PATH,
75
+ cols=COLS,
76
+ benchmark_cols=BENCHMARK_COLS
77
  )
78
+ update_collections(original_df.copy())
79
+ leaderboard_df = original_df.copy()
80
 
81
+ plot_df = create_plot_df(create_scores_df(raw_data))
82
 
83
+ (
84
+ finished_eval_queue_df,
85
+ running_eval_queue_df,
86
+ pending_eval_queue_df,
87
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
88
 
89
+ return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
90
 
91
+ leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
 
 
 
 
92
 
93
 
94
  # Searching and filtering
 
99
  precision_query: str,
100
  size_query: list,
101
  show_deleted: bool,
102
+ show_merges: bool,
103
+ show_moe: bool,
104
  show_flagged: bool,
105
  query: str,
106
  ):
107
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_merges, show_moe, show_flagged)
108
  filtered_df = filter_queries(query, filtered_df)
109
  df = select_columns(filtered_df, columns)
110
  return df
 
120
 
121
 
122
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
123
+ always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
124
+ dummy_col = [AutoEvalColumn.dummy.name]
125
+ #AutoEvalColumn.model_type_symbol.name,
126
+ #AutoEvalColumn.model.name,
127
  # We use COLS to maintain sorting
128
  filtered_df = df[
129
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
130
  ]
131
  return filtered_df
132
 
 
152
 
153
 
154
  def filter_models(
155
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_merges: bool, show_moe:bool, show_flagged: bool
156
  ) -> pd.DataFrame:
157
  # Show all models
158
  if show_deleted:
 
160
  else: # Show only still on the hub models
161
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
162
 
163
+ if not show_merges:
164
+ filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
165
+
166
+ if not show_moe:
167
+ filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
168
+
169
  if not show_flagged:
170
  filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
171
 
 
180
 
181
  return filtered_df
182
 
183
+ leaderboard_df = filter_models(
184
+ df=leaderboard_df,
185
+ type_query=[t.to_str(" : ") for t in ModelType],
186
+ size_query=list(NUMERIC_INTERVALS.keys()),
187
+ precision_query=[i.value.name for i in Precision],
188
+ show_deleted=False,
189
+ show_merges=False,
190
+ show_moe=True,
191
+ show_flagged=False
192
+ )
193
 
194
  import unicodedata
195
 
src/display/about.py CHANGED
@@ -159,10 +159,13 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
159
  ### 4) Fill up your model card
160
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
161
 
 
 
 
162
  ## In case of model failure
163
  If your model is displayed in the `FAILED` category, its execution stopped.
164
  Make sure you have followed the above steps first.
165
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
166
  """
167
 
168
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
159
  ### 4) Fill up your model card
160
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
161
 
162
+ ### 5) Select the correct precision
163
+ Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation errors (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
164
+
165
  ## In case of model failure
166
  If your model is displayed in the `FAILED` category, its execution stopped.
167
  Make sure you have followed the above steps first.
168
+ If everything is done, check you can launch the EleutherAIHarness on your model locally, using the command in the About tab under "Reproducibility" with all arguments specified (you can add `--limit` to limit the number of examples per task).
169
  """
170
 
171
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
src/display/css_html_js.py CHANGED
@@ -1,5 +1,24 @@
1
  custom_css = """
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  .markdown-text {
4
  font-size: 16px !important;
5
  }
@@ -21,14 +40,6 @@ custom_css = """
21
  transform: scale(1.3);
22
  }
23
 
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
  #search-bar-table-box > div:first-child {
33
  background: none;
34
  border: none;
@@ -38,36 +49,11 @@ custom_css = """
38
  padding: 0px;
39
  }
40
 
41
- /* Hides the final AutoEvalColumn */
42
- #llm-benchmark-tab-table table td:last-child,
43
- #llm-benchmark-tab-table table th:last-child {
44
- display: none;
45
- }
46
-
47
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
- table td:first-child,
49
- table th:first-child {
50
- max-width: 400px;
51
- overflow: auto;
52
- white-space: nowrap;
53
- }
54
-
55
  .tab-buttons button {
56
  font-size: 20px;
57
  }
58
 
59
- #scale-logo {
60
- border-style: none !important;
61
- box-shadow: none;
62
- display: block;
63
- margin-left: auto;
64
- margin-right: auto;
65
- max-width: 600px;
66
- }
67
-
68
- #scale-logo .download {
69
- display: none;
70
- }
71
  #filter_type{
72
  border: 0;
73
  padding-left: 0;
 
1
  custom_css = """
2
+ /* Hides the final AutoEvalColumn */
3
+ #llm-benchmark-tab-table table td:last-child,
4
+ #llm-benchmark-tab-table table th:last-child {
5
+ display: none;
6
+ }
7
 
8
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
9
+ table td:first-child,
10
+ table th:first-child {
11
+ max-width: 400px;
12
+ overflow: auto;
13
+ white-space: nowrap;
14
+ }
15
+
16
+ /* Full width space */
17
+ .gradio-container {
18
+ max-width: 95%!important;
19
+ }
20
+
21
+ /* Text style and margins */
22
  .markdown-text {
23
  font-size: 16px !important;
24
  }
 
40
  transform: scale(1.3);
41
  }
42
 
 
 
 
 
 
 
 
 
43
  #search-bar-table-box > div:first-child {
44
  background: none;
45
  border: none;
 
49
  padding: 0px;
50
  }
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  .tab-buttons button {
53
  font-size: 20px;
54
  }
55
 
56
+ /* Filters style */
 
 
 
 
 
 
 
 
 
 
 
57
  #filter_type{
58
  border: 0;
59
  padding-left: 0;
src/display/utils.py CHANGED
@@ -38,7 +38,7 @@ auto_eval_column_dict = []
38
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  #Scores
41
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
42
  for task in Tasks:
43
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
44
  # Model information
@@ -46,13 +46,14 @@ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type",
46
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
47
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
48
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
49
- auto_eval_column_dict.append(["merge", ColumnContent, ColumnContent("Merged", "bool", False)])
50
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
51
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
52
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
53
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
54
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
55
- auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
 
56
  # Dummy column for the search bar (hidden by the custom CSS)
57
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
58
 
@@ -73,8 +74,8 @@ baseline_row = {
73
  AutoEvalColumn.model.name: "<p>Baseline</p>",
74
  AutoEvalColumn.revision.name: "N/A",
75
  AutoEvalColumn.precision.name: None,
 
76
  AutoEvalColumn.average.name: 31.0,
77
- AutoEvalColumn.merge.name: False,
78
  AutoEvalColumn.arc.name: 25.0,
79
  AutoEvalColumn.hellaswag.name: 25.0,
80
  AutoEvalColumn.mmlu.name: 25.0,
@@ -98,8 +99,8 @@ human_baseline_row = {
98
  AutoEvalColumn.model.name: "<p>Human performance</p>",
99
  AutoEvalColumn.revision.name: "N/A",
100
  AutoEvalColumn.precision.name: None,
101
- AutoEvalColumn.merge.name: False,
102
  AutoEvalColumn.average.name: 92.75,
 
103
  AutoEvalColumn.arc.name: 80.0,
104
  AutoEvalColumn.hellaswag.name: 95.0,
105
  AutoEvalColumn.mmlu.name: 89.8,
@@ -108,6 +109,7 @@ human_baseline_row = {
108
  AutoEvalColumn.gsm8k.name: 100,
109
  AutoEvalColumn.dummy.name: "human_baseline",
110
  AutoEvalColumn.model_type.name: "",
 
111
  }
112
 
113
  @dataclass
@@ -168,10 +170,8 @@ class Precision(Enum):
168
 
169
 
170
  # Column selection
171
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
172
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
173
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
174
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
175
 
176
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
177
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
38
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  #Scores
41
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
42
  for task in Tasks:
43
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
44
  # Model information
 
46
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
47
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
48
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
49
+ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
50
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
51
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
52
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
53
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
54
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
55
+ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
56
+ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
57
  # Dummy column for the search bar (hidden by the custom CSS)
58
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
59
 
 
74
  AutoEvalColumn.model.name: "<p>Baseline</p>",
75
  AutoEvalColumn.revision.name: "N/A",
76
  AutoEvalColumn.precision.name: None,
77
+ AutoEvalColumn.merged.name: False,
78
  AutoEvalColumn.average.name: 31.0,
 
79
  AutoEvalColumn.arc.name: 25.0,
80
  AutoEvalColumn.hellaswag.name: 25.0,
81
  AutoEvalColumn.mmlu.name: 25.0,
 
99
  AutoEvalColumn.model.name: "<p>Human performance</p>",
100
  AutoEvalColumn.revision.name: "N/A",
101
  AutoEvalColumn.precision.name: None,
 
102
  AutoEvalColumn.average.name: 92.75,
103
+ AutoEvalColumn.merged.name: False,
104
  AutoEvalColumn.arc.name: 80.0,
105
  AutoEvalColumn.hellaswag.name: 95.0,
106
  AutoEvalColumn.mmlu.name: 89.8,
 
109
  AutoEvalColumn.gsm8k.name: 100,
110
  AutoEvalColumn.dummy.name: "human_baseline",
111
  AutoEvalColumn.model_type.name: "",
112
+ AutoEvalColumn.flagged.name: False,
113
  }
114
 
115
  @dataclass
 
170
 
171
 
172
  # Column selection
173
+ COLS = [c.name for c in fields(AutoEvalColumn)]
174
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
 
 
175
 
176
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
177
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
src/envs.py CHANGED
@@ -7,6 +7,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
 
8
  REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
9
  QUEUE_REPO = "open-llm-leaderboard/requests"
 
10
  RESULTS_REPO = "open-llm-leaderboard/results"
11
 
12
  PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
@@ -18,6 +19,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 
21
 
22
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
7
 
8
  REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
9
  QUEUE_REPO = "open-llm-leaderboard/requests"
10
+ DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
11
  RESULTS_REPO = "open-llm-leaderboard/results"
12
 
13
  PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 
19
 
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ DYNAMIC_INFO_PATH = os.path.join(CACHE_PATH, "dynamic-info")
23
+ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
24
 
25
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
26
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
src/leaderboard/filter_models.py CHANGED
@@ -4,6 +4,7 @@ from src.display.utils import AutoEvalColumn
4
  # Models which have been flagged by users as being problematic for one reason or another
5
  # (Model name to forum discussion link)
6
  FLAGGED_MODELS = {
 
7
  "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
8
  "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
9
  "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
@@ -38,7 +39,49 @@ FLAGGED_MODELS = {
38
  "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
39
  "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
40
  "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
41
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
 
44
  # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -52,10 +95,16 @@ DO_NOT_SUBMIT_MODELS = [
52
 
53
  def flag_models(leaderboard_data: list[dict]):
54
  for model_data in leaderboard_data:
55
- if model_data["model_name_for_query"] in FLAGGED_MODELS:
56
- issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
 
 
 
 
 
 
57
  issue_link = model_hyperlink(
58
- FLAGGED_MODELS[model_data["model_name_for_query"]],
59
  f"See discussion #{issue_num}",
60
  )
61
  model_data[
 
4
  # Models which have been flagged by users as being problematic for one reason or another
5
  # (Model name to forum discussion link)
6
  FLAGGED_MODELS = {
7
+ "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
8
  "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
9
  "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
10
  "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
 
39
  "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
40
  "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
41
  "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
42
+ "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
43
+ "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
44
+ "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
45
+ "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
46
+ # Merges not indicated
47
+ "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
48
+ "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
49
+ "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
50
+ "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
51
+ "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
52
+ "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
53
+ "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
54
+ "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
55
+ "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
56
+ "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
57
+ "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
58
+ "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
59
+ "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
60
+ "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
61
+ "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
62
+ "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
63
+ "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
64
+ "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
65
+ "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
66
+ "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
67
+ "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
68
+ "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
69
+ "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
70
+ "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
71
+ "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
72
+ "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
73
+ "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
74
+ "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
75
+ "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
76
+ "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
77
+ "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
78
+ "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
79
+ "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
80
+ "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
81
+ "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
82
+ "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
83
+ "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
84
+ "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
85
  }
86
 
87
  # Models which have been requested by orgs to not be submitted on the leaderboard
 
95
 
96
  def flag_models(leaderboard_data: list[dict]):
97
  for model_data in leaderboard_data:
98
+ # Merges and MoEs are flagged automatically
99
+ if model_data[AutoEvalColumn.flagged.name] == True:
100
+ flag_key = "merged"
101
+ else:
102
+ flag_key = model_data["model_name_for_query"]
103
+
104
+ if flag_key in FLAGGED_MODELS:
105
+ issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
106
  issue_link = model_hyperlink(
107
+ FLAGGED_MODELS[flag_key],
108
  f"See discussion #{issue_num}",
109
  )
110
  model_data[
src/leaderboard/read_evals.py CHANGED
@@ -5,15 +5,12 @@ import os
5
  from dataclasses import dataclass
6
 
7
  import dateutil
8
- from datetime import datetime
9
- from transformers import AutoConfig
10
  import numpy as np
11
 
12
  from huggingface_hub import ModelCard
13
 
14
  from src.display.formatting import make_clickable_model
15
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
16
- from src.submission.check_validity import is_model_on_hub
17
 
18
 
19
  @dataclass
@@ -33,8 +30,11 @@ class EvalResult:
33
  likes: int = 0
34
  num_params: int = 0
35
  date: str = "" # submission date of request file
36
- still_on_hub: bool = False
37
- merge: bool = False
 
 
 
38
 
39
  @classmethod
40
  def init_from_json_file(self, json_filepath):
@@ -43,13 +43,13 @@ class EvalResult:
43
  data = json.load(fp)
44
 
45
  # We manage the legacy config format
46
- config = data.get("config", data.get("config_general", None))
47
 
48
  # Precision
49
  precision = Precision.from_str(config.get("model_dtype"))
50
 
51
  # Get model and org
52
- org_and_model = config.get("model_name", config.get("model_args", None))
53
  org_and_model = org_and_model.split("/", 1)
54
 
55
  if len(org_and_model) == 1:
@@ -62,20 +62,6 @@ class EvalResult:
62
  result_key = f"{org}_{model}_{precision.value.name}"
63
  full_model = "/".join(org_and_model)
64
 
65
- try:
66
- merge = any(t in ["merge", "mergedlm"] for t in ModelCard.load(full_model).data.tags)
67
- except Exception:
68
- merge = False
69
-
70
- still_on_hub, error, model_config = is_model_on_hub(
71
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
72
- )
73
- architecture = "?"
74
- if model_config is not None:
75
- architectures = getattr(model_config, "architectures", None)
76
- if architectures:
77
- architecture = ";".join(architectures)
78
-
79
  # Extract results available in this file (some results are split in several files)
80
  results = {}
81
  for task in Tasks:
@@ -112,9 +98,6 @@ class EvalResult:
112
  results=results,
113
  precision=precision,
114
  revision= config.get("model_sha", ""),
115
- still_on_hub=still_on_hub,
116
- architecture=architecture,
117
- merge=merge
118
  )
119
 
120
  def update_with_request_file(self, requests_path):
@@ -124,15 +107,24 @@ class EvalResult:
124
  try:
125
  with open(request_file, "r") as f:
126
  request = json.load(f)
127
- self.model_type = ModelType.from_str(request.get("model_type", ""))
128
  self.weight_type = WeightType[request.get("weight_type", "Original")]
129
- self.license = request.get("license", "?")
130
- self.likes = request.get("likes", 0)
131
  self.num_params = request.get("params", 0)
132
  self.date = request.get("submitted_time", "")
133
- except Exception:
 
 
 
134
  print(f"Could not find request file for {self.org}/{self.model}")
135
 
 
 
 
 
 
 
 
 
136
  def to_dict(self):
137
  """Converts the Eval Result to a dict compatible with our dataframe display"""
138
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -140,7 +132,6 @@ class EvalResult:
140
  "eval_name": self.eval_name, # not a column, just a save name,
141
  AutoEvalColumn.precision.name: self.precision.value.name,
142
  AutoEvalColumn.model_type.name: self.model_type.value.name,
143
- AutoEvalColumn.merge.name: self.merge,
144
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
145
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
146
  AutoEvalColumn.architecture.name: self.architecture,
@@ -152,6 +143,9 @@ class EvalResult:
152
  AutoEvalColumn.likes.name: self.likes,
153
  AutoEvalColumn.params.name: self.num_params,
154
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
 
 
 
155
  }
156
 
157
  for task in Tasks:
@@ -182,7 +176,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
182
  return request_file
183
 
184
 
185
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
186
  """From the path of the results folder root, extract all needed info for results"""
187
  model_result_filepaths = []
188
 
@@ -200,11 +194,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
200
  for file in files:
201
  model_result_filepaths.append(os.path.join(root, file))
202
 
 
 
 
203
  eval_results = {}
204
  for model_result_filepath in model_result_filepaths:
205
  # Creation of result
206
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
207
  eval_result.update_with_request_file(requests_path)
 
 
208
 
209
  # Store results of same eval together
210
  eval_name = eval_result.eval_name
@@ -216,8 +215,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
216
  results = []
217
  for v in eval_results.values():
218
  try:
219
- v.to_dict() # we test if the dict version is complete
220
- results.append(v)
 
221
  except KeyError: # not all eval values present
222
  continue
223
 
 
5
  from dataclasses import dataclass
6
 
7
  import dateutil
 
 
8
  import numpy as np
9
 
10
  from huggingface_hub import ModelCard
11
 
12
  from src.display.formatting import make_clickable_model
13
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 
14
 
15
 
16
  @dataclass
 
30
  likes: int = 0
31
  num_params: int = 0
32
  date: str = "" # submission date of request file
33
+ still_on_hub: bool = True
34
+ is_merge: bool = False
35
+ flagged: bool = False
36
+ status: str = "FINISHED"
37
+ tags: list = None
38
 
39
  @classmethod
40
  def init_from_json_file(self, json_filepath):
 
43
  data = json.load(fp)
44
 
45
  # We manage the legacy config format
46
+ config = data.get("config_general")
47
 
48
  # Precision
49
  precision = Precision.from_str(config.get("model_dtype"))
50
 
51
  # Get model and org
52
+ org_and_model = config.get("model_name")
53
  org_and_model = org_and_model.split("/", 1)
54
 
55
  if len(org_and_model) == 1:
 
62
  result_key = f"{org}_{model}_{precision.value.name}"
63
  full_model = "/".join(org_and_model)
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # Extract results available in this file (some results are split in several files)
66
  results = {}
67
  for task in Tasks:
 
98
  results=results,
99
  precision=precision,
100
  revision= config.get("model_sha", ""),
 
 
 
101
  )
102
 
103
  def update_with_request_file(self, requests_path):
 
107
  try:
108
  with open(request_file, "r") as f:
109
  request = json.load(f)
110
+ self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
111
  self.weight_type = WeightType[request.get("weight_type", "Original")]
 
 
112
  self.num_params = request.get("params", 0)
113
  self.date = request.get("submitted_time", "")
114
+ self.architecture = request.get("architectures", "Unknown")
115
+ self.status = request.get("status", "FAILED")
116
+ except Exception as e:
117
+ self.status = "FAILED"
118
  print(f"Could not find request file for {self.org}/{self.model}")
119
 
120
+ def update_with_dynamic_file_dict(self, file_dict):
121
+ self.license = file_dict.get("license", "?")
122
+ self.likes = file_dict.get("likes", 0)
123
+ self.still_on_hub = file_dict["still_on_hub"]
124
+ self.flagged = any("flagged" in tag for tag in file_dict["tags"])
125
+ self.tags = file_dict["tags"]
126
+
127
+
128
  def to_dict(self):
129
  """Converts the Eval Result to a dict compatible with our dataframe display"""
130
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
132
  "eval_name": self.eval_name, # not a column, just a save name,
133
  AutoEvalColumn.precision.name: self.precision.value.name,
134
  AutoEvalColumn.model_type.name: self.model_type.value.name,
 
135
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
136
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
137
  AutoEvalColumn.architecture.name: self.architecture,
 
143
  AutoEvalColumn.likes.name: self.likes,
144
  AutoEvalColumn.params.name: self.num_params,
145
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
146
+ AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
147
+ AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
148
+ AutoEvalColumn.flagged.name: self.flagged
149
  }
150
 
151
  for task in Tasks:
 
176
  return request_file
177
 
178
 
179
+ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
180
  """From the path of the results folder root, extract all needed info for results"""
181
  model_result_filepaths = []
182
 
 
194
  for file in files:
195
  model_result_filepaths.append(os.path.join(root, file))
196
 
197
+ with open(dynamic_path) as f:
198
+ dynamic_data = json.load(f)
199
+
200
  eval_results = {}
201
  for model_result_filepath in model_result_filepaths:
202
  # Creation of result
203
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
204
  eval_result.update_with_request_file(requests_path)
205
+ if eval_result.full_model in dynamic_data:
206
+ eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
207
 
208
  # Store results of same eval together
209
  eval_name = eval_result.eval_name
 
215
  results = []
216
  for v in eval_results.values():
217
  try:
218
+ if v.status == "FINISHED":
219
+ v.to_dict() # we test if the dict version is complete
220
+ results.append(v)
221
  except KeyError: # not all eval values present
222
  continue
223
 
src/populate.py CHANGED
@@ -9,8 +9,8 @@ from src.leaderboard.filter_models import filter_models
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
  all_data_json.append(baseline_row)
16
  filter_models(all_data_json)
 
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
+ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
+ raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
  all_data_json.append(baseline_row)
16
  filter_models(all_data_json)
{scripts → src/scripts}/create_request_file.py RENAMED
@@ -1,36 +1,21 @@
1
  import json
2
  import os
3
  import pprint
4
- import re
5
  from datetime import datetime, timezone
6
 
7
  import click
8
  from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
10
 
 
 
 
11
  EVAL_REQUESTS_PATH = "eval-queue"
12
  QUEUE_REPO = "open-llm-leaderboard/requests"
13
 
14
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
- model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
- weight_types = ("Original", "Delta", "Adapter")
17
-
18
-
19
- def get_model_size(model_info, precision: str):
20
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
21
- try:
22
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
23
- except (AttributeError, TypeError):
24
- try:
25
- size_match = re.search(size_pattern, model_info.modelId.lower())
26
- model_size = size_match.group(0)
27
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
28
- except AttributeError:
29
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
30
-
31
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
32
- model_size = size_factor * model_size
33
- return model_size
34
 
35
 
36
  def main():
 
1
  import json
2
  import os
3
  import pprint
 
4
  from datetime import datetime, timezone
5
 
6
  import click
7
  from colorama import Fore
8
  from huggingface_hub import HfApi, snapshot_download
9
 
10
+ from src.submission.check_validity import get_model_size
11
+ from src.display.utils import ModelType, WeightType
12
+
13
  EVAL_REQUESTS_PATH = "eval-queue"
14
  QUEUE_REPO = "open-llm-leaderboard/requests"
15
 
16
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
17
+ model_types = [e.name for e in ModelType]
18
+ weight_types = [e.name for e in WeightType]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  def main():
src/scripts/update_all_request_files.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import ModelFilter, snapshot_download
2
+ from huggingface_hub import ModelCard
3
+
4
+ import json
5
+ import time
6
+ from src.submission.check_validity import is_model_on_hub, check_model_card
7
+ from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
8
+
9
+ def update_models(file_path, models):
10
+ """
11
+ Search through all JSON files in the specified root folder and its subfolders,
12
+ and update the likes key in JSON dict from value of input dict
13
+ """
14
+ with open(file_path, "r") as f:
15
+ model_infos = json.load(f)
16
+ for model_id, data in model_infos.items():
17
+ if model_id not in models:
18
+ data['still_on_hub'] = False
19
+ data['likes'] = 0
20
+ data['downloads'] = 0
21
+ data['created_at'] = ""
22
+ continue
23
+
24
+ model_cfg = models[model_id]
25
+ data['likes'] = model_cfg.likes
26
+ data['downloads'] = model_cfg.downloads
27
+ data['created_at'] = str(model_cfg.created_at)
28
+ #data['params'] = get_model_size(model_cfg, data['precision'])
29
+ data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
30
+
31
+ # Is the model still on the hub
32
+ still_on_hub, error, model_config = is_model_on_hub(
33
+ model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
34
+ )
35
+ # If the model doesn't have a model card or a license, we consider it's deleted
36
+ if still_on_hub:
37
+ try:
38
+ if check_model_card(model_id)[0] is False:
39
+ still_on_hub = False
40
+ except Exception:
41
+ still_on_hub = False
42
+ data['still_on_hub'] = still_on_hub
43
+
44
+ # Check if the model is a merge
45
+ is_merge_from_metadata = False
46
+ is_moe_from_metadata = False
47
+ if still_on_hub:
48
+ model_card = ModelCard.load(model_id)
49
+
50
+ # Storing the model metadata
51
+ tags = []
52
+ if model_card.data.tags:
53
+ is_merge_from_metadata = "merge" in model_card.data.tags
54
+ is_moe_from_metadata = "moe" in model_card.data.tags
55
+ merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
56
+ # If the model is a merge but not saying it in the metadata, we flag it
57
+ is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
58
+ if is_merge_from_model_card or is_merge_from_metadata:
59
+ tags.append("merge")
60
+ if not is_merge_from_metadata:
61
+ tags.append("flagged:undisclosed_merge")
62
+ moe_keywords = ["moe", "mixture of experts", "mixtral"]
63
+ is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
64
+ is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
65
+ if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
66
+ tags.append("moe")
67
+ if not is_moe_from_metadata:
68
+ tags.append("flagged:undisclosed_moe")
69
+
70
+ data["tags"] = tags
71
+
72
+ with open(file_path, 'w') as f:
73
+ json.dump(model_infos, f, indent=2)
74
+
75
+ def update_dynamic_files():
76
+ """ This will only update metadata for models already linked in the repo, not add missing ones.
77
+ """
78
+ snapshot_download(
79
+ repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
80
+ )
81
+
82
+ print("UPDATE_DYNAMIC: Loaded snapshot")
83
+ # Get models
84
+ start = time.time()
85
+
86
+ models = list(API.list_models(
87
+ filter=ModelFilter(task="text-generation"),
88
+ full=False,
89
+ cardData=True,
90
+ fetch_config=True,
91
+ ))
92
+ id_to_model = {model.id : model for model in models}
93
+
94
+ print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
95
+
96
+ start = time.time()
97
+
98
+ update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
99
+
100
+ print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
101
+
102
+ API.upload_file(
103
+ path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
104
+ path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
105
+ repo_id=DYNAMIC_INFO_REPO,
106
+ repo_type="dataset",
107
+ commit_message=f"Daily request file update.",
108
+ )
109
+ print(f"UPDATE_DYNAMIC: pushed to hub")
src/submission/check_validity.py CHANGED
@@ -6,9 +6,8 @@ from datetime import datetime, timedelta, timezone
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig, AutoTokenizer
11
- from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
13
  from src.envs import HAS_HIGHER_RATE_LIMIT
14
 
@@ -37,9 +36,9 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
37
  return True, ""
38
 
39
 
40
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
41
  try:
42
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
43
  if test_tokenizer:
44
  try:
45
  tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
@@ -53,7 +52,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
53
  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
54
  return True, None, config
55
 
56
- except ValueError:
57
  return (
58
  False,
59
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
@@ -65,18 +64,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
65
 
66
 
67
  def get_model_size(model_info: ModelInfo, precision: str):
68
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
 
69
  try:
70
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
71
- except (AttributeError, TypeError ):
 
 
 
 
 
72
  try:
73
- size_match = re.search(size_pattern, model_info.modelId.lower())
74
  model_size = size_match.group(0)
75
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
76
- except AttributeError:
77
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
78
 
79
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
80
  model_size = size_factor * model_size
81
  return model_size
82
 
 
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
10
  from transformers import AutoConfig, AutoTokenizer
 
11
 
12
  from src.envs import HAS_HIGHER_RATE_LIMIT
13
 
 
36
  return True, ""
37
 
38
 
39
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
40
  try:
41
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
42
  if test_tokenizer:
43
  try:
44
  tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
52
  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
53
  return True, None, config
54
 
55
+ except ValueError as e:
56
  return (
57
  False,
58
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
 
64
 
65
 
66
  def get_model_size(model_info: ModelInfo, precision: str):
67
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
68
+ safetensors = None
69
  try:
70
+ safetensors = get_safetensors_metadata(model_info.id)
71
+ except Exception as e:
72
+ print(e)
73
+
74
+ if safetensors is not None:
75
+ model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
76
+ else:
77
  try:
78
+ size_match = re.search(size_pattern, model_info.id.lower())
79
  model_size = size_match.group(0)
80
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
81
+ except AttributeError as e:
82
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
83
 
84
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
85
  model_size = size_factor * model_size
86
  return model_size
87
 
src/submission/submit.py CHANGED
@@ -2,8 +2,10 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
 
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
7
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
8
  from src.submission.check_validity import (
9
  already_submitted_models,
@@ -64,10 +66,21 @@ def add_new_eval(
64
  if not base_model_on_hub:
65
  return styled_error(f'Base model "{base_model}" {error}')
66
 
 
 
 
67
  if not weight_type == "Adapter":
68
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
69
  if not model_on_hub:
70
  return styled_error(f'Model "{model}" {error}')
 
 
 
 
 
 
 
 
71
 
72
  # Is the model info correctly filled?
73
  try:
@@ -86,6 +99,31 @@ def add_new_eval(
86
  modelcard_OK, error_msg = check_model_card(model)
87
  if not modelcard_OK:
88
  return styled_error(error_msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # Seems good, creating the eval
91
  print("Adding new eval")
@@ -96,13 +134,23 @@ def add_new_eval(
96
  "revision": revision,
97
  "private": private,
98
  "precision": precision,
 
 
99
  "weight_type": weight_type,
100
  "status": "PENDING",
101
  "submitted_time": current_time,
102
  "model_type": model_type,
 
 
 
 
 
103
  "likes": model_info.likes,
104
- "params": model_size,
105
  "license": license,
 
 
 
 
106
  }
107
 
108
  # Check for duplicate submission
@@ -126,6 +174,28 @@ def add_new_eval(
126
  commit_message=f"Add {model} to eval queue",
127
  )
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  # Remove the local file
130
  os.remove(out_path)
131
 
 
2
  import os
3
  from datetime import datetime, timezone
4
 
5
+ from huggingface_hub import ModelCard, snapshot_download
6
+
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
+ from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
9
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
10
  from src.submission.check_validity import (
11
  already_submitted_models,
 
66
  if not base_model_on_hub:
67
  return styled_error(f'Base model "{base_model}" {error}')
68
 
69
+ architecture = "?"
70
+ downloads = 0
71
+ created_at = ""
72
  if not weight_type == "Adapter":
73
+ model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
74
  if not model_on_hub:
75
  return styled_error(f'Model "{model}" {error}')
76
+ if model_config is not None:
77
+ architectures = getattr(model_config, "architectures", None)
78
+ if architectures:
79
+ architecture = ";".join(architectures)
80
+ downloads = getattr(model_config, 'downloads', 0)
81
+ created_at = getattr(model_config, 'created_at', '')
82
+
83
+
84
 
85
  # Is the model info correctly filled?
86
  try:
 
99
  modelcard_OK, error_msg = check_model_card(model)
100
  if not modelcard_OK:
101
  return styled_error(error_msg)
102
+
103
+ is_merge_from_metadata = False
104
+ is_moe_from_metadata = False
105
+ model_card = ModelCard.load(model)
106
+
107
+ # Storing the model tags
108
+ tags = []
109
+ if model_card.data.tags:
110
+ is_merge_from_metadata = "merge" in model_card.data.tags
111
+ is_moe_from_metadata = "moe" in model_card.data.tags
112
+ merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
113
+ # If the model is a merge but not saying it in the metadata, we flag it
114
+ is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
115
+ if is_merge_from_model_card or is_merge_from_metadata:
116
+ tags.append("merge")
117
+ if not is_merge_from_metadata:
118
+ tags.append("flagged:undisclosed_merge")
119
+ moe_keywords = ["moe", "mixture of experts", "mixtral"]
120
+ is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
121
+ is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
122
+ if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
123
+ tags.append("moe")
124
+ if not is_moe_from_metadata:
125
+ tags.append("flagged:undisclosed_moe")
126
+
127
 
128
  # Seems good, creating the eval
129
  print("Adding new eval")
 
134
  "revision": revision,
135
  "private": private,
136
  "precision": precision,
137
+ "params": model_size,
138
+ "architectures": architecture,
139
  "weight_type": weight_type,
140
  "status": "PENDING",
141
  "submitted_time": current_time,
142
  "model_type": model_type,
143
+ "job_id": -1,
144
+ "job_start_time": None,
145
+ }
146
+
147
+ supplementary_info = {
148
  "likes": model_info.likes,
 
149
  "license": license,
150
+ "still_on_hub": True,
151
+ "tags": tags,
152
+ "downloads": downloads,
153
+ "created_at": created_at
154
  }
155
 
156
  # Check for duplicate submission
 
174
  commit_message=f"Add {model} to eval queue",
175
  )
176
 
177
+ # We want to grab the latest version of the submission file to not accidentally overwrite it
178
+ snapshot_download(
179
+ repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
180
+ )
181
+
182
+ with open(DYNAMIC_INFO_FILE_PATH) as f:
183
+ all_supplementary_info = json.load(f)
184
+
185
+ all_supplementary_info[model] = supplementary_info
186
+ with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
187
+ json.dump(all_supplementary_info, f, indent=2)
188
+
189
+ API.upload_file(
190
+ path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
191
+ path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
192
+ repo_id=DYNAMIC_INFO_REPO,
193
+ repo_type="dataset",
194
+ commit_message=f"Add {model} to dynamic info queue",
195
+ )
196
+
197
+
198
+
199
  # Remove the local file
200
  os.remove(out_path)
201