sync with upstream
Files changed:
- .gitignore +1 -4
- README.md +7 -1
- app.py +67 -32
- src/display/about.py +4 -1
- src/display/css_html_js.py +20 -34
- src/display/utils.py +11 -11
- src/envs.py +3 -0
- src/leaderboard/filter_models.py +53 -4
- src/leaderboard/read_evals.py +32 -32
- src/populate.py +2 -2
- {scripts → src/scripts}/create_request_file.py +5 -20
- src/scripts/update_all_request_files.py +109 -0
- src/submission/check_validity.py +16 -11
- src/submission/submit.py +73 -3
.gitignore CHANGED
@@ -1,4 +1,3 @@
-auto_evals/
 venv/
 __pycache__/
 .env
@@ -6,10 +5,8 @@ __pycache__/
 *ipynb
 .vscode/

-gpt_4_evals/
-human_evals/
 eval-queue/
 eval-results/
-
+dynamic-info/

 src/assets/model_counts.html
README.md CHANGED
@@ -4,11 +4,17 @@ emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
 duplicated_from: HuggingFaceH4/open_llm_leaderboard
+fullWidth: true
+space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
+  private: true
+  secrets:
+  - HF_TOKEN
+  - H4_TOKEN
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -4,7 +4,6 @@ import os
 from datetime import datetime, timezone

 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download

 from src.display.about import (
@@ -30,7 +29,7 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
@@ -44,33 +43,52 @@ from src.tools.plots import (
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
-
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-update_collections(original_df.copy())
-leaderboard_df = original_df.copy()
-
-plot_df = create_plot_df(create_scores_df(raw_data))
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+def init_space():
+    try:
+        print(EVAL_REQUESTS_PATH)
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        )
+    except Exception:
+        restart_space()
+    try:
+        print(DYNAMIC_INFO_PATH)
+        snapshot_download(
+            repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        )
+    except Exception:
+        restart_space()
+    try:
+        print(EVAL_RESULTS_PATH)
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        )
+    except Exception:
+        restart_space()
+
+    raw_data, original_df = get_leaderboard_df(
+        results_path=EVAL_RESULTS_PATH,
+        requests_path=EVAL_REQUESTS_PATH,
+        dynamic_path=DYNAMIC_INFO_FILE_PATH,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS
+    )
+    update_collections(original_df.copy())
+    leaderboard_df = original_df.copy()
+
+    plot_df = create_plot_df(create_scores_df(raw_data))
+
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+    return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+
+leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()


 # Searching and filtering
@@ -81,10 +99,12 @@ def update_table(
     precision_query: str,
     size_query: list,
     show_deleted: bool,
+    show_merges: bool,
+    show_moe: bool,
     show_flagged: bool,
     query: str,
 ):
-    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
+    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_merges, show_moe, show_flagged)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
@@ -100,13 +120,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:


 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
-        AutoEvalColumn.model.name,
-    ]
+    always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+    dummy_col = [AutoEvalColumn.dummy.name]
+    #AutoEvalColumn.model_type_symbol.name,
+    #AutoEvalColumn.model.name,
     # We use COLS to maintain sorting
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
     ]
     return filtered_df

@@ -132,7 +152,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):


 def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_merges: bool, show_moe: bool, show_flagged: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -140,6 +160,12 @@ def filter_models(
     else: # Show only still on the hub models
         filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]

+    if not show_merges:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
+
+    if not show_moe:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
+
     if not show_flagged:
         filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]

@@ -154,7 +180,16 @@ def filter_models(

     return filtered_df

-leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False)
+leaderboard_df = filter_models(
+    df=leaderboard_df,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
+    precision_query=[i.value.name for i in Precision],
+    show_deleted=False,
+    show_merges=False,
+    show_moe=True,
+    show_flagged=False
+)

 import unicodedata
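The net effect of the new show_merges/show_moe flags in filter_models is easiest to see on a toy frame. A minimal sketch (hypothetical column labels standing in for AutoEvalColumn.merged.name and AutoEvalColumn.moe.name; same boolean logic as the hunk above):

import pandas as pd

df = pd.DataFrame({
    "model": ["a", "b", "c"],
    "Merged": [False, True, False],  # stand-in for AutoEvalColumn.merged.name
    "MoE": [False, False, True],     # stand-in for AutoEvalColumn.moe.name
})

# The defaults used for the initial view: merges hidden, MoEs shown.
show_merges, show_moe = False, True
if not show_merges:
    df = df[df["Merged"] == False]
if not show_moe:
    df = df[df["MoE"] == False]

print(df["model"].tolist())  # ['a', 'c']: the merge "b" is filtered out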
src/display/about.py CHANGED
@@ -159,10 +159,13 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 ### 4) Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card

+### 5) Select the correct precision
+Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
+
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the command in the About tab under "Reproducibility" with all arguments specified (you can add `--limit` to limit the number of examples per task).
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
src/display/css_html_js.py CHANGED
@@ -1,5 +1,24 @@
 custom_css = """
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}

+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+/* Full width space */
+.gradio-container {
+    max-width: 95%!important;
+}
+
+/* Text style and margins */
 .markdown-text {
     font-size: 16px !important;
 }
@@ -21,14 +40,6 @@ custom_css = """
     transform: scale(1.3);
 }

-#leaderboard-table {
-    margin-top: 15px
-}
-
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-
 #search-bar-table-box > div:first-child {
     background: none;
     border: none;
@@ -38,36 +49,11 @@ custom_css = """
     padding: 0px;
 }

-/* Hides the final AutoEvalColumn */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
-
-/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
-table td:first-child,
-table th:first-child {
-    max-width: 400px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
 .tab-buttons button {
     font-size: 20px;
 }

-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-
-#scale-logo .download {
-    display: none;
-}
+/* Filters style */
 #filter_type{
     border: 0;
     padding-left: 0;
src/display/utils.py CHANGED
@@ -38,7 +38,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
@@ -46,13 +46,14 @@ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type",
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["merge", ColumnContent, ColumnContent("Merge", "bool", False)])
+auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False,
+auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

@@ -73,8 +74,8 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.merged.name: False,
     AutoEvalColumn.average.name: 31.0,
-    AutoEvalColumn.merge.name: False,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
@@ -98,8 +99,8 @@ human_baseline_row = {
     AutoEvalColumn.model.name: "<p>Human performance</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.merge.name: False,
     AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.merged.name: False,
     AutoEvalColumn.arc.name: 80.0,
     AutoEvalColumn.hellaswag.name: 95.0,
     AutoEvalColumn.mmlu.name: 89.8,
@@ -108,6 +109,7 @@ human_baseline_row = {
     AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
+    AutoEvalColumn.flagged.name: False,
 }

 @dataclass
@@ -168,10 +170,8 @@ class Precision(Enum):


 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]

 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
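Note that COLS/TYPES now include hidden columns; visibility is handled downstream (select_columns in app.py keeps only never_hidden columns plus the user's selection). A simplified sketch of the pattern, assuming the repo's own fields() helper, which returns the ColumnContent instances rather than dataclass fields:

from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                      # display name used as the dataframe column
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

class AutoEvalColumn:              # stand-in for the generated class
    model = ColumnContent("Model", "markdown", True, never_hidden=True)
    moe = ColumnContent("MoE", "bool", False, hidden=True)

def fields(raw_class):
    # Assumed helper: collect the ColumnContent attributes in declaration order.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

COLS = [c.name for c in fields(AutoEvalColumn)]  # hidden columns stay in COLS
print(COLS)  # ['Model', 'MoE']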
src/envs.py CHANGED
@@ -7,6 +7,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)

 REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
 QUEUE_REPO = "open-llm-leaderboard/requests"
+DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
 RESULTS_REPO = "open-llm-leaderboard/results"

 PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
@@ -18,6 +19,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")

 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+DYNAMIC_INFO_PATH = os.path.join(CACHE_PATH, "dynamic-info")
+DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")

 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
src/leaderboard/filter_models.py CHANGED
@@ -4,6 +4,7 @@ from src.display.utils import AutoEvalColumn
 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
 FLAGGED_MODELS = {
+    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
     "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
     "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
     "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
@@ -38,7 +39,49 @@ FLAGGED_MODELS = {
     "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
+    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
+    "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
+    # Merges not indicated
+    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
 }

 # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -52,10 +95,16 @@ DO_NOT_SUBMIT_MODELS = [

 def flag_models(leaderboard_data: list[dict]):
     for model_data in leaderboard_data:
-        if model_data["model_name_for_query"] in FLAGGED_MODELS:
-            issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
+        # Merges and moes are flagged automatically
+        if model_data[AutoEvalColumn.flagged.name] == True:
+            flag_key = "merged"
+        else:
+            flag_key = model_data["model_name_for_query"]
+
+        if flag_key in FLAGGED_MODELS:
+            issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
-                FLAGGED_MODELS[model_data["model_name_for_query"]],
+                FLAGGED_MODELS[flag_key],
                 f"See discussion #{issue_num}",
             )
             model_data[
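The routing in the new flag_models is compact but worth spelling out: any row whose flagged column is already True (set automatically for undisclosed merges and MoEs) maps to the shared "merged" key, i.e. discussion #510, and everything else is looked up by model name. A toy run with an assumed minimal row dict:

FLAGGED_MODELS = {
    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
}

row = {"Flagged": True, "model_name_for_query": "org/model"}  # hypothetical row
flag_key = "merged" if row["Flagged"] else row["model_name_for_query"]
if flag_key in FLAGGED_MODELS:
    issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
    print(f"See discussion #{issue_num}")  # See discussion #510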
src/leaderboard/read_evals.py CHANGED
@@ -5,15 +5,12 @@ import os
 from dataclasses import dataclass

 import dateutil
-from datetime import datetime
-from transformers import AutoConfig
 import numpy as np

 from huggingface_hub import ModelCard

 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub


 @dataclass
@@ -33,8 +30,11 @@ class EvalResult:
     likes: int = 0
     num_params: int = 0
     date: str = "" # submission date of request file
-    still_on_hub: bool = False
-    merge: bool = False
+    still_on_hub: bool = True
+    is_merge: bool = False
+    flagged: bool = False
+    status: str = "FINISHED"
+    tags: list = None

     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -43,13 +43,13 @@ class EvalResult:
             data = json.load(fp)

         # We manage the legacy config format
-        config = data.get("config", data.get("config_general", None))
+        config = data.get("config_general")

         # Precision
         precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = config.get("model_name")
         org_and_model = org_and_model.split("/", 1)

         if len(org_and_model) == 1:
@@ -62,20 +62,6 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

-        try:
-            merge = any(t in ["merge", "mergedlm"] for t in ModelCard.load(full_model).data.tags)
-        except Exception:
-            merge = False
-
-        still_on_hub, error, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
@@ -112,9 +98,6 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-            merge=merge
         )

     def update_with_request_file(self, requests_path):
@@ -124,15 +107,24 @@ class EvalResult:
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-        except Exception:
+            self.architecture = request.get("architectures", "Unknown")
+            self.status = request.get("status", "FAILED")
+        except Exception as e:
+            self.status = "FAILED"
             print(f"Could not find request file for {self.org}/{self.model}")

+    def update_with_dynamic_file_dict(self, file_dict):
+        self.license = file_dict.get("license", "?")
+        self.likes = file_dict.get("likes", 0)
+        self.still_on_hub = file_dict["still_on_hub"]
+        self.flagged = any("flagged" in tag for tag in file_dict["tags"])
+        self.tags = file_dict["tags"]
+
+
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -140,7 +132,6 @@ class EvalResult:
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.merge.name: self.merge,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
@@ -152,6 +143,9 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
+            AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
+            AutoEvalColumn.flagged.name: self.flagged
         }

         for task in Tasks:
@@ -182,7 +176,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

@@ -200,11 +194,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

+    with open(dynamic_path) as f:
+        dynamic_data = json.load(f)
+
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
+        if eval_result.full_model in dynamic_data:
+            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])

         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -216,8 +215,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
+            if v.status == "FINISHED":
+                v.to_dict() # we test if the dict version is complete
+                results.append(v)
         except KeyError: # not all eval values present
             continue
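For reference, the shape of one model_infos.json entry, as implied by the keys read in update_with_dynamic_file_dict and written by update_models and submit.py (model id and values are illustrative):

dynamic_data = {
    "org/model": {  # hypothetical model id
        "license": "apache-2.0",
        "likes": 1234,
        "downloads": 56789,
        "created_at": "2023-09-20 00:00:00+00:00",
        "still_on_hub": True,
        "tags": ["merge", "flagged:undisclosed_merge"],
    }
}

entry = dynamic_data["org/model"]
flagged = any("flagged" in tag for tag in entry["tags"])  # same test as in read_evals
print(flagged)  # True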
src/populate.py CHANGED
@@ -9,8 +9,8 @@ from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
+def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
     filter_models(all_data_json)
{scripts → src/scripts}/create_request_file.py RENAMED
@@ -1,36 +1,21 @@
 import json
 import os
 import pprint
-import re
 from datetime import datetime, timezone

 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download

+from src.submission.check_validity import get_model_size
+from src.display.utils import ModelType, WeightType
+
 EVAL_REQUESTS_PATH = "eval-queue"
 QUEUE_REPO = "open-llm-leaderboard/requests"

 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-
-
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
+model_types = [e.name for e in ModelType]
+weight_types = [e.name for e in WeightType]


 def main():
src/scripts/update_all_request_files.py ADDED
@@ -0,0 +1,109 @@
+from huggingface_hub import ModelFilter, snapshot_download
+from huggingface_hub import ModelCard
+
+import json
+import time
+from src.submission.check_validity import is_model_on_hub, check_model_card
+from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
+
+def update_models(file_path, models):
+    """
+    Search through all JSON files in the specified root folder and its subfolders,
+    and update the likes key in JSON dict from value of input dict
+    """
+    with open(file_path, "r") as f:
+        model_infos = json.load(f)
+    for model_id, data in model_infos.items():
+        if model_id not in models:
+            data['still_on_hub'] = False
+            data['likes'] = 0
+            data['downloads'] = 0
+            data['created_at'] = ""
+            continue
+
+        model_cfg = models[model_id]
+        data['likes'] = model_cfg.likes
+        data['downloads'] = model_cfg.downloads
+        data['created_at'] = str(model_cfg.created_at)
+        #data['params'] = get_model_size(model_cfg, data['precision'])
+        data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
+
+        # Is the model still on the hub
+        still_on_hub, error, model_config = is_model_on_hub(
+            model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
+        )
+        # If the model doesn't have a model card or a license, we consider it's deleted
+        if still_on_hub:
+            try:
+                if check_model_card(model_id)[0] is False:
+                    still_on_hub = False
+            except Exception:
+                still_on_hub = False
+        data['still_on_hub'] = still_on_hub
+
+        # Check if the model is a merge
+        is_merge_from_metadata = False
+        is_moe_from_metadata = False
+        if still_on_hub:
+            model_card = ModelCard.load(model_id)
+
+            # Storing the model metadata
+            tags = []
+            if model_card.data.tags:
+                is_merge_from_metadata = "merge" in model_card.data.tags
+                is_moe_from_metadata = "moe" in model_card.data.tags
+            merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
+            # If the model is a merge but not saying it in the metadata, we flag it
+            is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
+            if is_merge_from_model_card or is_merge_from_metadata:
+                tags.append("merge")
+                if not is_merge_from_metadata:
+                    tags.append("flagged:undisclosed_merge")
+            moe_keywords = ["moe", "mixture of experts", "mixtral"]
+            is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
+            is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
+            if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+                tags.append("moe")
+                if not is_moe_from_metadata:
+                    tags.append("flagged:undisclosed_moe")
+
+            data["tags"] = tags
+
+    with open(file_path, 'w') as f:
+        json.dump(model_infos, f, indent=2)
+
+def update_dynamic_files():
+    """ This will only update metadata for models already linked in the repo, not add missing ones.
+    """
+    snapshot_download(
+        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+
+    print("UPDATE_DYNAMIC: Loaded snapshot")
+    # Get models
+    start = time.time()
+
+    models = list(API.list_models(
+        filter=ModelFilter(task="text-generation"),
+        full=False,
+        cardData=True,
+        fetch_config=True,
+    ))
+    id_to_model = {model.id: model for model in models}
+
+    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
+
+    start = time.time()
+
+    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
+
+    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
+
+    API.upload_file(
+        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
+        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
+        repo_id=DYNAMIC_INFO_REPO,
+        repo_type="dataset",
+        commit_message=f"Daily request file update.",
+    )
+    print(f"UPDATE_DYNAMIC: pushed to hub")
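How update_dynamic_files gets triggered is not part of this diff; a plausible wiring is a background scheduler in the Space, for example (hypothetical sketch, assuming APScheduler is installed):

from apscheduler.schedulers.background import BackgroundScheduler

from src.scripts.update_all_request_files import update_dynamic_files

scheduler = BackgroundScheduler()
scheduler.add_job(update_dynamic_files, "interval", hours=24)  # daily metadata refresh
scheduler.start()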
src/submission/check_validity.py CHANGED
@@ -6,9 +6,8 @@ from datetime import datetime, timedelta, timezone

 import huggingface_hub
 from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
+from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
 from transformers import AutoConfig, AutoTokenizer
-from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config

 from src.envs import HAS_HIGHER_RATE_LIMIT

@@ -37,9 +36,9 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     return True, ""


-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
         if test_tokenizer:
             try:
                 tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
@@ -53,7 +52,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
                 return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config

-    except ValueError:
+    except ValueError as e:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
@@ -65,18 +64,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem


 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+    safetensors = None
+    try:
+        safetensors = get_safetensors_metadata(model_info.id)
+    except Exception as e:
+        print(e)
+
+    if safetensors is not None:
+        model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
+    else:
+        try:
+            size_match = re.search(size_pattern, model_info.id.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError as e:
+            return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
     model_size = size_factor * model_size
     return model_size
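The new get_model_size first asks the Hub for safetensors metadata and sums the per-dtype parameter counts; only if that fails does it fall back to parsing a size out of the repo id. The fallback is easy to sanity-check in isolation (illustrative model ids):

import re

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

for model_id in ("org/llama-7b", "org/tiny-350m", "org/mystery-model"):
    match = re.search(size_pattern, model_id.lower())
    if match is None:
        print(model_id, "-> 0 (unknown size)")
        continue
    size = match.group(0)
    # "7b" -> 7.0 billions; "350m" -> 0.35 billions
    billions = float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3
    print(model_id, "->", round(billions, 3))
# org/llama-7b -> 7.0
# org/tiny-350m -> 0.35
# org/mystery-model -> 0 (unknown size)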
src/submission/submit.py CHANGED
@@ -2,8 +2,10 @@ import json
 import os
 from datetime import datetime, timezone

+from huggingface_hub import ModelCard, snapshot_download
+
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
     already_submitted_models,
@@ -64,10 +66,21 @@ def add_new_eval(
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')

+    architecture = "?"
+    downloads = 0
+    created_at = ""
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
+        model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+            downloads = getattr(model_config, 'downloads', 0)
+            created_at = getattr(model_config, 'created_at', '')
+

     # Is the model info correctly filled?
     try:
@@ -86,6 +99,31 @@ def add_new_eval(
     modelcard_OK, error_msg = check_model_card(model)
     if not modelcard_OK:
         return styled_error(error_msg)
+
+    is_merge_from_metadata = False
+    is_moe_from_metadata = False
+    model_card = ModelCard.load(model)
+
+    # Storing the model tags
+    tags = []
+    if model_card.data.tags:
+        is_merge_from_metadata = "merge" in model_card.data.tags
+        is_moe_from_metadata = "moe" in model_card.data.tags
+    merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
+    # If the model is a merge but not saying it in the metadata, we flag it
+    is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
+    if is_merge_from_model_card or is_merge_from_metadata:
+        tags.append("merge")
+        if not is_merge_from_metadata:
+            tags.append("flagged:undisclosed_merge")
+    moe_keywords = ["moe", "mixture of experts", "mixtral"]
+    is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
+    is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
+    if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+        tags.append("moe")
+        if not is_moe_from_metadata:
+            tags.append("flagged:undisclosed_moe")
+

     # Seems good, creating the eval
     print("Adding new eval")
@@ -96,13 +134,23 @@ def add_new_eval(
         "revision": revision,
         "private": private,
         "precision": precision,
+        "params": model_size,
+        "architectures": architecture,
         "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
+        "job_id": -1,
+        "job_start_time": None,
+    }
+
+    supplementary_info = {
         "likes": model_info.likes,
-        "params": model_size,
         "license": license,
+        "still_on_hub": True,
+        "tags": tags,
+        "downloads": downloads,
+        "created_at": created_at
     }

     # Check for duplicate submission
@@ -126,6 +174,28 @@ def add_new_eval(
         commit_message=f"Add {model} to eval queue",
     )

+    # We want to grab the latest version of the submission file to not accidentally overwrite it
+    snapshot_download(
+        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+
+    with open(DYNAMIC_INFO_FILE_PATH) as f:
+        all_supplementary_info = json.load(f)
+
+    all_supplementary_info[model] = supplementary_info
+    with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
+        json.dump(all_supplementary_info, f, indent=2)
+
+    API.upload_file(
+        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
+        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
+        repo_id=DYNAMIC_INFO_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model} to dynamic info queue",
+    )
+
+
     # Remove the local file
     os.remove(out_path)
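Taken together, add_new_eval now writes two records: a slimmer per-submission request file (model-card metadata moved out) and a supplementary_info entry merged into model_infos.json. Their approximate shapes, using only the keys visible in the hunks above (values are examples):

eval_entry = {
    "revision": "main",
    "private": False,
    "precision": "bfloat16",
    "params": 7.242,
    "architectures": "MistralForCausalLM",
    "weight_type": "Original",
    "status": "PENDING",
    "submitted_time": "2024-01-10T12:00:00Z",
    "model_type": "fine-tuned",
    "job_id": -1,
    "job_start_time": None,
}

supplementary_info = {
    "likes": 42,
    "license": "apache-2.0",
    "still_on_hub": True,
    "tags": ["merge"],
    "downloads": 1000,
    "created_at": "2023-12-01 00:00:00+00:00",
}

The snapshot-then-upload sequence around model_infos.json is a read-modify-write; pulling the latest snapshot first (as the comment in the code notes) reduces, but does not fully remove, the chance of two concurrent submissions overwriting each other.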