Spaces:
Runtime error
Runtime error
Sai Vinay G
committed on
Commit
·
010b2a5
1
Parent(s):
e16ecd0
updates
Browse files- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- app.py +225 -220
- pyproject.toml +13 -0
- requirements.txt +2 -2
- src/assets/css_html_js.py +4 -4
- src/assets/hardcoded_evals.py +10 -11
- src/assets/text_content.py +15 -15
- src/auto_leaderboard/model_metadata_type.py +0 -551
- src/{auto_leaderboard → display_models}/get_model_metadata.py +40 -19
- src/{auto_leaderboard → display_models}/model_metadata_flags.py +2 -2
- src/display_models/model_metadata_type.py +550 -0
- src/{auto_leaderboard/load_results.py → display_models/read_results.py} +26 -16
- src/{utils_display.py → display_models/utils.py} +44 -16
- src/init.py +0 -51
- src/load_from_hub.py +139 -0
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
default_language_version:
|
16 |
+
python: python3
|
17 |
+
|
18 |
+
ci:
|
19 |
+
autofix_prs: true
|
20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
21 |
+
autoupdate_schedule: quarterly
|
22 |
+
|
23 |
+
repos:
|
24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
25 |
+
rev: v4.3.0
|
26 |
+
hooks:
|
27 |
+
- id: check-yaml
|
28 |
+
- id: check-case-conflict
|
29 |
+
- id: detect-private-key
|
30 |
+
- id: check-added-large-files
|
31 |
+
args: ['--maxkb=1000']
|
32 |
+
- id: requirements-txt-fixer
|
33 |
+
- id: end-of-file-fixer
|
34 |
+
- id: trailing-whitespace
|
35 |
+
|
36 |
+
- repo: https://github.com/PyCQA/isort
|
37 |
+
rev: 5.12.0
|
38 |
+
hooks:
|
39 |
+
- id: isort
|
40 |
+
name: Format imports
|
41 |
+
|
42 |
+
- repo: https://github.com/psf/black
|
43 |
+
rev: 22.12.0
|
44 |
+
hooks:
|
45 |
+
- id: black
|
46 |
+
name: Format code
|
47 |
+
additional_dependencies: ['click==8.0.2']
|
48 |
+
|
49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
50 |
+
# Ruff version.
|
51 |
+
rev: 'v0.0.267'
|
52 |
+
hooks:
|
53 |
+
- id: ruff
|
Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: style format
|
2 |
+
|
3 |
+
|
4 |
+
style:
|
5 |
+
python -m black --line-length 119 .
|
6 |
+
python -m isort .
|
7 |
+
ruff check --fix .
|
8 |
+
|
9 |
+
|
10 |
+
quality:
|
11 |
+
python -m black --check --line-length 119 .
|
12 |
+
python -m isort --check-only .
|
13 |
+
ruff check .
|
app.py
CHANGED
@@ -2,23 +2,31 @@ import json
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
-
|
6 |
import gradio as gr
|
7 |
-
import numpy as np
|
8 |
import pandas as pd
|
9 |
from apscheduler.schedulers.background import BackgroundScheduler
|
10 |
from huggingface_hub import HfApi
|
11 |
-
from transformers import AutoConfig
|
12 |
-
|
13 |
-
from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
|
14 |
-
from src.assets.text_content import *
|
15 |
-
from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
|
16 |
-
from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
|
17 |
from src.assets.css_html_js import custom_css, get_window_url_params
|
18 |
-
from src.
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# clone / pull the lmeh eval data
|
24 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
@@ -37,20 +45,16 @@ EVAL_RESULTS_PATH = "eval-results"
|
|
37 |
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
38 |
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
39 |
|
40 |
-
api = HfApi()
|
|
|
41 |
|
42 |
def restart_space():
|
43 |
api.restart_space(
|
44 |
repo_id="gsaivinay/open_llm_leaderboard", token=H4_TOKEN
|
45 |
)
|
46 |
|
47 |
-
eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
|
48 |
-
|
49 |
-
if not IS_PUBLIC:
|
50 |
-
eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
|
51 |
-
else:
|
52 |
-
eval_queue_private, eval_results_private = None, None
|
53 |
|
|
|
54 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
55 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
56 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
@@ -63,116 +67,41 @@ if not IS_PUBLIC:
|
|
63 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
64 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
65 |
|
66 |
-
BENCHMARK_COLS = [
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
def get_leaderboard_df():
|
78 |
-
if eval_results:
|
79 |
-
print("Pulling evaluation results for the leaderboard.")
|
80 |
-
eval_results.git_pull()
|
81 |
-
if eval_results_private:
|
82 |
-
print("Pulling evaluation results for the leaderboard.")
|
83 |
-
eval_results_private.git_pull()
|
84 |
-
|
85 |
-
all_data = get_eval_results_dicts()
|
86 |
-
|
87 |
-
# if not IS_PUBLIC:
|
88 |
-
all_data.append(gpt4_values)
|
89 |
-
all_data.append(gpt35_values)
|
90 |
-
|
91 |
-
all_data.append(baseline)
|
92 |
-
apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
|
93 |
-
|
94 |
-
df = pd.DataFrame.from_records(all_data)
|
95 |
-
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
96 |
-
df = df[COLS].round(decimals=2)
|
97 |
-
|
98 |
-
# filter out if any of the benchmarks have not been produced
|
99 |
-
df = df[has_no_nan_values(df, BENCHMARK_COLS)]
|
100 |
-
return df
|
101 |
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
110 |
|
111 |
-
|
112 |
-
entry
|
113 |
-
for entry in os.listdir(EVAL_REQUESTS_PATH)
|
114 |
-
if not entry.startswith(".")
|
115 |
-
]
|
116 |
-
all_evals = []
|
117 |
-
|
118 |
-
for entry in entries:
|
119 |
-
if ".json" in entry:
|
120 |
-
file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
|
121 |
-
with open(file_path) as fp:
|
122 |
-
data = json.load(fp)
|
123 |
-
|
124 |
-
data["# params"] = "unknown"
|
125 |
-
data["model"] = make_clickable_model(data["model"])
|
126 |
-
data["revision"] = data.get("revision", "main")
|
127 |
-
|
128 |
-
all_evals.append(data)
|
129 |
-
elif ".md" not in entry:
|
130 |
-
# this is a folder
|
131 |
-
sub_entries = [
|
132 |
-
e
|
133 |
-
for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
|
134 |
-
if not e.startswith(".")
|
135 |
-
]
|
136 |
-
for sub_entry in sub_entries:
|
137 |
-
file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
|
138 |
-
with open(file_path) as fp:
|
139 |
-
data = json.load(fp)
|
140 |
-
|
141 |
-
# data["# params"] = get_n_params(data["model"])
|
142 |
-
data["model"] = make_clickable_model(data["model"])
|
143 |
-
all_evals.append(data)
|
144 |
-
|
145 |
-
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
146 |
-
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
147 |
-
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
|
148 |
-
df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
|
149 |
-
df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
|
150 |
-
df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
|
151 |
-
return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
original_df = get_leaderboard_df()
|
156 |
leaderboard_df = original_df.copy()
|
157 |
(
|
158 |
finished_eval_queue_df,
|
159 |
running_eval_queue_df,
|
160 |
pending_eval_queue_df,
|
161 |
-
) = get_evaluation_queue_df()
|
162 |
-
|
163 |
-
def is_model_on_hub(model_name, revision) -> bool:
|
164 |
-
try:
|
165 |
-
AutoConfig.from_pretrained(model_name, revision=revision)
|
166 |
-
return True, None
|
167 |
-
|
168 |
-
except ValueError as e:
|
169 |
-
return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
|
170 |
-
|
171 |
-
except Exception as e:
|
172 |
-
print(f"Could not get the model config from the hub.: {e}")
|
173 |
-
return False, "was not found on hub!"
|
174 |
|
175 |
|
|
|
176 |
def add_new_eval(
|
177 |
model: str,
|
178 |
base_model: str,
|
@@ -196,13 +125,12 @@ def add_new_eval(
|
|
196 |
base_model_on_hub, error = is_model_on_hub(base_model, revision)
|
197 |
if not base_model_on_hub:
|
198 |
return styled_error(f'Base model "{base_model}" {error}')
|
199 |
-
|
200 |
|
201 |
if not weight_type == "Adapter":
|
202 |
model_on_hub, error = is_model_on_hub(model, revision)
|
203 |
if not model_on_hub:
|
204 |
return styled_error(f'Model "{model}" {error}')
|
205 |
-
|
206 |
print("adding new eval")
|
207 |
|
208 |
eval_entry = {
|
@@ -233,7 +161,7 @@ def add_new_eval(
|
|
233 |
|
234 |
# Check for duplicate submission
|
235 |
if out_path.split("eval-queue/")[1].lower() in requested_models:
|
236 |
-
return styled_warning("This model has been already submitted.")
|
237 |
|
238 |
with open(out_path, "w") as f:
|
239 |
f.write(json.dumps(eval_entry))
|
@@ -242,7 +170,6 @@ def add_new_eval(
|
|
242 |
path_or_fileobj=out_path,
|
243 |
path_in_repo=out_path.split("eval-queue/")[1],
|
244 |
repo_id=QUEUE_REPO,
|
245 |
-
token=H4_TOKEN,
|
246 |
repo_type="dataset",
|
247 |
commit_message=f"Add {model} to eval queue",
|
248 |
)
|
@@ -250,16 +177,19 @@ def add_new_eval(
|
|
250 |
# remove the local file
|
251 |
os.remove(out_path)
|
252 |
|
253 |
-
return styled_message(
|
|
|
|
|
254 |
|
255 |
|
256 |
-
|
257 |
-
|
|
|
258 |
(
|
259 |
finished_eval_queue_df,
|
260 |
running_eval_queue_df,
|
261 |
pending_eval_queue_df,
|
262 |
-
) = get_evaluation_queue_df()
|
263 |
return (
|
264 |
leaderboard_df,
|
265 |
finished_eval_queue_df,
|
@@ -268,74 +198,72 @@ def refresh():
|
|
268 |
)
|
269 |
|
270 |
|
271 |
-
def
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
filtered_df = df[
|
274 |
(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
|
275 |
| (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
|
276 |
-
|
277 |
else:
|
278 |
filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
|
279 |
-
return filtered_df[
|
280 |
|
281 |
|
282 |
-
def select_columns(df, columns):
|
283 |
-
always_here_cols = [
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
|
|
286 |
return filtered_df
|
287 |
|
288 |
-
#TODO allow this to filter by values of any columns
|
289 |
-
def filter_items(df, leaderboard_table, query):
|
290 |
-
if query == "all":
|
291 |
-
return df[leaderboard_table.columns]
|
292 |
-
else:
|
293 |
-
query = query[0] #take only the emoji character
|
294 |
-
if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
|
295 |
-
filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
|
296 |
-
else:
|
297 |
-
return filtered_df[leaderboard_table.columns]
|
298 |
-
return filtered_df[leaderboard_table.columns]
|
299 |
-
|
300 |
-
def filter_items_size(df, leaderboard_table, query):
|
301 |
-
numeric_intervals = {
|
302 |
-
"all": None,
|
303 |
-
"< 1B": (0, 1),
|
304 |
-
"~3B": (1, 5),
|
305 |
-
"~7B": (6, 11),
|
306 |
-
"~13B": (12, 15),
|
307 |
-
"~35B": (16, 55),
|
308 |
-
"60B+": (55, 1000)
|
309 |
-
}
|
310 |
-
|
311 |
-
if query == "all":
|
312 |
-
return df[leaderboard_table.columns]
|
313 |
-
|
314 |
-
numeric_interval = numeric_intervals[query]
|
315 |
-
|
316 |
-
if AutoEvalColumn.params.name in leaderboard_table.columns:
|
317 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors='coerce')
|
318 |
-
filtered_df = df[params_column.between(*numeric_interval)]
|
319 |
-
else:
|
320 |
-
return filtered_df[leaderboard_table.columns]
|
321 |
-
return filtered_df[leaderboard_table.columns]
|
322 |
|
323 |
-
def
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
-
|
337 |
-
shown_columns.append(AutoEvalColumn.params.name)
|
338 |
-
return gr.update(visible=(input_type == 'types')), gr.update(visible=(input_type == 'sizes')), shown_columns
|
339 |
|
340 |
|
341 |
demo = gr.Blocks(css=custom_css)
|
@@ -346,13 +274,39 @@ with demo:
|
|
346 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
347 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
348 |
with gr.Row():
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
with gr.Column(min_width=320):
|
357 |
search_bar = gr.Textbox(
|
358 |
placeholder="🔍 Search for your model and press ENTER...",
|
@@ -360,46 +314,47 @@ with demo:
|
|
360 |
elem_id="search-bar",
|
361 |
)
|
362 |
with gr.Box(elem_id="box-filter"):
|
363 |
-
|
364 |
-
label="⏚ Filter model",
|
365 |
-
choices=["types", "sizes"], value="types",
|
366 |
-
interactive=True,
|
367 |
-
elem_id="filter_type"
|
368 |
-
)
|
369 |
-
filter_columns = gr.Radio(
|
370 |
label="⏚ Filter model types",
|
371 |
-
|
372 |
-
|
373 |
-
"all",
|
374 |
ModelType.PT.to_str(),
|
375 |
ModelType.FT.to_str(),
|
376 |
ModelType.IFT.to_str(),
|
377 |
-
ModelType.RL.to_str(),
|
378 |
],
|
379 |
value="all",
|
380 |
-
|
|
|
381 |
)
|
382 |
filter_columns_size = gr.Radio(
|
383 |
label="⏚ Filter model sizes",
|
384 |
-
|
385 |
-
choices = [
|
386 |
"all",
|
387 |
"< 1B",
|
388 |
"~3B",
|
389 |
"~7B",
|
390 |
"~13B",
|
391 |
"~35B",
|
392 |
-
"60B+"
|
393 |
],
|
394 |
value="all",
|
395 |
-
visible=False,
|
396 |
interactive=True,
|
397 |
-
elem_id="filter-columns-size"
|
398 |
)
|
399 |
-
|
400 |
leaderboard_table = gr.components.Dataframe(
|
401 |
-
value=leaderboard_df[
|
402 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
403 |
datatype=TYPES,
|
404 |
max_rows=None,
|
405 |
elem_id="leaderboard-table",
|
@@ -417,14 +372,55 @@ with demo:
|
|
417 |
)
|
418 |
search_bar.submit(
|
419 |
search_table,
|
420 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
leaderboard_table,
|
|
|
422 |
)
|
423 |
-
|
424 |
-
filter_type.change(update_filter_type,inputs=[filter_type, shown_columns],outputs=[filter_columns, filter_columns_size, shown_columns],queue=False).then(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
|
425 |
-
shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
|
426 |
-
filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table, queue=False)
|
427 |
-
filter_columns_size.change(filter_items_size, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns_size], leaderboard_table, queue=False)
|
428 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
429 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
430 |
|
@@ -434,7 +430,10 @@ with demo:
|
|
434 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
435 |
|
436 |
with gr.Column():
|
437 |
-
with gr.Accordion(
|
|
|
|
|
|
|
438 |
with gr.Row():
|
439 |
finished_eval_table = gr.components.Dataframe(
|
440 |
value=finished_eval_queue_df,
|
@@ -442,7 +441,10 @@ with demo:
|
|
442 |
datatype=EVAL_TYPES,
|
443 |
max_rows=5,
|
444 |
)
|
445 |
-
with gr.Accordion(
|
|
|
|
|
|
|
446 |
with gr.Row():
|
447 |
running_eval_table = gr.components.Dataframe(
|
448 |
value=running_eval_queue_df,
|
@@ -451,7 +453,10 @@ with demo:
|
|
451 |
max_rows=5,
|
452 |
)
|
453 |
|
454 |
-
with gr.Accordion(
|
|
|
|
|
|
|
455 |
with gr.Row():
|
456 |
pending_eval_table = gr.components.Dataframe(
|
457 |
value=pending_eval_queue_df,
|
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
|
|
5 |
import gradio as gr
|
|
|
6 |
import pandas as pd
|
7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
8 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from src.assets.css_html_js import custom_css, get_window_url_params
|
10 |
+
from src.assets.text_content import (
|
11 |
+
CITATION_BUTTON_LABEL,
|
12 |
+
CITATION_BUTTON_TEXT,
|
13 |
+
EVALUATION_QUEUE_TEXT,
|
14 |
+
INTRODUCTION_TEXT,
|
15 |
+
LLM_BENCHMARKS_TEXT,
|
16 |
+
TITLE,
|
17 |
+
)
|
18 |
+
from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
|
19 |
+
from src.display_models.utils import (
|
20 |
+
AutoEvalColumn,
|
21 |
+
EvalQueueColumn,
|
22 |
+
fields,
|
23 |
+
styled_error,
|
24 |
+
styled_message,
|
25 |
+
styled_warning,
|
26 |
+
)
|
27 |
+
from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
|
28 |
+
|
29 |
+
pd.set_option("display.precision", 1)
|
30 |
|
31 |
# clone / pull the lmeh eval data
|
32 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
|
|
45 |
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
46 |
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
47 |
|
48 |
+
api = HfApi(token=H4_TOKEN)
|
49 |
+
|
50 |
|
51 |
def restart_space():
    """Restart the ``gsaivinay/open_llm_leaderboard`` Space via the module-level ``api`` client.

    The request is authenticated with ``H4_TOKEN``.
    """
    space_id = "gsaivinay/open_llm_leaderboard"
    api.restart_space(repo_id=space_id, token=H4_TOKEN)
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
+
# Column selection
|
58 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
59 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
60 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
|
|
67 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
68 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
69 |
|
70 |
+
BENCHMARK_COLS = [
|
71 |
+
c.name
|
72 |
+
for c in [
|
73 |
+
AutoEvalColumn.arc,
|
74 |
+
AutoEvalColumn.hellaswag,
|
75 |
+
AutoEvalColumn.mmlu,
|
76 |
+
AutoEvalColumn.truthfulqa,
|
77 |
+
]
|
78 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
+
## LOAD INFO FROM HUB
|
81 |
+
eval_queue, requested_models, eval_results = load_all_info_from_hub(
|
82 |
+
QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
|
83 |
+
)
|
84 |
|
85 |
+
if not IS_PUBLIC:
|
86 |
+
(eval_queue_private, requested_models_private, eval_results_private,) = load_all_info_from_hub(
|
87 |
+
PRIVATE_QUEUE_REPO,
|
88 |
+
PRIVATE_RESULTS_REPO,
|
89 |
+
EVAL_REQUESTS_PATH_PRIVATE,
|
90 |
+
EVAL_RESULTS_PATH_PRIVATE,
|
91 |
+
)
|
92 |
+
else:
|
93 |
+
eval_queue_private, eval_results_private = None, None
|
94 |
|
95 |
+
original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
leaderboard_df = original_df.copy()
|
97 |
(
|
98 |
finished_eval_queue_df,
|
99 |
running_eval_queue_df,
|
100 |
pending_eval_queue_df,
|
101 |
+
) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
|
104 |
+
## INTERACTION FUNCTIONS
|
105 |
def add_new_eval(
|
106 |
model: str,
|
107 |
base_model: str,
|
|
|
125 |
base_model_on_hub, error = is_model_on_hub(base_model, revision)
|
126 |
if not base_model_on_hub:
|
127 |
return styled_error(f'Base model "{base_model}" {error}')
|
|
|
128 |
|
129 |
if not weight_type == "Adapter":
|
130 |
model_on_hub, error = is_model_on_hub(model, revision)
|
131 |
if not model_on_hub:
|
132 |
return styled_error(f'Model "{model}" {error}')
|
133 |
+
|
134 |
print("adding new eval")
|
135 |
|
136 |
eval_entry = {
|
|
|
161 |
|
162 |
# Check for duplicate submission
|
163 |
if out_path.split("eval-queue/")[1].lower() in requested_models:
|
164 |
+
return styled_warning("This model has been already submitted.")
|
165 |
|
166 |
with open(out_path, "w") as f:
|
167 |
f.write(json.dumps(eval_entry))
|
|
|
170 |
path_or_fileobj=out_path,
|
171 |
path_in_repo=out_path.split("eval-queue/")[1],
|
172 |
repo_id=QUEUE_REPO,
|
|
|
173 |
repo_type="dataset",
|
174 |
commit_message=f"Add {model} to eval queue",
|
175 |
)
|
|
|
177 |
# remove the local file
|
178 |
os.remove(out_path)
|
179 |
|
180 |
+
return styled_message(
|
181 |
+
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
182 |
+
)
|
183 |
|
184 |
|
185 |
+
# Basics
|
186 |
+
def refresh() -> list[pd.DataFrame]:
|
187 |
+
leaderboard_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
|
188 |
(
|
189 |
finished_eval_queue_df,
|
190 |
running_eval_queue_df,
|
191 |
pending_eval_queue_df,
|
192 |
+
) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, COLS)
|
193 |
return (
|
194 |
leaderboard_df,
|
195 |
finished_eval_queue_df,
|
|
|
198 |
)
|
199 |
|
200 |
|
201 |
+
def change_tab(query_param: str):
    """Pick which tab to select based on the page's URL query parameters.

    Args:
        query_param: Text form of the URL-parameter mapping. Single quotes are
            replaced with double quotes so the text can be parsed as JSON
            (presumably it arrives as a Python-style dict repr; verify against
            the caller).

    Returns:
        ``gr.Tabs.update(selected=1)`` when the parsed value is a dict whose
        ``"tab"`` entry equals ``"evaluation"``; ``gr.Tabs.update(selected=0)``
        otherwise, including when the text is empty or not valid JSON.
    """
    try:
        parsed = json.loads(query_param.replace("'", '"'))
    except json.JSONDecodeError:
        # An empty or malformed query string must not break the UI callback;
        # fall through to the default tab instead.
        parsed = None

    wants_evaluation = isinstance(parsed, dict) and parsed.get("tab") == "evaluation"
    return gr.Tabs.update(selected=1 if wants_evaluation else 0)
|
209 |
+
|
210 |
+
|
211 |
+
# Searching and filtering
|
212 |
+
def search_table(df: pd.DataFrame, current_columns_df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose searchable columns contain ``query``.

    The ``AutoEvalColumn.dummy`` column is always searched; the
    ``AutoEvalColumn.model_type`` column is searched as well when it is one of
    the currently displayed columns. Matching is case-insensitive.

    Args:
        df: Full table to search in.
        current_columns_df: Table currently shown; only its ``columns`` are used,
            to decide which columns to search and which to return.
        query: User-entered search text. Used as a regular expression when it
            compiles as one, otherwise matched literally.

    Returns:
        The matching rows of ``df``, restricted to the currently shown columns.
    """
    import re  # local import: only needed to validate the user-supplied pattern

    current_columns = current_columns_df.columns

    # User input such as "c++" is not a valid regex; match it as plain text
    # instead of letting ``re.error`` escape from the callback.
    try:
        re.compile(query)
        as_regex = True
    except re.error:
        as_regex = False

    def _matches(column_name: str) -> pd.Series:
        # na=False: a missing cell counts as "no match" rather than producing a
        # NaN entry, which cannot be used for boolean indexing.
        return df[column_name].str.contains(query, case=False, regex=as_regex, na=False)

    mask = _matches(AutoEvalColumn.dummy.name)
    if AutoEvalColumn.model_type.name in current_columns:
        mask = mask | _matches(AutoEvalColumn.model_type.name)

    return df[mask][current_columns]
|
222 |
|
223 |
|
224 |
+
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Project ``df`` onto the pinned columns plus the requested ``columns``.

    The model-type symbol and model columns always come first and the dummy
    column always comes last. In between come the entries of ``COLS`` that are
    both present in ``df`` and listed in ``columns``.
    """
    leading = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
    trailing = [AutoEvalColumn.dummy.name]

    # Walk COLS rather than `columns` so the output keeps COLS' ordering.
    requested = []
    for col in COLS:
        if col in df.columns and col in columns:
            requested.append(col)

    return df[leading + requested + trailing]
|
234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
|
236 |
+
def filter_models(
    df: pd.DataFrame, current_columns_df: pd.DataFrame, type_query: str, size_query: str, show_deleted: bool
) -> pd.DataFrame:
    """Filter the leaderboard rows by hub availability, model type and size.

    Args:
        df: Full table; every filter is evaluated against its columns.
        current_columns_df: Table currently shown; only its ``columns`` are used,
            to select the columns of the result.
        type_query: ``"all"`` or a model-type label whose first character is
            compared with the ``AutoEvalColumn.model_type_symbol`` column.
        size_query: ``"all"`` or one of the size-bucket labels below.
        show_deleted: When false, keep only rows whose
            ``AutoEvalColumn.still_on_hub`` value is ``True``.

    Returns:
        The rows of ``df`` passing all active filters, restricted to the
        currently shown columns.

    Raises:
        KeyError: If ``size_query`` is neither ``"all"`` nor a known bucket label.
    """
    current_columns = current_columns_df.columns

    # Build a single row mask on the full table and apply it once at the end.
    # The filter columns may not be among the displayed columns, and a mask
    # computed on `df` should not be applied to an already-reduced frame.
    keep = pd.Series(True, index=df.index)

    if not show_deleted:
        # Element-wise comparison is required here: `series is True` is a Python
        # identity test that is always False, and `df[False]` raises KeyError.
        keep &= df[AutoEvalColumn.still_on_hub.name].eq(True)

    if type_query != "all":
        type_emoji = type_query[0]  # first character of the label is the symbol
        keep &= df[AutoEvalColumn.model_type_symbol.name] == type_emoji

    if size_query != "all":
        # Inclusive (low, high) bounds applied to the params column.
        numeric_intervals = {
            "< 1B": (0, 1),
            "~3B": (1, 5),
            "~7B": (6, 11),
            "~13B": (12, 15),
            "~35B": (16, 55),
            "60B+": (55, 10000),
        }
        low, high = numeric_intervals[size_query]
        # Non-numeric entries become NaN and therefore never fall in a bucket.
        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
        keep &= params_column.between(low, high)

    return df.loc[keep, current_columns]
|
|
|
|
|
267 |
|
268 |
|
269 |
demo = gr.Blocks(css=custom_css)
|
|
|
274 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
275 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
276 |
with gr.Row():
|
277 |
+
with gr.Column():
|
278 |
+
with gr.Row():
|
279 |
+
shown_columns = gr.CheckboxGroup(
|
280 |
+
choices=[
|
281 |
+
c
|
282 |
+
for c in COLS
|
283 |
+
if c
|
284 |
+
not in [
|
285 |
+
AutoEvalColumn.dummy.name,
|
286 |
+
AutoEvalColumn.model.name,
|
287 |
+
AutoEvalColumn.model_type_symbol.name,
|
288 |
+
AutoEvalColumn.still_on_hub.name,
|
289 |
+
]
|
290 |
+
],
|
291 |
+
value=[
|
292 |
+
c
|
293 |
+
for c in COLS_LITE
|
294 |
+
if c
|
295 |
+
not in [
|
296 |
+
AutoEvalColumn.dummy.name,
|
297 |
+
AutoEvalColumn.model.name,
|
298 |
+
AutoEvalColumn.model_type_symbol.name,
|
299 |
+
AutoEvalColumn.still_on_hub.name,
|
300 |
+
]
|
301 |
+
],
|
302 |
+
label="Select columns to show",
|
303 |
+
elem_id="column-select",
|
304 |
+
interactive=True,
|
305 |
+
)
|
306 |
+
with gr.Row():
|
307 |
+
deleted_models_visibility = gr.Checkbox(
|
308 |
+
value=True, label="Show models removed from the hub", interactive=True
|
309 |
+
)
|
310 |
with gr.Column(min_width=320):
|
311 |
search_bar = gr.Textbox(
|
312 |
placeholder="🔍 Search for your model and press ENTER...",
|
|
|
314 |
elem_id="search-bar",
|
315 |
)
|
316 |
with gr.Box(elem_id="box-filter"):
|
317 |
+
filter_columns_type = gr.Radio(
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
label="⏚ Filter model types",
|
319 |
+
choices=[
|
320 |
+
"all",
|
|
|
321 |
ModelType.PT.to_str(),
|
322 |
ModelType.FT.to_str(),
|
323 |
ModelType.IFT.to_str(),
|
324 |
+
ModelType.RL.to_str(),
|
325 |
],
|
326 |
value="all",
|
327 |
+
interactive=True,
|
328 |
+
elem_id="filter-columns-type",
|
329 |
)
|
330 |
filter_columns_size = gr.Radio(
|
331 |
label="⏚ Filter model sizes",
|
332 |
+
choices=[
|
|
|
333 |
"all",
|
334 |
"< 1B",
|
335 |
"~3B",
|
336 |
"~7B",
|
337 |
"~13B",
|
338 |
"~35B",
|
339 |
+
"60B+",
|
340 |
],
|
341 |
value="all",
|
|
|
342 |
interactive=True,
|
343 |
+
elem_id="filter-columns-size",
|
344 |
)
|
345 |
+
|
346 |
leaderboard_table = gr.components.Dataframe(
|
347 |
+
value=leaderboard_df[
|
348 |
+
[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
|
349 |
+
+ shown_columns.value
|
350 |
+
+ [AutoEvalColumn.dummy.name]
|
351 |
+
],
|
352 |
+
headers=[
|
353 |
+
AutoEvalColumn.model_type_symbol.name,
|
354 |
+
AutoEvalColumn.model.name,
|
355 |
+
]
|
356 |
+
+ shown_columns.value
|
357 |
+
+ [AutoEvalColumn.dummy.name],
|
358 |
datatype=TYPES,
|
359 |
max_rows=None,
|
360 |
elem_id="leaderboard-table",
|
|
|
372 |
)
|
373 |
search_bar.submit(
|
374 |
search_table,
|
375 |
+
[
|
376 |
+
hidden_leaderboard_table_for_search,
|
377 |
+
leaderboard_table,
|
378 |
+
search_bar,
|
379 |
+
],
|
380 |
+
leaderboard_table,
|
381 |
+
)
|
382 |
+
shown_columns.change(
|
383 |
+
select_columns,
|
384 |
+
[hidden_leaderboard_table_for_search, shown_columns],
|
385 |
+
leaderboard_table,
|
386 |
+
queue=False,
|
387 |
+
)
|
388 |
+
filter_columns_type.change(
|
389 |
+
filter_models,
|
390 |
+
[
|
391 |
+
hidden_leaderboard_table_for_search,
|
392 |
+
leaderboard_table,
|
393 |
+
filter_columns_type,
|
394 |
+
filter_columns_size,
|
395 |
+
deleted_models_visibility,
|
396 |
+
],
|
397 |
+
leaderboard_table,
|
398 |
+
queue=False,
|
399 |
+
)
|
400 |
+
filter_columns_size.change(
|
401 |
+
filter_models,
|
402 |
+
[
|
403 |
+
hidden_leaderboard_table_for_search,
|
404 |
+
leaderboard_table,
|
405 |
+
filter_columns_type,
|
406 |
+
filter_columns_size,
|
407 |
+
deleted_models_visibility,
|
408 |
+
],
|
409 |
+
leaderboard_table,
|
410 |
+
queue=False,
|
411 |
+
)
|
412 |
+
deleted_models_visibility.change(
|
413 |
+
filter_models,
|
414 |
+
[
|
415 |
+
hidden_leaderboard_table_for_search,
|
416 |
+
leaderboard_table,
|
417 |
+
filter_columns_type,
|
418 |
+
filter_columns_size,
|
419 |
+
deleted_models_visibility,
|
420 |
+
],
|
421 |
leaderboard_table,
|
422 |
+
queue=False,
|
423 |
)
|
|
|
|
|
|
|
|
|
|
|
424 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
425 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
426 |
|
|
|
430 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
431 |
|
432 |
with gr.Column():
|
433 |
+
with gr.Accordion(
|
434 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
435 |
+
open=False,
|
436 |
+
):
|
437 |
with gr.Row():
|
438 |
finished_eval_table = gr.components.Dataframe(
|
439 |
value=finished_eval_queue_df,
|
|
|
441 |
datatype=EVAL_TYPES,
|
442 |
max_rows=5,
|
443 |
)
|
444 |
+
with gr.Accordion(
|
445 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
446 |
+
open=False,
|
447 |
+
):
|
448 |
with gr.Row():
|
449 |
running_eval_table = gr.components.Dataframe(
|
450 |
value=running_eval_queue_df,
|
|
|
453 |
max_rows=5,
|
454 |
)
|
455 |
|
456 |
+
with gr.Accordion(
|
457 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
458 |
+
open=False,
|
459 |
+
):
|
460 |
with gr.Row():
|
461 |
pending_eval_table = gr.components.Dataframe(
|
462 |
value=pending_eval_queue_df,
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
3 |
+
select = ["E", "F"]
|
4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
5 |
+
line-length = 119
|
6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
7 |
+
|
8 |
+
[tool.isort]
|
9 |
+
profile = "black"
|
10 |
+
line_length = 119
|
11 |
+
|
12 |
+
[tool.black]
|
13 |
+
line-length = 119
|
requirements.txt
CHANGED
@@ -24,7 +24,7 @@ gradio_client==0.1.3
|
|
24 |
h11==0.14.0
|
25 |
httpcore==0.17.0
|
26 |
httpx==0.24.0
|
27 |
-
huggingface-hub==0.
|
28 |
idna==3.4
|
29 |
Jinja2==3.1.2
|
30 |
jsonschema==4.17.3
|
@@ -59,7 +59,7 @@ sniffio==1.3.0
|
|
59 |
starlette==0.26.1
|
60 |
toolz==0.12.0
|
61 |
tqdm==4.65.0
|
62 |
-
transformers==4.
|
63 |
typing_extensions==4.5.0
|
64 |
tzdata==2023.3
|
65 |
tzlocal==4.3
|
|
|
24 |
h11==0.14.0
|
25 |
httpcore==0.17.0
|
26 |
httpx==0.24.0
|
27 |
+
huggingface-hub==0.16.4
|
28 |
idna==3.4
|
29 |
Jinja2==3.1.2
|
30 |
jsonschema==4.17.3
|
|
|
59 |
starlette==0.26.1
|
60 |
toolz==0.12.0
|
61 |
tqdm==4.65.0
|
62 |
+
transformers==4.32.0
|
63 |
typing_extensions==4.5.0
|
64 |
tzdata==2023.3
|
65 |
tzlocal==4.3
|
src/assets/css_html_js.py
CHANGED
@@ -89,13 +89,13 @@ table th:first-child {
|
|
89 |
#filter_type label > .wrap .wrap-inner input{
|
90 |
width: 1px
|
91 |
}
|
92 |
-
#filter-columns{
|
93 |
border:0;
|
94 |
-
padding:0;
|
95 |
}
|
96 |
#filter-columns-size{
|
97 |
border:0;
|
98 |
-
padding:0;
|
99 |
}
|
100 |
#box-filter > .form{
|
101 |
border: 0
|
@@ -108,4 +108,4 @@ get_window_url_params = """
|
|
108 |
url_params = Object.fromEntries(params);
|
109 |
return url_params;
|
110 |
}
|
111 |
-
"""
|
|
|
89 |
#filter_type label > .wrap .wrap-inner input{
|
90 |
width: 1px
|
91 |
}
|
92 |
+
#filter-columns-type{
|
93 |
border:0;
|
94 |
+
padding:0.5;
|
95 |
}
|
96 |
#filter-columns-size{
|
97 |
border:0;
|
98 |
+
padding:0.5;
|
99 |
}
|
100 |
#box-filter > .form{
|
101 |
border: 0
|
|
|
108 |
url_params = Object.fromEntries(params);
|
109 |
return url_params;
|
110 |
}
|
111 |
+
"""
|
src/assets/hardcoded_evals.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from src.
|
2 |
|
3 |
gpt4_values = {
|
4 |
AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
|
@@ -6,9 +6,9 @@ gpt4_values = {
|
|
6 |
AutoEvalColumn.precision.name: None,
|
7 |
AutoEvalColumn.average.name: 84.3,
|
8 |
AutoEvalColumn.arc.name: 96.3,
|
9 |
-
AutoEvalColumn.hellaswag.name:
|
10 |
-
AutoEvalColumn.mmlu.name:
|
11 |
-
AutoEvalColumn.truthfulqa.name:
|
12 |
AutoEvalColumn.dummy.name: "GPT-4",
|
13 |
AutoEvalColumn.model_type.name: "",
|
14 |
}
|
@@ -19,9 +19,9 @@ gpt35_values = {
|
|
19 |
AutoEvalColumn.precision.name: None,
|
20 |
AutoEvalColumn.average.name: 71.9,
|
21 |
AutoEvalColumn.arc.name: 85.2,
|
22 |
-
AutoEvalColumn.hellaswag.name:
|
23 |
-
AutoEvalColumn.mmlu.name:
|
24 |
-
AutoEvalColumn.truthfulqa.name:
|
25 |
AutoEvalColumn.dummy.name: "GPT-3.5",
|
26 |
AutoEvalColumn.model_type.name: "",
|
27 |
}
|
@@ -32,10 +32,9 @@ baseline = {
|
|
32 |
AutoEvalColumn.precision.name: None,
|
33 |
AutoEvalColumn.average.name: 25.0,
|
34 |
AutoEvalColumn.arc.name: 25.0,
|
35 |
-
AutoEvalColumn.hellaswag.name:
|
36 |
-
AutoEvalColumn.mmlu.name:
|
37 |
-
AutoEvalColumn.truthfulqa.name:
|
38 |
AutoEvalColumn.dummy.name: "baseline",
|
39 |
AutoEvalColumn.model_type.name: "",
|
40 |
}
|
41 |
-
|
|
|
1 |
+
from src.display_models.utils import AutoEvalColumn, model_hyperlink
|
2 |
|
3 |
gpt4_values = {
|
4 |
AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
|
|
|
6 |
AutoEvalColumn.precision.name: None,
|
7 |
AutoEvalColumn.average.name: 84.3,
|
8 |
AutoEvalColumn.arc.name: 96.3,
|
9 |
+
AutoEvalColumn.hellaswag.name: 95.3,
|
10 |
+
AutoEvalColumn.mmlu.name: 86.4,
|
11 |
+
AutoEvalColumn.truthfulqa.name: 59.0,
|
12 |
AutoEvalColumn.dummy.name: "GPT-4",
|
13 |
AutoEvalColumn.model_type.name: "",
|
14 |
}
|
|
|
19 |
AutoEvalColumn.precision.name: None,
|
20 |
AutoEvalColumn.average.name: 71.9,
|
21 |
AutoEvalColumn.arc.name: 85.2,
|
22 |
+
AutoEvalColumn.hellaswag.name: 85.5,
|
23 |
+
AutoEvalColumn.mmlu.name: 70.0,
|
24 |
+
AutoEvalColumn.truthfulqa.name: 47.0,
|
25 |
AutoEvalColumn.dummy.name: "GPT-3.5",
|
26 |
AutoEvalColumn.model_type.name: "",
|
27 |
}
|
|
|
32 |
AutoEvalColumn.precision.name: None,
|
33 |
AutoEvalColumn.average.name: 25.0,
|
34 |
AutoEvalColumn.arc.name: 25.0,
|
35 |
+
AutoEvalColumn.hellaswag.name: 25.0,
|
36 |
+
AutoEvalColumn.mmlu.name: 25.0,
|
37 |
+
AutoEvalColumn.truthfulqa.name: 25.0,
|
38 |
AutoEvalColumn.dummy.name: "baseline",
|
39 |
AutoEvalColumn.model_type.name: "",
|
40 |
}
|
|
src/assets/text_content.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
-
from
|
2 |
|
3 |
TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
|
4 |
|
5 |
-
INTRODUCTION_TEXT =
|
6 |
📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
|
7 |
|
8 |
-
🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
|
9 |
The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
|
10 |
"""
|
11 |
|
12 |
LLM_BENCHMARKS_TEXT = f"""
|
13 |
# Context
|
14 |
-
With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
|
15 |
|
16 |
## Icons
|
17 |
{ModelType.PT.to_str(" : ")} model
|
@@ -25,14 +25,14 @@ If there is no icon, we have not uploaded the information on the model yet, feel
|
|
25 |
|
26 |
## How it works
|
27 |
|
28 |
-
📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
|
29 |
|
30 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
31 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
32 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
33 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
|
34 |
|
35 |
-
For all these evaluations, a higher score is a better score.
|
36 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
37 |
|
38 |
## Details and logs
|
@@ -46,7 +46,7 @@ To reproduce our results, here is the commands you can run, using [this version]
|
|
46 |
`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
|
47 |
` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
|
48 |
|
49 |
-
The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
|
50 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
51 |
|
52 |
The tasks and few shots parameters are:
|
@@ -65,7 +65,7 @@ If you still have questions, you can check our FAQ [here](https://huggingface.co
|
|
65 |
We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
|
66 |
"""
|
67 |
|
68 |
-
EVALUATION_QUEUE_TEXT =
|
69 |
# Evaluation Queue for the 🤗 Open LLM Leaderboard
|
70 |
|
71 |
Models added here will be automatically evaluated on the 🤗 cluster.
|
@@ -79,7 +79,7 @@ config = AutoConfig.from_pretrained("your model name", revision=revision)
|
|
79 |
model = AutoModel.from_pretrained("your model name", revision=revision)
|
80 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
81 |
```
|
82 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
83 |
|
84 |
Note: make sure your model is public!
|
85 |
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
@@ -94,8 +94,8 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
|
|
94 |
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
95 |
|
96 |
## In case of model failure
|
97 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
98 |
-
Make sure you have followed the above steps first.
|
99 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
100 |
"""
|
101 |
|
@@ -135,7 +135,7 @@ CITATION_BUTTON_TEXT = r"""
|
|
135 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
136 |
}
|
137 |
@misc{clark2018think,
|
138 |
-
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
139 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
140 |
year={2018},
|
141 |
eprint={1803.05457},
|
@@ -143,7 +143,7 @@ CITATION_BUTTON_TEXT = r"""
|
|
143 |
primaryClass={cs.AI}
|
144 |
}
|
145 |
@misc{zellers2019hellaswag,
|
146 |
-
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
147 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
148 |
year={2019},
|
149 |
eprint={1905.07830},
|
@@ -151,7 +151,7 @@ CITATION_BUTTON_TEXT = r"""
|
|
151 |
primaryClass={cs.CL}
|
152 |
}
|
153 |
@misc{hendrycks2021measuring,
|
154 |
-
title={Measuring Massive Multitask Language Understanding},
|
155 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
156 |
year={2021},
|
157 |
eprint={2009.03300},
|
@@ -159,7 +159,7 @@ CITATION_BUTTON_TEXT = r"""
|
|
159 |
primaryClass={cs.CY}
|
160 |
}
|
161 |
@misc{lin2022truthfulqa,
|
162 |
-
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
163 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
164 |
year={2022},
|
165 |
eprint={2109.07958},
|
|
|
1 |
+
from src.display_models.model_metadata_type import ModelType
|
2 |
|
3 |
TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
|
4 |
|
5 |
+
INTRODUCTION_TEXT = """
|
6 |
📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
|
7 |
|
8 |
+
🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
|
9 |
The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
|
10 |
"""
|
11 |
|
12 |
LLM_BENCHMARKS_TEXT = f"""
|
13 |
# Context
|
14 |
+
With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
|
15 |
|
16 |
## Icons
|
17 |
{ModelType.PT.to_str(" : ")} model
|
|
|
25 |
|
26 |
## How it works
|
27 |
|
28 |
+
📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
|
29 |
|
30 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
31 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
32 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
33 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
|
34 |
|
35 |
+
For all these evaluations, a higher score is a better score.
|
36 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
37 |
|
38 |
## Details and logs
|
|
|
46 |
`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
|
47 |
` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
|
48 |
|
49 |
+
The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
|
50 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
51 |
|
52 |
The tasks and few shots parameters are:
|
|
|
65 |
We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
|
66 |
"""
|
67 |
|
68 |
+
EVALUATION_QUEUE_TEXT = """
|
69 |
# Evaluation Queue for the 🤗 Open LLM Leaderboard
|
70 |
|
71 |
Models added here will be automatically evaluated on the 🤗 cluster.
|
|
|
79 |
model = AutoModel.from_pretrained("your model name", revision=revision)
|
80 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
81 |
```
|
82 |
+
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
83 |
|
84 |
Note: make sure your model is public!
|
85 |
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
|
|
94 |
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
95 |
|
96 |
## In case of model failure
|
97 |
+
If your model is displayed in the `FAILED` category, its execution stopped.
|
98 |
+
Make sure you have followed the above steps first.
|
99 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
100 |
"""
|
101 |
|
|
|
135 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
136 |
}
|
137 |
@misc{clark2018think,
|
138 |
+
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
139 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
140 |
year={2018},
|
141 |
eprint={1803.05457},
|
|
|
143 |
primaryClass={cs.AI}
|
144 |
}
|
145 |
@misc{zellers2019hellaswag,
|
146 |
+
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
147 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
148 |
year={2019},
|
149 |
eprint={1905.07830},
|
|
|
151 |
primaryClass={cs.CL}
|
152 |
}
|
153 |
@misc{hendrycks2021measuring,
|
154 |
+
title={Measuring Massive Multitask Language Understanding},
|
155 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
156 |
year={2021},
|
157 |
eprint={2009.03300},
|
|
|
159 |
primaryClass={cs.CY}
|
160 |
}
|
161 |
@misc{lin2022truthfulqa,
|
162 |
+
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
163 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
164 |
year={2022},
|
165 |
eprint={2109.07958},
|
src/auto_leaderboard/model_metadata_type.py
DELETED
@@ -1,551 +0,0 @@
|
|
1 |
-
from dataclasses import dataclass
|
2 |
-
from enum import Enum
|
3 |
-
from typing import Dict
|
4 |
-
|
5 |
-
|
6 |
-
@dataclass
|
7 |
-
class ModelInfo:
|
8 |
-
name: str
|
9 |
-
symbol: str # emoji
|
10 |
-
|
11 |
-
|
12 |
-
class ModelType(Enum):
|
13 |
-
PT = ModelInfo(name="pretrained", symbol="🟢")
|
14 |
-
FT = ModelInfo(name="fine-tuned", symbol="🔶")
|
15 |
-
IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
|
16 |
-
RL = ModelInfo(name="RL-tuned", symbol="🟦")
|
17 |
-
Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
|
18 |
-
|
19 |
-
def to_str(self, separator = " "):
|
20 |
-
return f"{self.value.symbol}{separator}{self.value.name}"
|
21 |
-
|
22 |
-
|
23 |
-
MODEL_TYPE_METADATA: Dict[str, ModelType] = {
|
24 |
-
'notstoic/PygmalionCoT-7b': ModelType.IFT,
|
25 |
-
'aisquared/dlite-v1-355m': ModelType.IFT,
|
26 |
-
'aisquared/dlite-v1-1_5b': ModelType.IFT,
|
27 |
-
'aisquared/dlite-v1-774m': ModelType.IFT,
|
28 |
-
'aisquared/dlite-v1-124m': ModelType.IFT,
|
29 |
-
'aisquared/chopt-2_7b': ModelType.IFT,
|
30 |
-
'aisquared/dlite-v2-124m': ModelType.IFT,
|
31 |
-
'aisquared/dlite-v2-774m': ModelType.IFT,
|
32 |
-
'aisquared/dlite-v2-1_5b': ModelType.IFT,
|
33 |
-
'aisquared/chopt-1_3b': ModelType.IFT,
|
34 |
-
'aisquared/dlite-v2-355m': ModelType.IFT,
|
35 |
-
'augtoma/qCammel-13': ModelType.IFT,
|
36 |
-
'Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload': ModelType.IFT,
|
37 |
-
'Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload': ModelType.IFT,
|
38 |
-
'TheBloke/alpaca-lora-65B-HF': ModelType.FT,
|
39 |
-
'TheBloke/tulu-7B-fp16': ModelType.IFT,
|
40 |
-
'TheBloke/guanaco-7B-HF': ModelType.FT,
|
41 |
-
'TheBloke/koala-7B-HF': ModelType.FT,
|
42 |
-
'TheBloke/wizardLM-7B-HF': ModelType.IFT,
|
43 |
-
'TheBloke/airoboros-13B-HF': ModelType.IFT,
|
44 |
-
'TheBloke/koala-13B-HF': ModelType.FT,
|
45 |
-
'TheBloke/Wizard-Vicuna-7B-Uncensored-HF': ModelType.FT,
|
46 |
-
'TheBloke/dromedary-65b-lora-HF': ModelType.IFT,
|
47 |
-
'TheBloke/wizardLM-13B-1.0-fp16': ModelType.IFT,
|
48 |
-
'TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16': ModelType.FT,
|
49 |
-
'TheBloke/Wizard-Vicuna-30B-Uncensored-fp16': ModelType.FT,
|
50 |
-
'TheBloke/wizard-vicuna-13B-HF': ModelType.IFT,
|
51 |
-
'TheBloke/UltraLM-13B-fp16': ModelType.IFT,
|
52 |
-
'TheBloke/OpenAssistant-FT-7-Llama-30B-HF': ModelType.FT,
|
53 |
-
'TheBloke/vicuna-13B-1.1-HF': ModelType.IFT,
|
54 |
-
'TheBloke/guanaco-13B-HF': ModelType.FT,
|
55 |
-
'TheBloke/guanaco-65B-HF': ModelType.FT,
|
56 |
-
'TheBloke/airoboros-7b-gpt4-fp16': ModelType.IFT,
|
57 |
-
'TheBloke/llama-30b-supercot-SuperHOT-8K-fp16': ModelType.IFT,
|
58 |
-
'TheBloke/Llama-2-13B-fp16': ModelType.PT,
|
59 |
-
'TheBloke/llama-2-70b-Guanaco-QLoRA-fp16': ModelType.FT,
|
60 |
-
'TheBloke/landmark-attention-llama7b-fp16': ModelType.IFT,
|
61 |
-
'TheBloke/Planner-7B-fp16': ModelType.IFT,
|
62 |
-
'TheBloke/Wizard-Vicuna-13B-Uncensored-HF': ModelType.FT,
|
63 |
-
'TheBloke/gpt4-alpaca-lora-13B-HF': ModelType.IFT,
|
64 |
-
'TheBloke/gpt4-x-vicuna-13B-HF': ModelType.IFT,
|
65 |
-
'TheBloke/gpt4-alpaca-lora_mlp-65B-HF': ModelType.IFT,
|
66 |
-
'TheBloke/tulu-13B-fp16': ModelType.IFT,
|
67 |
-
'TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16': ModelType.IFT,
|
68 |
-
'TheBloke/Llama-2-70B-fp16': ModelType.IFT,
|
69 |
-
'TheBloke/WizardLM-30B-fp16': ModelType.IFT,
|
70 |
-
'TheBloke/robin-13B-v2-fp16': ModelType.FT,
|
71 |
-
'TheBloke/robin-33B-v2-fp16': ModelType.FT,
|
72 |
-
'TheBloke/Vicuna-13B-CoT-fp16': ModelType.IFT,
|
73 |
-
'TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16': ModelType.IFT,
|
74 |
-
'TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16': ModelType.FT,
|
75 |
-
'TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16': ModelType.IFT,
|
76 |
-
'TheBloke/GPlatty-30B-SuperHOT-8K-fp16': ModelType.FT,
|
77 |
-
'TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16': ModelType.IFT,
|
78 |
-
'TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16': ModelType.IFT,
|
79 |
-
'jphme/orca_mini_v2_ger_7b': ModelType.IFT,
|
80 |
-
'Ejafa/vicuna_7B_vanilla_1.1': ModelType.FT,
|
81 |
-
'kevinpro/Vicuna-13B-CoT': ModelType.IFT,
|
82 |
-
'AlekseyKorshuk/pygmalion-6b-vicuna-chatml': ModelType.FT,
|
83 |
-
'AlekseyKorshuk/chatml-pyg-v1': ModelType.FT,
|
84 |
-
'concedo/Vicuzard-30B-Uncensored': ModelType.FT,
|
85 |
-
'concedo/OPT-19M-ChatSalad': ModelType.FT,
|
86 |
-
'concedo/Pythia-70M-ChatSalad': ModelType.FT,
|
87 |
-
'digitous/13B-HyperMantis': ModelType.IFT,
|
88 |
-
'digitous/Adventien-GPTJ': ModelType.FT,
|
89 |
-
'digitous/Alpacino13b': ModelType.IFT,
|
90 |
-
'digitous/GPT-R': ModelType.IFT,
|
91 |
-
'digitous/Javelin-R': ModelType.IFT,
|
92 |
-
'digitous/Javalion-GPTJ': ModelType.IFT,
|
93 |
-
'digitous/Javalion-R': ModelType.IFT,
|
94 |
-
'digitous/Skegma-GPTJ': ModelType.FT,
|
95 |
-
'digitous/Alpacino30b': ModelType.IFT,
|
96 |
-
'digitous/Janin-GPTJ': ModelType.FT,
|
97 |
-
'digitous/Janin-R': ModelType.FT,
|
98 |
-
'digitous/Javelin-GPTJ': ModelType.FT,
|
99 |
-
'SaylorTwift/gpt2_test': ModelType.PT,
|
100 |
-
'anton-l/gpt-j-tiny-random': ModelType.FT,
|
101 |
-
'Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca': ModelType.FT,
|
102 |
-
'Lazycuber/pyg-instruct-wizardlm': ModelType.FT,
|
103 |
-
'Lazycuber/Janemalion-6B': ModelType.FT,
|
104 |
-
'IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1': ModelType.FT,
|
105 |
-
'IDEA-CCNL/Ziya-LLaMA-13B-v1': ModelType.IFT,
|
106 |
-
'dsvv-cair/alpaca-cleaned-llama-30b-bf16': ModelType.FT,
|
107 |
-
'gpt2-medium': ModelType.PT,
|
108 |
-
'camel-ai/CAMEL-13B-Combined-Data': ModelType.IFT,
|
109 |
-
'camel-ai/CAMEL-13B-Role-Playing-Data': ModelType.FT,
|
110 |
-
'camel-ai/CAMEL-33B-Combined-Data': ModelType.IFT,
|
111 |
-
'PygmalionAI/pygmalion-6b': ModelType.FT,
|
112 |
-
'PygmalionAI/metharme-1.3b': ModelType.IFT,
|
113 |
-
'PygmalionAI/pygmalion-1.3b': ModelType.FT,
|
114 |
-
'PygmalionAI/pygmalion-350m': ModelType.FT,
|
115 |
-
'PygmalionAI/pygmalion-2.7b': ModelType.FT,
|
116 |
-
'medalpaca/medalpaca-7b': ModelType.FT,
|
117 |
-
'lilloukas/Platypus-30B': ModelType.IFT,
|
118 |
-
'lilloukas/GPlatty-30B': ModelType.FT,
|
119 |
-
'mncai/chatdoctor': ModelType.FT,
|
120 |
-
'chaoyi-wu/MedLLaMA_13B': ModelType.FT,
|
121 |
-
'LoupGarou/WizardCoder-Guanaco-15B-V1.0': ModelType.IFT,
|
122 |
-
'LoupGarou/WizardCoder-Guanaco-15B-V1.1': ModelType.FT,
|
123 |
-
'hakurei/instruct-12b': ModelType.IFT,
|
124 |
-
'hakurei/lotus-12B': ModelType.FT,
|
125 |
-
'shibing624/chinese-llama-plus-13b-hf': ModelType.IFT,
|
126 |
-
'shibing624/chinese-alpaca-plus-7b-hf': ModelType.IFT,
|
127 |
-
'shibing624/chinese-alpaca-plus-13b-hf': ModelType.IFT,
|
128 |
-
'mosaicml/mpt-7b-instruct': ModelType.IFT,
|
129 |
-
'mosaicml/mpt-30b-chat': ModelType.IFT,
|
130 |
-
'mosaicml/mpt-7b-storywriter': ModelType.FT,
|
131 |
-
'mosaicml/mpt-30b-instruct': ModelType.IFT,
|
132 |
-
'mosaicml/mpt-7b-chat': ModelType.IFT,
|
133 |
-
'mosaicml/mpt-30b': ModelType.PT,
|
134 |
-
'Corianas/111m': ModelType.IFT,
|
135 |
-
'Corianas/Quokka_1.3b': ModelType.IFT,
|
136 |
-
'Corianas/256_5epoch': ModelType.FT,
|
137 |
-
'Corianas/Quokka_256m': ModelType.IFT,
|
138 |
-
'Corianas/Quokka_590m': ModelType.IFT,
|
139 |
-
'Corianas/gpt-j-6B-Dolly': ModelType.FT,
|
140 |
-
'Corianas/Quokka_2.7b': ModelType.IFT,
|
141 |
-
'cyberagent/open-calm-7b': ModelType.FT,
|
142 |
-
'Aspik101/Nous-Hermes-13b-pl-lora_unload': ModelType.IFT,
|
143 |
-
'THUDM/chatglm2-6b': ModelType.IFT,
|
144 |
-
'MetaIX/GPT4-X-Alpasta-30b': ModelType.IFT,
|
145 |
-
'NYTK/PULI-GPTrio': ModelType.PT,
|
146 |
-
'EleutherAI/pythia-1.3b': ModelType.PT,
|
147 |
-
'EleutherAI/pythia-2.8b-deduped': ModelType.PT,
|
148 |
-
'EleutherAI/gpt-neo-125m': ModelType.PT,
|
149 |
-
'EleutherAI/pythia-160m': ModelType.PT,
|
150 |
-
'EleutherAI/gpt-neo-2.7B': ModelType.PT,
|
151 |
-
'EleutherAI/pythia-1b-deduped': ModelType.PT,
|
152 |
-
'EleutherAI/pythia-6.7b': ModelType.PT,
|
153 |
-
'EleutherAI/pythia-70m-deduped': ModelType.PT,
|
154 |
-
'EleutherAI/gpt-neox-20b': ModelType.PT,
|
155 |
-
'EleutherAI/pythia-1.4b-deduped': ModelType.PT,
|
156 |
-
'EleutherAI/pythia-2.7b': ModelType.PT,
|
157 |
-
'EleutherAI/pythia-6.9b-deduped': ModelType.PT,
|
158 |
-
'EleutherAI/pythia-70m': ModelType.PT,
|
159 |
-
'EleutherAI/gpt-j-6b': ModelType.PT,
|
160 |
-
'EleutherAI/pythia-12b-deduped': ModelType.PT,
|
161 |
-
'EleutherAI/gpt-neo-1.3B': ModelType.PT,
|
162 |
-
'EleutherAI/pythia-410m-deduped': ModelType.PT,
|
163 |
-
'EleutherAI/pythia-160m-deduped': ModelType.PT,
|
164 |
-
'EleutherAI/polyglot-ko-12.8b': ModelType.PT,
|
165 |
-
'EleutherAI/pythia-12b': ModelType.PT,
|
166 |
-
'roneneldan/TinyStories-33M': ModelType.PT,
|
167 |
-
'roneneldan/TinyStories-28M': ModelType.PT,
|
168 |
-
'roneneldan/TinyStories-1M': ModelType.PT,
|
169 |
-
'roneneldan/TinyStories-8M': ModelType.PT,
|
170 |
-
'roneneldan/TinyStories-3M': ModelType.PT,
|
171 |
-
'jerryjalapeno/nart-100k-7b': ModelType.FT,
|
172 |
-
'lmsys/vicuna-13b-v1.3': ModelType.IFT,
|
173 |
-
'lmsys/vicuna-7b-v1.3': ModelType.IFT,
|
174 |
-
'lmsys/vicuna-13b-v1.1': ModelType.IFT,
|
175 |
-
'lmsys/vicuna-13b-delta-v1.1': ModelType.IFT,
|
176 |
-
'lmsys/vicuna-7b-delta-v1.1': ModelType.IFT,
|
177 |
-
'abhiramtirumala/DialoGPT-sarcastic-medium': ModelType.FT,
|
178 |
-
'haonan-li/bactrian-x-llama-13b-merged': ModelType.IFT,
|
179 |
-
'Gryphe/MythoLogic-13b': ModelType.IFT,
|
180 |
-
'Gryphe/MythoBoros-13b': ModelType.IFT,
|
181 |
-
'pillowtalks-ai/delta13b': ModelType.FT,
|
182 |
-
'wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard': ModelType.FT,
|
183 |
-
'bigscience/bloom-7b1': ModelType.PT,
|
184 |
-
'bigcode/tiny_starcoder_py': ModelType.PT,
|
185 |
-
'bigcode/starcoderplus': ModelType.FT,
|
186 |
-
'bigcode/gpt_bigcode-santacoder': ModelType.PT,
|
187 |
-
'bigcode/starcoder': ModelType.PT,
|
188 |
-
'Open-Orca/OpenOrca-Preview1-13B': ModelType.IFT,
|
189 |
-
'microsoft/DialoGPT-large': ModelType.FT,
|
190 |
-
'microsoft/DialoGPT-small': ModelType.FT,
|
191 |
-
'microsoft/DialoGPT-medium': ModelType.FT,
|
192 |
-
'microsoft/CodeGPT-small-py': ModelType.FT,
|
193 |
-
'Tincando/fiction_story_generator': ModelType.FT,
|
194 |
-
'Pirr/pythia-13b-deduped-green_devil': ModelType.FT,
|
195 |
-
'Aeala/GPT4-x-AlpacaDente2-30b': ModelType.FT,
|
196 |
-
'Aeala/GPT4-x-AlpacaDente-30b': ModelType.FT,
|
197 |
-
'Aeala/GPT4-x-Alpasta-13b': ModelType.FT,
|
198 |
-
'Aeala/VicUnlocked-alpaca-30b': ModelType.IFT,
|
199 |
-
'Tap-M/Luna-AI-Llama2-Uncensored': ModelType.FT,
|
200 |
-
'illuin/test-custom-llama': ModelType.FT,
|
201 |
-
'dvruette/oasst-llama-13b-2-epochs': ModelType.FT,
|
202 |
-
'dvruette/oasst-gpt-neox-20b-1000-steps': ModelType.FT,
|
203 |
-
'dvruette/llama-13b-pretrained-dropout': ModelType.PT,
|
204 |
-
'dvruette/llama-13b-pretrained': ModelType.PT,
|
205 |
-
'dvruette/llama-13b-pretrained-sft-epoch-1': ModelType.FT,
|
206 |
-
'dvruette/llama-13b-pretrained-sft-do2': ModelType.FT,
|
207 |
-
'dvruette/oasst-gpt-neox-20b-3000-steps': ModelType.FT,
|
208 |
-
'dvruette/oasst-pythia-12b-pretrained-sft': ModelType.FT,
|
209 |
-
'dvruette/oasst-pythia-6.9b-4000-steps': ModelType.FT,
|
210 |
-
'dvruette/gpt-neox-20b-full-precision': ModelType.FT,
|
211 |
-
'dvruette/oasst-llama-13b-1000-steps': ModelType.FT,
|
212 |
-
'openlm-research/open_llama_7b_700bt_preview': ModelType.PT,
|
213 |
-
'openlm-research/open_llama_7b': ModelType.PT,
|
214 |
-
'openlm-research/open_llama_7b_v2': ModelType.PT,
|
215 |
-
'openlm-research/open_llama_3b': ModelType.PT,
|
216 |
-
'openlm-research/open_llama_13b': ModelType.PT,
|
217 |
-
'openlm-research/open_llama_3b_v2': ModelType.PT,
|
218 |
-
'PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged': ModelType.IFT,
|
219 |
-
'GeorgiaTechResearchInstitute/galpaca-30b': ModelType.IFT,
|
220 |
-
'GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct': ModelType.IFT,
|
221 |
-
'databricks/dolly-v2-7b': ModelType.IFT,
|
222 |
-
'databricks/dolly-v2-3b': ModelType.IFT,
|
223 |
-
'databricks/dolly-v2-12b': ModelType.IFT,
|
224 |
-
'Rachneet/gpt2-xl-alpaca': ModelType.FT,
|
225 |
-
'Locutusque/gpt2-conversational-or-qa': ModelType.FT,
|
226 |
-
'psyche/kogpt': ModelType.FT,
|
227 |
-
'NbAiLab/nb-gpt-j-6B-alpaca': ModelType.IFT,
|
228 |
-
'Mikael110/llama-2-7b-guanaco-fp16': ModelType.FT,
|
229 |
-
'Mikael110/llama-2-13b-guanaco-fp16': ModelType.FT,
|
230 |
-
'Fredithefish/CrimsonPajama': ModelType.IFT,
|
231 |
-
'Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K': ModelType.FT,
|
232 |
-
'Fredithefish/ScarletPajama-3B-HF': ModelType.FT,
|
233 |
-
'Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4': ModelType.IFT,
|
234 |
-
'acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1': ModelType.IFT,
|
235 |
-
'eachadea/vicuna-13b-1.1': ModelType.FT,
|
236 |
-
'eachadea/vicuna-7b-1.1': ModelType.FT,
|
237 |
-
'eachadea/vicuna-13b': ModelType.FT,
|
238 |
-
'openaccess-ai-collective/wizard-mega-13b': ModelType.IFT,
|
239 |
-
'openaccess-ai-collective/manticore-13b': ModelType.IFT,
|
240 |
-
'openaccess-ai-collective/manticore-30b-chat-pyg-alpha': ModelType.IFT,
|
241 |
-
'openaccess-ai-collective/minotaur-13b': ModelType.IFT,
|
242 |
-
'openaccess-ai-collective/minotaur-13b-fixed': ModelType.IFT,
|
243 |
-
'openaccess-ai-collective/hippogriff-30b-chat': ModelType.IFT,
|
244 |
-
'openaccess-ai-collective/manticore-13b-chat-pyg': ModelType.IFT,
|
245 |
-
'pythainlp/wangchanglm-7.5B-sft-enth': ModelType.IFT,
|
246 |
-
'pythainlp/wangchanglm-7.5B-sft-en-sharded': ModelType.IFT,
|
247 |
-
'euclaise/gpt-neox-122m-minipile-digits': ModelType.FT,
|
248 |
-
'stabilityai/StableBeluga1-Delta': ModelType.IFT,
|
249 |
-
'stabilityai/stablelm-tuned-alpha-7b': ModelType.IFT,
|
250 |
-
'stabilityai/StableBeluga2': ModelType.IFT,
|
251 |
-
'stabilityai/StableBeluga-13B': ModelType.IFT,
|
252 |
-
'stabilityai/StableBeluga-7B': ModelType.IFT,
|
253 |
-
'stabilityai/stablelm-base-alpha-7b': ModelType.PT,
|
254 |
-
'stabilityai/stablelm-base-alpha-3b': ModelType.PT,
|
255 |
-
'stabilityai/stablelm-tuned-alpha-3b': ModelType.IFT,
|
256 |
-
'alibidaran/medical_transcription_generator': ModelType.FT,
|
257 |
-
'CalderaAI/30B-Lazarus': ModelType.IFT,
|
258 |
-
'CalderaAI/13B-BlueMethod': ModelType.IFT,
|
259 |
-
'CalderaAI/13B-Ouroboros': ModelType.IFT,
|
260 |
-
'KoboldAI/OPT-13B-Erebus': ModelType.FT,
|
261 |
-
'KoboldAI/GPT-J-6B-Janeway': ModelType.FT,
|
262 |
-
'KoboldAI/GPT-J-6B-Shinen': ModelType.FT,
|
263 |
-
'KoboldAI/fairseq-dense-2.7B': ModelType.PT,
|
264 |
-
'KoboldAI/OPT-6B-nerys-v2': ModelType.FT,
|
265 |
-
'KoboldAI/GPT-NeoX-20B-Skein': ModelType.FT,
|
266 |
-
'KoboldAI/PPO_Pygway-6b-Mix': ModelType.FT,
|
267 |
-
'KoboldAI/fairseq-dense-6.7B': ModelType.PT,
|
268 |
-
'KoboldAI/fairseq-dense-125M': ModelType.PT,
|
269 |
-
'KoboldAI/OPT-13B-Nerybus-Mix': ModelType.FT,
|
270 |
-
'KoboldAI/OPT-2.7B-Erebus': ModelType.FT,
|
271 |
-
'KoboldAI/OPT-350M-Nerys-v2': ModelType.FT,
|
272 |
-
'KoboldAI/OPT-2.7B-Nerys-v2': ModelType.FT,
|
273 |
-
'KoboldAI/OPT-2.7B-Nerybus-Mix': ModelType.FT,
|
274 |
-
'KoboldAI/OPT-13B-Nerys-v2': ModelType.FT,
|
275 |
-
'KoboldAI/GPT-NeoX-20B-Erebus': ModelType.FT,
|
276 |
-
'KoboldAI/OPT-6.7B-Erebus': ModelType.FT,
|
277 |
-
'KoboldAI/fairseq-dense-355M': ModelType.PT,
|
278 |
-
'KoboldAI/OPT-6.7B-Nerybus-Mix': ModelType.FT,
|
279 |
-
'KoboldAI/GPT-J-6B-Adventure': ModelType.FT,
|
280 |
-
'KoboldAI/OPT-350M-Erebus': ModelType.FT,
|
281 |
-
'KoboldAI/GPT-J-6B-Skein': ModelType.FT,
|
282 |
-
'KoboldAI/OPT-30B-Erebus': ModelType.FT,
|
283 |
-
'klosax/pythia-160m-deduped-step92k-193bt': ModelType.PT,
|
284 |
-
'klosax/open_llama_3b_350bt_preview': ModelType.PT,
|
285 |
-
'klosax/openllama-3b-350bt': ModelType.PT,
|
286 |
-
'klosax/pythia-70m-deduped-step44k-92bt': ModelType.PT,
|
287 |
-
'klosax/open_llama_13b_600bt_preview': ModelType.PT,
|
288 |
-
'klosax/open_llama_7b_400bt_preview': ModelType.PT,
|
289 |
-
'kfkas/Llama-2-ko-7b-Chat': ModelType.IFT,
|
290 |
-
'WeOpenML/Alpaca-7B-v1': ModelType.IFT,
|
291 |
-
'WeOpenML/PandaLM-Alpaca-7B-v1': ModelType.IFT,
|
292 |
-
'TFLai/gpt2-turkish-uncased': ModelType.FT,
|
293 |
-
'ehartford/WizardLM-13B-Uncensored': ModelType.IFT,
|
294 |
-
'ehartford/dolphin-llama-13b': ModelType.IFT,
|
295 |
-
'ehartford/Wizard-Vicuna-30B-Uncensored': ModelType.FT,
|
296 |
-
'ehartford/WizardLM-30B-Uncensored': ModelType.IFT,
|
297 |
-
'ehartford/Wizard-Vicuna-13B-Uncensored': ModelType.FT,
|
298 |
-
'ehartford/WizardLM-7B-Uncensored': ModelType.IFT,
|
299 |
-
'ehartford/based-30b': ModelType.FT,
|
300 |
-
'ehartford/Wizard-Vicuna-7B-Uncensored': ModelType.FT,
|
301 |
-
'wahaha1987/llama_7b_sharegpt94k_fastchat': ModelType.FT,
|
302 |
-
'wahaha1987/llama_13b_sharegpt94k_fastchat': ModelType.FT,
|
303 |
-
'OpenAssistant/oasst-sft-1-pythia-12b': ModelType.FT,
|
304 |
-
'OpenAssistant/stablelm-7b-sft-v7-epoch-3': ModelType.IFT,
|
305 |
-
'OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5': ModelType.FT,
|
306 |
-
'OpenAssistant/pythia-12b-sft-v8-2.5k-steps': ModelType.IFT,
|
307 |
-
'OpenAssistant/pythia-12b-sft-v8-7k-steps': ModelType.IFT,
|
308 |
-
'OpenAssistant/pythia-12b-pre-v8-12.5k-steps': ModelType.IFT,
|
309 |
-
'OpenAssistant/llama2-13b-orca-8k-3319': ModelType.IFT,
|
310 |
-
'junelee/wizard-vicuna-13b': ModelType.FT,
|
311 |
-
'BreadAi/gpt-YA-1-1_160M': ModelType.PT,
|
312 |
-
'BreadAi/MuseCan': ModelType.PT,
|
313 |
-
'BreadAi/MusePy-1-2': ModelType.PT,
|
314 |
-
'BreadAi/DiscordPy': ModelType.PT,
|
315 |
-
'BreadAi/PM_modelV2': ModelType.PT,
|
316 |
-
'BreadAi/gpt-Youtube': ModelType.PT,
|
317 |
-
'BreadAi/StoryPy': ModelType.FT,
|
318 |
-
'julianweng/Llama-2-7b-chat-orcah': ModelType.FT,
|
319 |
-
'AGI-inc/lora_moe_7b_baseline': ModelType.FT,
|
320 |
-
'AGI-inc/lora_moe_7b': ModelType.FT,
|
321 |
-
'togethercomputer/GPT-NeoXT-Chat-Base-20B': ModelType.IFT,
|
322 |
-
'togethercomputer/RedPajama-INCITE-Chat-7B-v0.1': ModelType.IFT,
|
323 |
-
'togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1': ModelType.IFT,
|
324 |
-
'togethercomputer/RedPajama-INCITE-7B-Base': ModelType.PT,
|
325 |
-
'togethercomputer/RedPajama-INCITE-7B-Instruct': ModelType.IFT,
|
326 |
-
'togethercomputer/RedPajama-INCITE-Base-3B-v1': ModelType.PT,
|
327 |
-
'togethercomputer/Pythia-Chat-Base-7B': ModelType.IFT,
|
328 |
-
'togethercomputer/RedPajama-INCITE-Base-7B-v0.1': ModelType.PT,
|
329 |
-
'togethercomputer/GPT-JT-6B-v1': ModelType.IFT,
|
330 |
-
'togethercomputer/GPT-JT-6B-v0': ModelType.IFT,
|
331 |
-
'togethercomputer/RedPajama-INCITE-Chat-3B-v1': ModelType.IFT,
|
332 |
-
'togethercomputer/RedPajama-INCITE-7B-Chat': ModelType.IFT,
|
333 |
-
'togethercomputer/RedPajama-INCITE-Instruct-3B-v1': ModelType.IFT,
|
334 |
-
'Writer/camel-5b-hf': ModelType.IFT,
|
335 |
-
'Writer/palmyra-base': ModelType.PT,
|
336 |
-
'MBZUAI/LaMini-GPT-1.5B': ModelType.IFT,
|
337 |
-
'MBZUAI/lamini-cerebras-111m': ModelType.IFT,
|
338 |
-
'MBZUAI/lamini-neo-1.3b': ModelType.IFT,
|
339 |
-
'MBZUAI/lamini-cerebras-1.3b': ModelType.IFT,
|
340 |
-
'MBZUAI/lamini-cerebras-256m': ModelType.IFT,
|
341 |
-
'MBZUAI/LaMini-GPT-124M': ModelType.IFT,
|
342 |
-
'MBZUAI/lamini-neo-125m': ModelType.IFT,
|
343 |
-
'TehVenom/DiffMerge-DollyGPT-Pygmalion': ModelType.FT,
|
344 |
-
'TehVenom/PPO_Shygmalion-6b': ModelType.FT,
|
345 |
-
'TehVenom/Dolly_Shygmalion-6b-Dev_V8P2': ModelType.FT,
|
346 |
-
'TehVenom/Pygmalion_AlpacaLora-7b': ModelType.FT,
|
347 |
-
'TehVenom/PPO_Pygway-V8p4_Dev-6b': ModelType.FT,
|
348 |
-
'TehVenom/Dolly_Malion-6b': ModelType.FT,
|
349 |
-
'TehVenom/PPO_Shygmalion-V8p4_Dev-6b': ModelType.FT,
|
350 |
-
'TehVenom/ChanMalion': ModelType.FT,
|
351 |
-
'TehVenom/GPT-J-Pyg_PPO-6B': ModelType.IFT,
|
352 |
-
'TehVenom/Pygmalion-13b-Merged': ModelType.FT,
|
353 |
-
'TehVenom/Metharme-13b-Merged': ModelType.IFT,
|
354 |
-
'TehVenom/Dolly_Shygmalion-6b': ModelType.FT,
|
355 |
-
'TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4': ModelType.IFT,
|
356 |
-
'georgesung/llama2_7b_chat_uncensored': ModelType.FT,
|
357 |
-
'vicgalle/gpt2-alpaca': ModelType.IFT,
|
358 |
-
'vicgalle/alpaca-7b': ModelType.FT,
|
359 |
-
'vicgalle/gpt2-alpaca-gpt4': ModelType.IFT,
|
360 |
-
'facebook/opt-350m': ModelType.PT,
|
361 |
-
'facebook/opt-125m': ModelType.PT,
|
362 |
-
'facebook/xglm-4.5B': ModelType.PT,
|
363 |
-
'facebook/opt-2.7b': ModelType.PT,
|
364 |
-
'facebook/opt-6.7b': ModelType.PT,
|
365 |
-
'facebook/galactica-30b': ModelType.PT,
|
366 |
-
'facebook/opt-13b': ModelType.PT,
|
367 |
-
'facebook/opt-66b': ModelType.PT,
|
368 |
-
'facebook/xglm-7.5B': ModelType.PT,
|
369 |
-
'facebook/xglm-564M': ModelType.PT,
|
370 |
-
'facebook/opt-30b': ModelType.PT,
|
371 |
-
'golaxy/gogpt-7b': ModelType.FT,
|
372 |
-
'golaxy/gogpt2-7b': ModelType.FT,
|
373 |
-
'golaxy/gogpt-7b-bloom': ModelType.FT,
|
374 |
-
'golaxy/gogpt-3b-bloom': ModelType.FT,
|
375 |
-
'psmathur/orca_mini_v2_7b': ModelType.IFT,
|
376 |
-
'psmathur/orca_mini_7b': ModelType.IFT,
|
377 |
-
'psmathur/orca_mini_3b': ModelType.IFT,
|
378 |
-
'psmathur/orca_mini_v2_13b': ModelType.IFT,
|
379 |
-
'gpt2-xl': ModelType.PT,
|
380 |
-
'lxe/Cerebras-GPT-2.7B-Alpaca-SP': ModelType.FT,
|
381 |
-
'Monero/Manticore-13b-Chat-Pyg-Guanaco': ModelType.FT,
|
382 |
-
'Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b': ModelType.IFT,
|
383 |
-
'Monero/WizardLM-13b-OpenAssistant-Uncensored': ModelType.IFT,
|
384 |
-
'Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b': ModelType.IFT,
|
385 |
-
'jzjiao/opt-1.3b-rlhf': ModelType.FT,
|
386 |
-
'HuggingFaceH4/starchat-beta': ModelType.IFT,
|
387 |
-
'KnutJaegersberg/gpt-2-xl-EvolInstruct': ModelType.IFT,
|
388 |
-
'KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct': ModelType.IFT,
|
389 |
-
'KnutJaegersberg/galactica-orca-wizardlm-1.3b': ModelType.IFT,
|
390 |
-
'openchat/openchat_8192': ModelType.IFT,
|
391 |
-
'openchat/openchat_v2': ModelType.IFT,
|
392 |
-
'openchat/openchat_v2_w': ModelType.IFT,
|
393 |
-
'ausboss/llama-13b-supercot': ModelType.IFT,
|
394 |
-
'ausboss/llama-30b-supercot': ModelType.IFT,
|
395 |
-
'Neko-Institute-of-Science/metharme-7b': ModelType.IFT,
|
396 |
-
'Neko-Institute-of-Science/pygmalion-7b': ModelType.FT,
|
397 |
-
'SebastianSchramm/Cerebras-GPT-111M-instruction': ModelType.IFT,
|
398 |
-
'victor123/WizardLM-13B-1.0': ModelType.IFT,
|
399 |
-
'OpenBuddy/openbuddy-openllama-13b-v7-fp16': ModelType.FT,
|
400 |
-
'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16': ModelType.FT,
|
401 |
-
'OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16': ModelType.FT,
|
402 |
-
'baichuan-inc/Baichuan-7B': ModelType.PT,
|
403 |
-
'tiiuae/falcon-40b-instruct': ModelType.IFT,
|
404 |
-
'tiiuae/falcon-40b': ModelType.PT,
|
405 |
-
'tiiuae/falcon-7b': ModelType.PT,
|
406 |
-
'YeungNLP/firefly-llama-13b': ModelType.FT,
|
407 |
-
'YeungNLP/firefly-llama-13b-v1.2': ModelType.FT,
|
408 |
-
'YeungNLP/firefly-llama2-13b': ModelType.FT,
|
409 |
-
'YeungNLP/firefly-ziya-13b': ModelType.FT,
|
410 |
-
'shaohang/Sparse0.5_OPT-1.3': ModelType.FT,
|
411 |
-
'xzuyn/Alpacino-SuperCOT-13B': ModelType.IFT,
|
412 |
-
'xzuyn/MedicWizard-7B': ModelType.FT,
|
413 |
-
'xDAN-AI/xDAN_13b_l2_lora': ModelType.FT,
|
414 |
-
'beomi/KoAlpaca-Polyglot-5.8B': ModelType.FT,
|
415 |
-
'beomi/llama-2-ko-7b': ModelType.IFT,
|
416 |
-
'Salesforce/codegen-6B-multi': ModelType.PT,
|
417 |
-
'Salesforce/codegen-16B-nl': ModelType.PT,
|
418 |
-
'Salesforce/codegen-6B-nl': ModelType.PT,
|
419 |
-
'ai-forever/rugpt3large_based_on_gpt2': ModelType.FT,
|
420 |
-
'gpt2-large': ModelType.PT,
|
421 |
-
'frank098/orca_mini_3b_juniper': ModelType.FT,
|
422 |
-
'frank098/WizardLM_13B_juniper': ModelType.FT,
|
423 |
-
'FPHam/Free_Sydney_13b_HF': ModelType.FT,
|
424 |
-
'huggingface/llama-13b': ModelType.PT,
|
425 |
-
'huggingface/llama-7b': ModelType.PT,
|
426 |
-
'huggingface/llama-65b': ModelType.PT,
|
427 |
-
'huggingface/llama-30b': ModelType.PT,
|
428 |
-
'Henk717/chronoboros-33B': ModelType.IFT,
|
429 |
-
'jondurbin/airoboros-13b-gpt4-1.4': ModelType.IFT,
|
430 |
-
'jondurbin/airoboros-7b': ModelType.IFT,
|
431 |
-
'jondurbin/airoboros-7b-gpt4': ModelType.IFT,
|
432 |
-
'jondurbin/airoboros-7b-gpt4-1.1': ModelType.IFT,
|
433 |
-
'jondurbin/airoboros-7b-gpt4-1.2': ModelType.IFT,
|
434 |
-
'jondurbin/airoboros-7b-gpt4-1.3': ModelType.IFT,
|
435 |
-
'jondurbin/airoboros-7b-gpt4-1.4': ModelType.IFT,
|
436 |
-
'jondurbin/airoboros-l2-7b-gpt4-1.4.1': ModelType.IFT,
|
437 |
-
'jondurbin/airoboros-l2-13b-gpt4-1.4.1': ModelType.IFT,
|
438 |
-
'jondurbin/airoboros-l2-70b-gpt4-1.4.1': ModelType.IFT,
|
439 |
-
'jondurbin/airoboros-13b': ModelType.IFT,
|
440 |
-
'jondurbin/airoboros-33b-gpt4-1.4': ModelType.IFT,
|
441 |
-
'jondurbin/airoboros-33b-gpt4-1.2': ModelType.IFT,
|
442 |
-
'jondurbin/airoboros-65b-gpt4-1.2': ModelType.IFT,
|
443 |
-
'ariellee/SuperPlatty-30B': ModelType.IFT,
|
444 |
-
'danielhanchen/open_llama_3b_600bt_preview': ModelType.FT,
|
445 |
-
'cerebras/Cerebras-GPT-256M': ModelType.PT,
|
446 |
-
'cerebras/Cerebras-GPT-1.3B': ModelType.PT,
|
447 |
-
'cerebras/Cerebras-GPT-13B': ModelType.PT,
|
448 |
-
'cerebras/Cerebras-GPT-2.7B': ModelType.PT,
|
449 |
-
'cerebras/Cerebras-GPT-111M': ModelType.PT,
|
450 |
-
'cerebras/Cerebras-GPT-6.7B': ModelType.PT,
|
451 |
-
'Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf': ModelType.RL,
|
452 |
-
'Yhyu13/llama-30B-hf-openassitant': ModelType.FT,
|
453 |
-
'NousResearch/Nous-Hermes-Llama2-13b': ModelType.IFT,
|
454 |
-
'NousResearch/Nous-Hermes-llama-2-7b': ModelType.IFT,
|
455 |
-
'NousResearch/Redmond-Puffin-13B': ModelType.IFT,
|
456 |
-
'NousResearch/Nous-Hermes-13b': ModelType.IFT,
|
457 |
-
'project-baize/baize-v2-7b': ModelType.IFT,
|
458 |
-
'project-baize/baize-v2-13b': ModelType.IFT,
|
459 |
-
'LLMs/WizardLM-13B-V1.0': ModelType.FT,
|
460 |
-
'LLMs/AlpacaGPT4-7B-elina': ModelType.FT,
|
461 |
-
'wenge-research/yayi-7b': ModelType.FT,
|
462 |
-
'wenge-research/yayi-7b-llama2': ModelType.FT,
|
463 |
-
'wenge-research/yayi-13b-llama2': ModelType.FT,
|
464 |
-
'yhyhy3/open_llama_7b_v2_med_instruct': ModelType.IFT,
|
465 |
-
'llama-anon/instruct-13b': ModelType.IFT,
|
466 |
-
'huggingtweets/jerma985': ModelType.FT,
|
467 |
-
'huggingtweets/gladosystem': ModelType.FT,
|
468 |
-
'huggingtweets/bladeecity-jerma985': ModelType.FT,
|
469 |
-
'huggyllama/llama-13b': ModelType.PT,
|
470 |
-
'huggyllama/llama-65b': ModelType.PT,
|
471 |
-
'FabbriSimo01/Facebook_opt_1.3b_Quantized': ModelType.PT,
|
472 |
-
'upstage/Llama-2-70b-instruct': ModelType.IFT,
|
473 |
-
'upstage/Llama-2-70b-instruct-1024': ModelType.IFT,
|
474 |
-
'upstage/llama-65b-instruct': ModelType.IFT,
|
475 |
-
'upstage/llama-30b-instruct-2048': ModelType.IFT,
|
476 |
-
'upstage/llama-30b-instruct': ModelType.IFT,
|
477 |
-
'WizardLM/WizardLM-13B-1.0': ModelType.IFT,
|
478 |
-
'WizardLM/WizardLM-13B-V1.1': ModelType.IFT,
|
479 |
-
'WizardLM/WizardLM-13B-V1.2': ModelType.IFT,
|
480 |
-
'WizardLM/WizardLM-30B-V1.0': ModelType.IFT,
|
481 |
-
'WizardLM/WizardCoder-15B-V1.0': ModelType.IFT,
|
482 |
-
'gpt2': ModelType.PT,
|
483 |
-
'keyfan/vicuna-chinese-replication-v1.1': ModelType.IFT,
|
484 |
-
'nthngdy/pythia-owt2-70m-100k': ModelType.FT,
|
485 |
-
'nthngdy/pythia-owt2-70m-50k': ModelType.FT,
|
486 |
-
'quantumaikr/KoreanLM-hf': ModelType.FT,
|
487 |
-
'quantumaikr/open_llama_7b_hf': ModelType.FT,
|
488 |
-
'quantumaikr/QuantumLM-70B-hf': ModelType.IFT,
|
489 |
-
'MayaPH/FinOPT-Lincoln': ModelType.FT,
|
490 |
-
'MayaPH/FinOPT-Franklin': ModelType.FT,
|
491 |
-
'MayaPH/GodziLLa-30B': ModelType.IFT,
|
492 |
-
'MayaPH/GodziLLa-30B-plus': ModelType.IFT,
|
493 |
-
'MayaPH/FinOPT-Washington': ModelType.FT,
|
494 |
-
'ogimgio/gpt-neo-125m-neurallinguisticpioneers': ModelType.FT,
|
495 |
-
'layoric/llama-2-13b-code-alpaca': ModelType.FT,
|
496 |
-
'CobraMamba/mamba-gpt-3b': ModelType.FT,
|
497 |
-
'CobraMamba/mamba-gpt-3b-v2': ModelType.FT,
|
498 |
-
'CobraMamba/mamba-gpt-3b-v3': ModelType.FT,
|
499 |
-
'timdettmers/guanaco-33b-merged': ModelType.FT,
|
500 |
-
'elinas/chronos-33b': ModelType.IFT,
|
501 |
-
'heegyu/RedTulu-Uncensored-3B-0719': ModelType.IFT,
|
502 |
-
'heegyu/WizardVicuna-Uncensored-3B-0719': ModelType.IFT,
|
503 |
-
'heegyu/WizardVicuna-3B-0719': ModelType.IFT,
|
504 |
-
'meta-llama/Llama-2-7b-chat-hf': ModelType.RL,
|
505 |
-
'meta-llama/Llama-2-7b-hf': ModelType.PT,
|
506 |
-
'meta-llama/Llama-2-13b-chat-hf': ModelType.RL,
|
507 |
-
'meta-llama/Llama-2-13b-hf': ModelType.PT,
|
508 |
-
'meta-llama/Llama-2-70b-chat-hf': ModelType.RL,
|
509 |
-
'meta-llama/Llama-2-70b-hf': ModelType.PT,
|
510 |
-
'xhyi/PT_GPTNEO350_ATG': ModelType.FT,
|
511 |
-
'h2oai/h2ogpt-gm-oasst1-en-1024-20b': ModelType.FT,
|
512 |
-
'h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt': ModelType.FT,
|
513 |
-
'h2oai/h2ogpt-oig-oasst1-512-6_9b': ModelType.IFT,
|
514 |
-
'h2oai/h2ogpt-oasst1-512-12b': ModelType.IFT,
|
515 |
-
'h2oai/h2ogpt-oig-oasst1-256-6_9b': ModelType.IFT,
|
516 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt': ModelType.FT,
|
517 |
-
'h2oai/h2ogpt-oasst1-512-20b': ModelType.IFT,
|
518 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2': ModelType.FT,
|
519 |
-
'h2oai/h2ogpt-gm-oasst1-en-1024-12b': ModelType.FT,
|
520 |
-
'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b': ModelType.FT,
|
521 |
-
'bofenghuang/vigogne-13b-instruct': ModelType.IFT,
|
522 |
-
'bofenghuang/vigogne-13b-chat': ModelType.FT,
|
523 |
-
'bofenghuang/vigogne-2-7b-instruct': ModelType.IFT,
|
524 |
-
'bofenghuang/vigogne-7b-instruct': ModelType.IFT,
|
525 |
-
'bofenghuang/vigogne-7b-chat': ModelType.FT,
|
526 |
-
'Vmware/open-llama-7b-v2-open-instruct': ModelType.IFT,
|
527 |
-
'VMware/open-llama-0.7T-7B-open-instruct-v1.1': ModelType.IFT,
|
528 |
-
'ewof/koishi-instruct-3b': ModelType.IFT,
|
529 |
-
'gywy/llama2-13b-chinese-v1': ModelType.FT,
|
530 |
-
'GOAT-AI/GOAT-7B-Community': ModelType.FT,
|
531 |
-
'psyche/kollama2-7b': ModelType.FT,
|
532 |
-
'TheTravellingEngineer/llama2-7b-hf-guanaco': ModelType.FT,
|
533 |
-
'beaugogh/pythia-1.4b-deduped-sharegpt': ModelType.FT,
|
534 |
-
'augtoma/qCammel-70-x': ModelType.IFT,
|
535 |
-
'Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload': ModelType.IFT,
|
536 |
-
'anhnv125/pygmalion-6b-roleplay': ModelType.FT,
|
537 |
-
'64bits/LexPodLM-13B': ModelType.FT,
|
538 |
-
}
|
539 |
-
|
540 |
-
|
541 |
-
def model_type_from_str(type):
    """Map a free-form model-type label to a ``ModelType`` member.

    The label matches a type when it contains either that type's textual
    name or its emoji symbol. Markers are tried in a fixed order
    (fine-tuned, pretrained, RL-tuned, instruction-tuned) and the first hit
    wins. ``ModelType.Unknown`` is returned when nothing matches.

    Args:
        type: Label to classify; it is probed with the ``in`` operator.
            The name shadows the builtin but is kept because it is part of
            the function's signature.

    Returns:
        The matching ``ModelType`` member, or ``ModelType.Unknown``.
    """
    # Ordered (markers, result) pairs; the order mirrors the lookup priority.
    marker_table = (
        (("fine-tuned", "🔶"), ModelType.FT),
        (("pretrained", "🟢"), ModelType.PT),
        (("RL-tuned", "🟦"), ModelType.RL),
        (("instruction-tuned", "⭕"), ModelType.IFT),
    )
    for markers, resolved in marker_table:
        if any(marker in type for marker in markers):
            return resolved
    return ModelType.Unknown
|
551 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{auto_leaderboard → display_models}/get_model_metadata.py
RENAMED
@@ -1,17 +1,17 @@
|
|
1 |
-
import re
|
2 |
-
import os
|
3 |
import glob
|
4 |
import json
|
5 |
import os
|
|
|
6 |
from typing import List
|
|
|
|
|
|
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
-
from src.
|
10 |
-
from src.
|
11 |
-
from src.
|
12 |
|
13 |
-
from huggingface_hub import HfApi
|
14 |
-
import huggingface_hub
|
15 |
api = HfApi(token=os.environ.get("H4_TOKEN", None))
|
16 |
|
17 |
|
@@ -38,15 +38,18 @@ def get_model_license(model_info):
|
|
38 |
except Exception:
|
39 |
return None
|
40 |
|
|
|
41 |
def get_model_likes(model_info):
|
42 |
return model_info.likes
|
43 |
|
|
|
44 |
size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
45 |
|
|
|
46 |
def get_model_size(model_name, model_info):
|
47 |
# In billions
|
48 |
try:
|
49 |
-
return round(model_info.safetensors["total"] / 1e9, 3)
|
50 |
except AttributeError:
|
51 |
try:
|
52 |
size_match = re.search(size_pattern, model_name.lower())
|
@@ -58,7 +61,10 @@ def get_model_size(model_name, model_info):
|
|
58 |
|
59 |
def get_model_type(leaderboard_data: List[dict]):
|
60 |
for model_data in leaderboard_data:
|
61 |
-
request_files = os.path.join(
|
|
|
|
|
|
|
62 |
request_files = glob.glob(request_files)
|
63 |
|
64 |
# Select correct request file (precision)
|
@@ -70,9 +76,12 @@ def get_model_type(leaderboard_data: List[dict]):
|
|
70 |
for tmp_request_file in request_files:
|
71 |
with open(tmp_request_file, "r") as f:
|
72 |
req_content = json.load(f)
|
73 |
-
if
|
|
|
|
|
|
|
74 |
request_file = tmp_request_file
|
75 |
-
|
76 |
if request_file == "":
|
77 |
model_data[AutoEvalColumn.model_type.name] = ""
|
78 |
model_data[AutoEvalColumn.model_type_symbol.name] = ""
|
@@ -81,30 +90,41 @@ def get_model_type(leaderboard_data: List[dict]):
|
|
81 |
try:
|
82 |
with open(request_file, "r") as f:
|
83 |
request = json.load(f)
|
84 |
-
|
85 |
except Exception:
|
86 |
-
|
87 |
|
88 |
try:
|
89 |
with open(request_file, "r") as f:
|
90 |
request = json.load(f)
|
91 |
model_type = model_type_from_str(request["model_type"])
|
92 |
model_data[AutoEvalColumn.model_type.name] = model_type.value.name
|
93 |
-
model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol
|
94 |
except KeyError:
|
95 |
if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
|
96 |
-
model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
|
97 |
-
|
|
|
|
|
|
|
|
|
98 |
else:
|
99 |
model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
|
100 |
model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
|
101 |
|
102 |
-
|
|
|
103 |
for model_data in leaderboard_data:
|
104 |
if model_data["model_name_for_query"] in FLAGGED_MODELS:
|
105 |
issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
|
106 |
-
issue_link = model_hyperlink(
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
def remove_forbidden_models(leaderboard_data: List[dict]):
|
110 |
indices_to_remove = []
|
@@ -116,6 +136,7 @@ def remove_forbidden_models(leaderboard_data: List[dict]):
|
|
116 |
leaderboard_data.pop(ix)
|
117 |
return leaderboard_data
|
118 |
|
|
|
119 |
def apply_metadata(leaderboard_data: List[dict]):
|
120 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
121 |
get_model_type(leaderboard_data)
|
|
|
|
|
|
|
1 |
import glob
|
2 |
import json
|
3 |
import os
|
4 |
+
import re
|
5 |
from typing import List
|
6 |
+
|
7 |
+
import huggingface_hub
|
8 |
+
from huggingface_hub import HfApi
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
+
from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
|
12 |
+
from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
|
13 |
+
from src.display_models.utils import AutoEvalColumn, model_hyperlink
|
14 |
|
|
|
|
|
15 |
api = HfApi(token=os.environ.get("H4_TOKEN", None))
|
16 |
|
17 |
|
|
|
38 |
except Exception:
|
39 |
return None
|
40 |
|
41 |
+
|
42 |
def get_model_likes(model_info):
    """Return the ``likes`` attribute of *model_info*.

    Args:
        model_info: Any object exposing a ``likes`` attribute
            (presumably a Hub model-info object; confirm against callers).

    Returns:
        Whatever value ``model_info.likes`` holds.
    """
    like_count = model_info.likes
    return like_count
|
44 |
|
45 |
+
|
46 |
size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
47 |
|
48 |
+
|
49 |
def get_model_size(model_name, model_info):
|
50 |
# In billions
|
51 |
try:
|
52 |
+
return round(model_info.safetensors["total"] / 1e9, 3)
|
53 |
except AttributeError:
|
54 |
try:
|
55 |
size_match = re.search(size_pattern, model_name.lower())
|
|
|
61 |
|
62 |
def get_model_type(leaderboard_data: List[dict]):
|
63 |
for model_data in leaderboard_data:
|
64 |
+
request_files = os.path.join(
|
65 |
+
"eval-queue",
|
66 |
+
model_data["model_name_for_query"] + "_eval_request_*" + ".json",
|
67 |
+
)
|
68 |
request_files = glob.glob(request_files)
|
69 |
|
70 |
# Select correct request file (precision)
|
|
|
76 |
for tmp_request_file in request_files:
|
77 |
with open(tmp_request_file, "r") as f:
|
78 |
req_content = json.load(f)
|
79 |
+
if (
|
80 |
+
req_content["status"] == "FINISHED"
|
81 |
+
and req_content["precision"] == model_data["Precision"].split(".")[-1]
|
82 |
+
):
|
83 |
request_file = tmp_request_file
|
84 |
+
|
85 |
if request_file == "":
|
86 |
model_data[AutoEvalColumn.model_type.name] = ""
|
87 |
model_data[AutoEvalColumn.model_type_symbol.name] = ""
|
|
|
90 |
try:
|
91 |
with open(request_file, "r") as f:
|
92 |
request = json.load(f)
|
93 |
+
request["weight_type"] != "Original"
|
94 |
except Exception:
|
95 |
+
pass
|
96 |
|
97 |
try:
|
98 |
with open(request_file, "r") as f:
|
99 |
request = json.load(f)
|
100 |
model_type = model_type_from_str(request["model_type"])
|
101 |
model_data[AutoEvalColumn.model_type.name] = model_type.value.name
|
102 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("🔺" if is_delta else "")
|
103 |
except KeyError:
|
104 |
if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
|
105 |
+
model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
|
106 |
+
model_data["model_name_for_query"]
|
107 |
+
].value.name
|
108 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
|
109 |
+
model_data["model_name_for_query"]
|
110 |
+
].value.symbol # + ("🔺" if is_delta else "")
|
111 |
else:
|
112 |
model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
|
113 |
model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
|
114 |
|
115 |
+
|
116 |
+
def flag_models(leaderboard_data: List[dict]):
    """Annotate flagged models in the leaderboard rows, in place.

    For every row whose ``"model_name_for_query"`` is a key of
    ``FLAGGED_MODELS``, the model column is rewritten to carry a
    "has been flagged!" notice followed by a hyperlink built from the
    ``FLAGGED_MODELS`` entry. Rows for unflagged models are left untouched.

    Args:
        leaderboard_data: Row dictionaries; each must contain
            ``"model_name_for_query"``, and flagged rows must also contain
            the ``AutoEvalColumn.model.name`` key.

    Returns:
        None. The dictionaries inside *leaderboard_data* are mutated.
    """
    for row in leaderboard_data:
        query_name = row["model_name_for_query"]
        if query_name not in FLAGGED_MODELS:
            continue

        # The stored value is split on "/" and its last segment is used as
        # the issue number shown in the link text.
        flag_target = FLAGGED_MODELS[query_name]
        issue_num = flag_target.split("/")[-1]
        issue_link = model_hyperlink(flag_target, f"See discussion #{issue_num}")

        model_column = AutoEvalColumn.model.name
        row[model_column] = f"{row[model_column]} has been flagged! {issue_link}"
|
127 |
+
|
128 |
|
129 |
def remove_forbidden_models(leaderboard_data: List[dict]):
|
130 |
indices_to_remove = []
|
|
|
136 |
leaderboard_data.pop(ix)
|
137 |
return leaderboard_data
|
138 |
|
139 |
+
|
140 |
def apply_metadata(leaderboard_data: List[dict]):
|
141 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
142 |
get_model_type(leaderboard_data)
|
src/{auto_leaderboard → display_models}/model_metadata_flags.py
RENAMED
@@ -8,5 +8,5 @@ FLAGGED_MODELS = {
|
|
8 |
|
9 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
10 |
DO_NOT_SUBMIT_MODELS = [
|
11 |
-
"Voicelab/trurl-2-13b",
|
12 |
-
]
|
|
|
8 |
|
9 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
10 |
DO_NOT_SUBMIT_MODELS = [
|
11 |
+
"Voicelab/trurl-2-13b", # trained on MMLU
|
12 |
+
]
|
src/display_models/model_metadata_type.py
ADDED
@@ -0,0 +1,550 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
+
from typing import Dict
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass
class ModelInfo:
    """Display metadata attached to a model category."""

    # Human-readable label of the category.
    name: str
    # Emoji (or other short marker) shown next to the label.
    symbol: str
|
10 |
+
|
11 |
+
|
12 |
+
class ModelType(Enum):
    """Closed set of model categories, each carrying a ``ModelInfo`` value."""

    PT = ModelInfo(name="pretrained", symbol="🟢")
    FT = ModelInfo(name="fine-tuned", symbol="🔶")
    IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
    RL = ModelInfo(name="RL-tuned", symbol="🟦")
    # Fallback member; its label doubles as a hint to add the missing type.
    Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")

    def to_str(self, separator=" "):
        """Return the member's symbol and name joined by *separator*.

        Args:
            separator: Text placed between symbol and name; defaults to a
                single space.

        Returns:
            A string of the form ``<symbol><separator><name>``.
        """
        info = self.value
        return f"{info.symbol}{separator}{info.name}"
|
21 |
+
|
22 |
+
|
23 |
+
MODEL_TYPE_METADATA: Dict[str, ModelType] = {
|
24 |
+
"notstoic/PygmalionCoT-7b": ModelType.IFT,
|
25 |
+
"aisquared/dlite-v1-355m": ModelType.IFT,
|
26 |
+
"aisquared/dlite-v1-1_5b": ModelType.IFT,
|
27 |
+
"aisquared/dlite-v1-774m": ModelType.IFT,
|
28 |
+
"aisquared/dlite-v1-124m": ModelType.IFT,
|
29 |
+
"aisquared/chopt-2_7b": ModelType.IFT,
|
30 |
+
"aisquared/dlite-v2-124m": ModelType.IFT,
|
31 |
+
"aisquared/dlite-v2-774m": ModelType.IFT,
|
32 |
+
"aisquared/dlite-v2-1_5b": ModelType.IFT,
|
33 |
+
"aisquared/chopt-1_3b": ModelType.IFT,
|
34 |
+
"aisquared/dlite-v2-355m": ModelType.IFT,
|
35 |
+
"augtoma/qCammel-13": ModelType.IFT,
|
36 |
+
"Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
|
37 |
+
"Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
|
38 |
+
"TheBloke/alpaca-lora-65B-HF": ModelType.FT,
|
39 |
+
"TheBloke/tulu-7B-fp16": ModelType.IFT,
|
40 |
+
"TheBloke/guanaco-7B-HF": ModelType.FT,
|
41 |
+
"TheBloke/koala-7B-HF": ModelType.FT,
|
42 |
+
"TheBloke/wizardLM-7B-HF": ModelType.IFT,
|
43 |
+
"TheBloke/airoboros-13B-HF": ModelType.IFT,
|
44 |
+
"TheBloke/koala-13B-HF": ModelType.FT,
|
45 |
+
"TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
|
46 |
+
"TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
|
47 |
+
"TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
|
48 |
+
"TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
|
49 |
+
"TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
|
50 |
+
"TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
|
51 |
+
"TheBloke/UltraLM-13B-fp16": ModelType.IFT,
|
52 |
+
"TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
|
53 |
+
"TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
|
54 |
+
"TheBloke/guanaco-13B-HF": ModelType.FT,
|
55 |
+
"TheBloke/guanaco-65B-HF": ModelType.FT,
|
56 |
+
"TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
|
57 |
+
"TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
|
58 |
+
"TheBloke/Llama-2-13B-fp16": ModelType.PT,
|
59 |
+
"TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
|
60 |
+
"TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
|
61 |
+
"TheBloke/Planner-7B-fp16": ModelType.IFT,
|
62 |
+
"TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
|
63 |
+
"TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
|
64 |
+
"TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
|
65 |
+
"TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
|
66 |
+
"TheBloke/tulu-13B-fp16": ModelType.IFT,
|
67 |
+
"TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
|
68 |
+
"TheBloke/Llama-2-70B-fp16": ModelType.IFT,
|
69 |
+
"TheBloke/WizardLM-30B-fp16": ModelType.IFT,
|
70 |
+
"TheBloke/robin-13B-v2-fp16": ModelType.FT,
|
71 |
+
"TheBloke/robin-33B-v2-fp16": ModelType.FT,
|
72 |
+
"TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
|
73 |
+
"TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
|
74 |
+
"TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
|
75 |
+
"TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
|
76 |
+
"TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
|
77 |
+
"TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
|
78 |
+
"TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
|
79 |
+
"jphme/orca_mini_v2_ger_7b": ModelType.IFT,
|
80 |
+
"Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
|
81 |
+
"kevinpro/Vicuna-13B-CoT": ModelType.IFT,
|
82 |
+
"AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
|
83 |
+
"AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
|
84 |
+
"concedo/Vicuzard-30B-Uncensored": ModelType.FT,
|
85 |
+
"concedo/OPT-19M-ChatSalad": ModelType.FT,
|
86 |
+
"concedo/Pythia-70M-ChatSalad": ModelType.FT,
|
87 |
+
"digitous/13B-HyperMantis": ModelType.IFT,
|
88 |
+
"digitous/Adventien-GPTJ": ModelType.FT,
|
89 |
+
"digitous/Alpacino13b": ModelType.IFT,
|
90 |
+
"digitous/GPT-R": ModelType.IFT,
|
91 |
+
"digitous/Javelin-R": ModelType.IFT,
|
92 |
+
"digitous/Javalion-GPTJ": ModelType.IFT,
|
93 |
+
"digitous/Javalion-R": ModelType.IFT,
|
94 |
+
"digitous/Skegma-GPTJ": ModelType.FT,
|
95 |
+
"digitous/Alpacino30b": ModelType.IFT,
|
96 |
+
"digitous/Janin-GPTJ": ModelType.FT,
|
97 |
+
"digitous/Janin-R": ModelType.FT,
|
98 |
+
"digitous/Javelin-GPTJ": ModelType.FT,
|
99 |
+
"SaylorTwift/gpt2_test": ModelType.PT,
|
100 |
+
"anton-l/gpt-j-tiny-random": ModelType.FT,
|
101 |
+
"Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
|
102 |
+
"Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
|
103 |
+
"Lazycuber/Janemalion-6B": ModelType.FT,
|
104 |
+
"IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
|
105 |
+
"IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
|
106 |
+
"dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
|
107 |
+
"gpt2-medium": ModelType.PT,
|
108 |
+
"camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
|
109 |
+
"camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
|
110 |
+
"camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
|
111 |
+
"PygmalionAI/pygmalion-6b": ModelType.FT,
|
112 |
+
"PygmalionAI/metharme-1.3b": ModelType.IFT,
|
113 |
+
"PygmalionAI/pygmalion-1.3b": ModelType.FT,
|
114 |
+
"PygmalionAI/pygmalion-350m": ModelType.FT,
|
115 |
+
"PygmalionAI/pygmalion-2.7b": ModelType.FT,
|
116 |
+
"medalpaca/medalpaca-7b": ModelType.FT,
|
117 |
+
"lilloukas/Platypus-30B": ModelType.IFT,
|
118 |
+
"lilloukas/GPlatty-30B": ModelType.FT,
|
119 |
+
"mncai/chatdoctor": ModelType.FT,
|
120 |
+
"chaoyi-wu/MedLLaMA_13B": ModelType.FT,
|
121 |
+
"LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
|
122 |
+
"LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
|
123 |
+
"hakurei/instruct-12b": ModelType.IFT,
|
124 |
+
"hakurei/lotus-12B": ModelType.FT,
|
125 |
+
"shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
|
126 |
+
"shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
|
127 |
+
"shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
|
128 |
+
"mosaicml/mpt-7b-instruct": ModelType.IFT,
|
129 |
+
"mosaicml/mpt-30b-chat": ModelType.IFT,
|
130 |
+
"mosaicml/mpt-7b-storywriter": ModelType.FT,
|
131 |
+
"mosaicml/mpt-30b-instruct": ModelType.IFT,
|
132 |
+
"mosaicml/mpt-7b-chat": ModelType.IFT,
|
133 |
+
"mosaicml/mpt-30b": ModelType.PT,
|
134 |
+
"Corianas/111m": ModelType.IFT,
|
135 |
+
"Corianas/Quokka_1.3b": ModelType.IFT,
|
136 |
+
"Corianas/256_5epoch": ModelType.FT,
|
137 |
+
"Corianas/Quokka_256m": ModelType.IFT,
|
138 |
+
"Corianas/Quokka_590m": ModelType.IFT,
|
139 |
+
"Corianas/gpt-j-6B-Dolly": ModelType.FT,
|
140 |
+
"Corianas/Quokka_2.7b": ModelType.IFT,
|
141 |
+
"cyberagent/open-calm-7b": ModelType.FT,
|
142 |
+
"Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
|
143 |
+
"THUDM/chatglm2-6b": ModelType.IFT,
|
144 |
+
"MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
|
145 |
+
"NYTK/PULI-GPTrio": ModelType.PT,
|
146 |
+
"EleutherAI/pythia-1.3b": ModelType.PT,
|
147 |
+
"EleutherAI/pythia-2.8b-deduped": ModelType.PT,
|
148 |
+
"EleutherAI/gpt-neo-125m": ModelType.PT,
|
149 |
+
"EleutherAI/pythia-160m": ModelType.PT,
|
150 |
+
"EleutherAI/gpt-neo-2.7B": ModelType.PT,
|
151 |
+
"EleutherAI/pythia-1b-deduped": ModelType.PT,
|
152 |
+
"EleutherAI/pythia-6.7b": ModelType.PT,
|
153 |
+
"EleutherAI/pythia-70m-deduped": ModelType.PT,
|
154 |
+
"EleutherAI/gpt-neox-20b": ModelType.PT,
|
155 |
+
"EleutherAI/pythia-1.4b-deduped": ModelType.PT,
|
156 |
+
"EleutherAI/pythia-2.7b": ModelType.PT,
|
157 |
+
"EleutherAI/pythia-6.9b-deduped": ModelType.PT,
|
158 |
+
"EleutherAI/pythia-70m": ModelType.PT,
|
159 |
+
"EleutherAI/gpt-j-6b": ModelType.PT,
|
160 |
+
"EleutherAI/pythia-12b-deduped": ModelType.PT,
|
161 |
+
"EleutherAI/gpt-neo-1.3B": ModelType.PT,
|
162 |
+
"EleutherAI/pythia-410m-deduped": ModelType.PT,
|
163 |
+
"EleutherAI/pythia-160m-deduped": ModelType.PT,
|
164 |
+
"EleutherAI/polyglot-ko-12.8b": ModelType.PT,
|
165 |
+
"EleutherAI/pythia-12b": ModelType.PT,
|
166 |
+
"roneneldan/TinyStories-33M": ModelType.PT,
|
167 |
+
"roneneldan/TinyStories-28M": ModelType.PT,
|
168 |
+
"roneneldan/TinyStories-1M": ModelType.PT,
|
169 |
+
"roneneldan/TinyStories-8M": ModelType.PT,
|
170 |
+
"roneneldan/TinyStories-3M": ModelType.PT,
|
171 |
+
"jerryjalapeno/nart-100k-7b": ModelType.FT,
|
172 |
+
"lmsys/vicuna-13b-v1.3": ModelType.IFT,
|
173 |
+
"lmsys/vicuna-7b-v1.3": ModelType.IFT,
|
174 |
+
"lmsys/vicuna-13b-v1.1": ModelType.IFT,
|
175 |
+
"lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
|
176 |
+
"lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
|
177 |
+
"abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
|
178 |
+
"haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
|
179 |
+
"Gryphe/MythoLogic-13b": ModelType.IFT,
|
180 |
+
"Gryphe/MythoBoros-13b": ModelType.IFT,
|
181 |
+
"pillowtalks-ai/delta13b": ModelType.FT,
|
182 |
+
"wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
|
183 |
+
"bigscience/bloom-7b1": ModelType.PT,
|
184 |
+
"bigcode/tiny_starcoder_py": ModelType.PT,
|
185 |
+
"bigcode/starcoderplus": ModelType.FT,
|
186 |
+
"bigcode/gpt_bigcode-santacoder": ModelType.PT,
|
187 |
+
"bigcode/starcoder": ModelType.PT,
|
188 |
+
"Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
|
189 |
+
"microsoft/DialoGPT-large": ModelType.FT,
|
190 |
+
"microsoft/DialoGPT-small": ModelType.FT,
|
191 |
+
"microsoft/DialoGPT-medium": ModelType.FT,
|
192 |
+
"microsoft/CodeGPT-small-py": ModelType.FT,
|
193 |
+
"Tincando/fiction_story_generator": ModelType.FT,
|
194 |
+
"Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
|
195 |
+
"Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
|
196 |
+
"Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
|
197 |
+
"Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
|
198 |
+
"Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
|
199 |
+
"Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
|
200 |
+
"illuin/test-custom-llama": ModelType.FT,
|
201 |
+
"dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
|
202 |
+
"dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
|
203 |
+
"dvruette/llama-13b-pretrained-dropout": ModelType.PT,
|
204 |
+
"dvruette/llama-13b-pretrained": ModelType.PT,
|
205 |
+
"dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
|
206 |
+
"dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
|
207 |
+
"dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
|
208 |
+
"dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
|
209 |
+
"dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
|
210 |
+
"dvruette/gpt-neox-20b-full-precision": ModelType.FT,
|
211 |
+
"dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
|
212 |
+
"openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
|
213 |
+
"openlm-research/open_llama_7b": ModelType.PT,
|
214 |
+
"openlm-research/open_llama_7b_v2": ModelType.PT,
|
215 |
+
"openlm-research/open_llama_3b": ModelType.PT,
|
216 |
+
"openlm-research/open_llama_13b": ModelType.PT,
|
217 |
+
"openlm-research/open_llama_3b_v2": ModelType.PT,
|
218 |
+
"PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
|
219 |
+
"GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
|
220 |
+
"GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
|
221 |
+
"databricks/dolly-v2-7b": ModelType.IFT,
|
222 |
+
"databricks/dolly-v2-3b": ModelType.IFT,
|
223 |
+
"databricks/dolly-v2-12b": ModelType.IFT,
|
224 |
+
"Rachneet/gpt2-xl-alpaca": ModelType.FT,
|
225 |
+
"Locutusque/gpt2-conversational-or-qa": ModelType.FT,
|
226 |
+
"psyche/kogpt": ModelType.FT,
|
227 |
+
"NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
|
228 |
+
"Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
|
229 |
+
"Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
|
230 |
+
"Fredithefish/CrimsonPajama": ModelType.IFT,
|
231 |
+
"Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
|
232 |
+
"Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
|
233 |
+
"Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
|
234 |
+
"acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
|
235 |
+
"eachadea/vicuna-13b-1.1": ModelType.FT,
|
236 |
+
"eachadea/vicuna-7b-1.1": ModelType.FT,
|
237 |
+
"eachadea/vicuna-13b": ModelType.FT,
|
238 |
+
"openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
|
239 |
+
"openaccess-ai-collective/manticore-13b": ModelType.IFT,
|
240 |
+
"openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
|
241 |
+
"openaccess-ai-collective/minotaur-13b": ModelType.IFT,
|
242 |
+
"openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
|
243 |
+
"openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
|
244 |
+
"openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
|
245 |
+
"pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
|
246 |
+
"pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
|
247 |
+
"euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
|
248 |
+
"stabilityai/StableBeluga1-Delta": ModelType.IFT,
|
249 |
+
"stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
|
250 |
+
"stabilityai/StableBeluga2": ModelType.IFT,
|
251 |
+
"stabilityai/StableBeluga-13B": ModelType.IFT,
|
252 |
+
"stabilityai/StableBeluga-7B": ModelType.IFT,
|
253 |
+
"stabilityai/stablelm-base-alpha-7b": ModelType.PT,
|
254 |
+
"stabilityai/stablelm-base-alpha-3b": ModelType.PT,
|
255 |
+
"stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
|
256 |
+
"alibidaran/medical_transcription_generator": ModelType.FT,
|
257 |
+
"CalderaAI/30B-Lazarus": ModelType.IFT,
|
258 |
+
"CalderaAI/13B-BlueMethod": ModelType.IFT,
|
259 |
+
"CalderaAI/13B-Ouroboros": ModelType.IFT,
|
260 |
+
"KoboldAI/OPT-13B-Erebus": ModelType.FT,
|
261 |
+
"KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
|
262 |
+
"KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
|
263 |
+
"KoboldAI/fairseq-dense-2.7B": ModelType.PT,
|
264 |
+
"KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
|
265 |
+
"KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
|
266 |
+
"KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
|
267 |
+
"KoboldAI/fairseq-dense-6.7B": ModelType.PT,
|
268 |
+
"KoboldAI/fairseq-dense-125M": ModelType.PT,
|
269 |
+
"KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
|
270 |
+
"KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
|
271 |
+
"KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
|
272 |
+
"KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
|
273 |
+
"KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
|
274 |
+
"KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
|
275 |
+
"KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
|
276 |
+
"KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
|
277 |
+
"KoboldAI/fairseq-dense-355M": ModelType.PT,
|
278 |
+
"KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
|
279 |
+
"KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
|
280 |
+
"KoboldAI/OPT-350M-Erebus": ModelType.FT,
|
281 |
+
"KoboldAI/GPT-J-6B-Skein": ModelType.FT,
|
282 |
+
"KoboldAI/OPT-30B-Erebus": ModelType.FT,
|
283 |
+
"klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
|
284 |
+
"klosax/open_llama_3b_350bt_preview": ModelType.PT,
|
285 |
+
"klosax/openllama-3b-350bt": ModelType.PT,
|
286 |
+
"klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
|
287 |
+
"klosax/open_llama_13b_600bt_preview": ModelType.PT,
|
288 |
+
"klosax/open_llama_7b_400bt_preview": ModelType.PT,
|
289 |
+
"kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
|
290 |
+
"WeOpenML/Alpaca-7B-v1": ModelType.IFT,
|
291 |
+
"WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
|
292 |
+
"TFLai/gpt2-turkish-uncased": ModelType.FT,
|
293 |
+
"ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
|
294 |
+
"ehartford/dolphin-llama-13b": ModelType.IFT,
|
295 |
+
"ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
|
296 |
+
"ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
|
297 |
+
"ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
|
298 |
+
"ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
|
299 |
+
"ehartford/based-30b": ModelType.FT,
|
300 |
+
"ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
|
301 |
+
"wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
|
302 |
+
"wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
|
303 |
+
"OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
|
304 |
+
"OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
|
305 |
+
"OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
|
306 |
+
"OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
|
307 |
+
"OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
|
308 |
+
"OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
|
309 |
+
"OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
|
310 |
+
"junelee/wizard-vicuna-13b": ModelType.FT,
|
311 |
+
"BreadAi/gpt-YA-1-1_160M": ModelType.PT,
|
312 |
+
"BreadAi/MuseCan": ModelType.PT,
|
313 |
+
"BreadAi/MusePy-1-2": ModelType.PT,
|
314 |
+
"BreadAi/DiscordPy": ModelType.PT,
|
315 |
+
"BreadAi/PM_modelV2": ModelType.PT,
|
316 |
+
"BreadAi/gpt-Youtube": ModelType.PT,
|
317 |
+
"BreadAi/StoryPy": ModelType.FT,
|
318 |
+
"julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
|
319 |
+
"AGI-inc/lora_moe_7b_baseline": ModelType.FT,
|
320 |
+
"AGI-inc/lora_moe_7b": ModelType.FT,
|
321 |
+
"togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
|
322 |
+
"togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
|
323 |
+
"togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
|
324 |
+
"togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
|
325 |
+
"togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
|
326 |
+
"togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
|
327 |
+
"togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
|
328 |
+
"togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
|
329 |
+
"togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
|
330 |
+
"togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
|
331 |
+
"togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
|
332 |
+
"togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
|
333 |
+
"togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
|
334 |
+
"Writer/camel-5b-hf": ModelType.IFT,
|
335 |
+
"Writer/palmyra-base": ModelType.PT,
|
336 |
+
"MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
|
337 |
+
"MBZUAI/lamini-cerebras-111m": ModelType.IFT,
|
338 |
+
"MBZUAI/lamini-neo-1.3b": ModelType.IFT,
|
339 |
+
"MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
|
340 |
+
"MBZUAI/lamini-cerebras-256m": ModelType.IFT,
|
341 |
+
"MBZUAI/LaMini-GPT-124M": ModelType.IFT,
|
342 |
+
"MBZUAI/lamini-neo-125m": ModelType.IFT,
|
343 |
+
"TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
|
344 |
+
"TehVenom/PPO_Shygmalion-6b": ModelType.FT,
|
345 |
+
"TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
|
346 |
+
"TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
|
347 |
+
"TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
|
348 |
+
"TehVenom/Dolly_Malion-6b": ModelType.FT,
|
349 |
+
"TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
|
350 |
+
"TehVenom/ChanMalion": ModelType.FT,
|
351 |
+
"TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
|
352 |
+
"TehVenom/Pygmalion-13b-Merged": ModelType.FT,
|
353 |
+
"TehVenom/Metharme-13b-Merged": ModelType.IFT,
|
354 |
+
"TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
|
355 |
+
"TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
|
356 |
+
"georgesung/llama2_7b_chat_uncensored": ModelType.FT,
|
357 |
+
"vicgalle/gpt2-alpaca": ModelType.IFT,
|
358 |
+
"vicgalle/alpaca-7b": ModelType.FT,
|
359 |
+
"vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
|
360 |
+
"facebook/opt-350m": ModelType.PT,
|
361 |
+
"facebook/opt-125m": ModelType.PT,
|
362 |
+
"facebook/xglm-4.5B": ModelType.PT,
|
363 |
+
"facebook/opt-2.7b": ModelType.PT,
|
364 |
+
"facebook/opt-6.7b": ModelType.PT,
|
365 |
+
"facebook/galactica-30b": ModelType.PT,
|
366 |
+
"facebook/opt-13b": ModelType.PT,
|
367 |
+
"facebook/opt-66b": ModelType.PT,
|
368 |
+
"facebook/xglm-7.5B": ModelType.PT,
|
369 |
+
"facebook/xglm-564M": ModelType.PT,
|
370 |
+
"facebook/opt-30b": ModelType.PT,
|
371 |
+
"golaxy/gogpt-7b": ModelType.FT,
|
372 |
+
"golaxy/gogpt2-7b": ModelType.FT,
|
373 |
+
"golaxy/gogpt-7b-bloom": ModelType.FT,
|
374 |
+
"golaxy/gogpt-3b-bloom": ModelType.FT,
|
375 |
+
"psmathur/orca_mini_v2_7b": ModelType.IFT,
|
376 |
+
"psmathur/orca_mini_7b": ModelType.IFT,
|
377 |
+
"psmathur/orca_mini_3b": ModelType.IFT,
|
378 |
+
"psmathur/orca_mini_v2_13b": ModelType.IFT,
|
379 |
+
"gpt2-xl": ModelType.PT,
|
380 |
+
"lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
|
381 |
+
"Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
|
382 |
+
"Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
|
383 |
+
"Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
|
384 |
+
"Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
|
385 |
+
"jzjiao/opt-1.3b-rlhf": ModelType.FT,
|
386 |
+
"HuggingFaceH4/starchat-beta": ModelType.IFT,
|
387 |
+
"KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
|
388 |
+
"KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
|
389 |
+
"KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
|
390 |
+
"openchat/openchat_8192": ModelType.IFT,
|
391 |
+
"openchat/openchat_v2": ModelType.IFT,
|
392 |
+
"openchat/openchat_v2_w": ModelType.IFT,
|
393 |
+
"ausboss/llama-13b-supercot": ModelType.IFT,
|
394 |
+
"ausboss/llama-30b-supercot": ModelType.IFT,
|
395 |
+
"Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
|
396 |
+
"Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
|
397 |
+
"SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
|
398 |
+
"victor123/WizardLM-13B-1.0": ModelType.IFT,
|
399 |
+
"OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
|
400 |
+
"OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
|
401 |
+
"OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
|
402 |
+
"baichuan-inc/Baichuan-7B": ModelType.PT,
|
403 |
+
"tiiuae/falcon-40b-instruct": ModelType.IFT,
|
404 |
+
"tiiuae/falcon-40b": ModelType.PT,
|
405 |
+
"tiiuae/falcon-7b": ModelType.PT,
|
406 |
+
"YeungNLP/firefly-llama-13b": ModelType.FT,
|
407 |
+
"YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
|
408 |
+
"YeungNLP/firefly-llama2-13b": ModelType.FT,
|
409 |
+
"YeungNLP/firefly-ziya-13b": ModelType.FT,
|
410 |
+
"shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
|
411 |
+
"xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
|
412 |
+
"xzuyn/MedicWizard-7B": ModelType.FT,
|
413 |
+
"xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
|
414 |
+
"beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
|
415 |
+
"beomi/llama-2-ko-7b": ModelType.IFT,
|
416 |
+
"Salesforce/codegen-6B-multi": ModelType.PT,
|
417 |
+
"Salesforce/codegen-16B-nl": ModelType.PT,
|
418 |
+
"Salesforce/codegen-6B-nl": ModelType.PT,
|
419 |
+
"ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
|
420 |
+
"gpt2-large": ModelType.PT,
|
421 |
+
"frank098/orca_mini_3b_juniper": ModelType.FT,
|
422 |
+
"frank098/WizardLM_13B_juniper": ModelType.FT,
|
423 |
+
"FPHam/Free_Sydney_13b_HF": ModelType.FT,
|
424 |
+
"huggingface/llama-13b": ModelType.PT,
|
425 |
+
"huggingface/llama-7b": ModelType.PT,
|
426 |
+
"huggingface/llama-65b": ModelType.PT,
|
427 |
+
"huggingface/llama-30b": ModelType.PT,
|
428 |
+
"Henk717/chronoboros-33B": ModelType.IFT,
|
429 |
+
"jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
|
430 |
+
"jondurbin/airoboros-7b": ModelType.IFT,
|
431 |
+
"jondurbin/airoboros-7b-gpt4": ModelType.IFT,
|
432 |
+
"jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
|
433 |
+
"jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
|
434 |
+
"jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
|
435 |
+
"jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
|
436 |
+
"jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
|
437 |
+
"jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
|
438 |
+
"jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
|
439 |
+
"jondurbin/airoboros-13b": ModelType.IFT,
|
440 |
+
"jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
|
441 |
+
"jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
|
442 |
+
"jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
|
443 |
+
"ariellee/SuperPlatty-30B": ModelType.IFT,
|
444 |
+
"danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
|
445 |
+
"cerebras/Cerebras-GPT-256M": ModelType.PT,
|
446 |
+
"cerebras/Cerebras-GPT-1.3B": ModelType.PT,
|
447 |
+
"cerebras/Cerebras-GPT-13B": ModelType.PT,
|
448 |
+
"cerebras/Cerebras-GPT-2.7B": ModelType.PT,
|
449 |
+
"cerebras/Cerebras-GPT-111M": ModelType.PT,
|
450 |
+
"cerebras/Cerebras-GPT-6.7B": ModelType.PT,
|
451 |
+
"Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
|
452 |
+
"Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
|
453 |
+
"NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
|
454 |
+
"NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
|
455 |
+
"NousResearch/Redmond-Puffin-13B": ModelType.IFT,
|
456 |
+
"NousResearch/Nous-Hermes-13b": ModelType.IFT,
|
457 |
+
"project-baize/baize-v2-7b": ModelType.IFT,
|
458 |
+
"project-baize/baize-v2-13b": ModelType.IFT,
|
459 |
+
"LLMs/WizardLM-13B-V1.0": ModelType.FT,
|
460 |
+
"LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
|
461 |
+
"wenge-research/yayi-7b": ModelType.FT,
|
462 |
+
"wenge-research/yayi-7b-llama2": ModelType.FT,
|
463 |
+
"wenge-research/yayi-13b-llama2": ModelType.FT,
|
464 |
+
"yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
|
465 |
+
"llama-anon/instruct-13b": ModelType.IFT,
|
466 |
+
"huggingtweets/jerma985": ModelType.FT,
|
467 |
+
"huggingtweets/gladosystem": ModelType.FT,
|
468 |
+
"huggingtweets/bladeecity-jerma985": ModelType.FT,
|
469 |
+
"huggyllama/llama-13b": ModelType.PT,
|
470 |
+
"huggyllama/llama-65b": ModelType.PT,
|
471 |
+
"FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
|
472 |
+
"upstage/Llama-2-70b-instruct": ModelType.IFT,
|
473 |
+
"upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
|
474 |
+
"upstage/llama-65b-instruct": ModelType.IFT,
|
475 |
+
"upstage/llama-30b-instruct-2048": ModelType.IFT,
|
476 |
+
"upstage/llama-30b-instruct": ModelType.IFT,
|
477 |
+
"WizardLM/WizardLM-13B-1.0": ModelType.IFT,
|
478 |
+
"WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
|
479 |
+
"WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
|
480 |
+
"WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
|
481 |
+
"WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
|
482 |
+
"gpt2": ModelType.PT,
|
483 |
+
"keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
|
484 |
+
"nthngdy/pythia-owt2-70m-100k": ModelType.FT,
|
485 |
+
"nthngdy/pythia-owt2-70m-50k": ModelType.FT,
|
486 |
+
"quantumaikr/KoreanLM-hf": ModelType.FT,
|
487 |
+
"quantumaikr/open_llama_7b_hf": ModelType.FT,
|
488 |
+
"quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
|
489 |
+
"MayaPH/FinOPT-Lincoln": ModelType.FT,
|
490 |
+
"MayaPH/FinOPT-Franklin": ModelType.FT,
|
491 |
+
"MayaPH/GodziLLa-30B": ModelType.IFT,
|
492 |
+
"MayaPH/GodziLLa-30B-plus": ModelType.IFT,
|
493 |
+
"MayaPH/FinOPT-Washington": ModelType.FT,
|
494 |
+
"ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
|
495 |
+
"layoric/llama-2-13b-code-alpaca": ModelType.FT,
|
496 |
+
"CobraMamba/mamba-gpt-3b": ModelType.FT,
|
497 |
+
"CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
|
498 |
+
"CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
|
499 |
+
"timdettmers/guanaco-33b-merged": ModelType.FT,
|
500 |
+
"elinas/chronos-33b": ModelType.IFT,
|
501 |
+
"heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
|
502 |
+
"heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
|
503 |
+
"heegyu/WizardVicuna-3B-0719": ModelType.IFT,
|
504 |
+
"meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
|
505 |
+
"meta-llama/Llama-2-7b-hf": ModelType.PT,
|
506 |
+
"meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
|
507 |
+
"meta-llama/Llama-2-13b-hf": ModelType.PT,
|
508 |
+
"meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
|
509 |
+
"meta-llama/Llama-2-70b-hf": ModelType.PT,
|
510 |
+
"xhyi/PT_GPTNEO350_ATG": ModelType.FT,
|
511 |
+
"h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
|
512 |
+
"h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
|
513 |
+
"h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
|
514 |
+
"h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
|
515 |
+
"h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
|
516 |
+
"h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
|
517 |
+
"h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
|
518 |
+
"h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
|
519 |
+
"h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
|
520 |
+
"h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
|
521 |
+
"bofenghuang/vigogne-13b-instruct": ModelType.IFT,
|
522 |
+
"bofenghuang/vigogne-13b-chat": ModelType.FT,
|
523 |
+
"bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
|
524 |
+
"bofenghuang/vigogne-7b-instruct": ModelType.IFT,
|
525 |
+
"bofenghuang/vigogne-7b-chat": ModelType.FT,
|
526 |
+
"Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
|
527 |
+
"VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
|
528 |
+
"ewof/koishi-instruct-3b": ModelType.IFT,
|
529 |
+
"gywy/llama2-13b-chinese-v1": ModelType.FT,
|
530 |
+
"GOAT-AI/GOAT-7B-Community": ModelType.FT,
|
531 |
+
"psyche/kollama2-7b": ModelType.FT,
|
532 |
+
"TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
|
533 |
+
"beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
|
534 |
+
"augtoma/qCammel-70-x": ModelType.IFT,
|
535 |
+
"Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
|
536 |
+
"anhnv125/pygmalion-6b-roleplay": ModelType.FT,
|
537 |
+
"64bits/LexPodLM-13B": ModelType.FT,
|
538 |
+
}
|
539 |
+
|
540 |
+
|
541 |
+
def model_type_from_str(type):
    """Map a free-form type description to a ``ModelType`` member.

    The description matches a category when it contains either that
    category's name or its emoji symbol.

    Args:
        type: Text to inspect (anything supporting the ``in`` operator with
            string operands).

    Returns:
        The first matching ``ModelType`` member, or ``ModelType.Unknown``
        when no marker is found.
    """
    # Rows are tried top to bottom and the first hit wins, so the order here
    # defines the priority when a description contains several markers.
    markers_by_priority = (
        (("fine-tuned", "🔶"), ModelType.FT),
        (("pretrained", "🟢"), ModelType.PT),
        (("RL-tuned", "🟦"), ModelType.RL),
        (("instruction-tuned", "⭕"), ModelType.IFT),
    )
    for markers, model_type in markers_by_priority:
        if any(marker in type for marker in markers):
            return model_type
    return ModelType.Unknown
|
src/{auto_leaderboard/load_results.py → display_models/read_results.py}
RENAMED
@@ -1,14 +1,13 @@
|
|
1 |
-
from dataclasses import dataclass
|
2 |
-
|
3 |
-
import glob
|
4 |
import json
|
5 |
import os
|
|
|
6 |
from typing import Dict, List, Tuple
|
7 |
-
import dateutil
|
8 |
|
9 |
-
|
10 |
import numpy as np
|
11 |
|
|
|
|
|
12 |
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
|
13 |
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
|
14 |
BENCH_TO_NAME = {
|
@@ -31,13 +30,15 @@ class EvalResult:
|
|
31 |
weight_type: str = ""
|
32 |
|
33 |
def to_dict(self):
|
|
|
|
|
34 |
if self.org is not None:
|
35 |
base_model = f"{self.org}/{self.model}"
|
36 |
else:
|
37 |
base_model = f"{self.model}"
|
38 |
data_dict = {}
|
39 |
|
40 |
-
data_dict["eval_name"] = self.eval_name
|
41 |
data_dict["weight_type"] = self.weight_type # not a column, just a save name
|
42 |
data_dict[AutoEvalColumn.precision.name] = self.precision
|
43 |
data_dict[AutoEvalColumn.model_type.name] = self.model_type
|
@@ -45,6 +46,9 @@ class EvalResult:
|
|
45 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
46 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
47 |
data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
|
|
|
|
|
|
|
48 |
|
49 |
for benchmark in BENCHMARKS:
|
50 |
if benchmark not in self.results.keys():
|
@@ -60,10 +64,9 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
60 |
with open(json_filepath) as fp:
|
61 |
data = json.load(fp)
|
62 |
|
63 |
-
|
64 |
for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
|
65 |
if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
|
66 |
-
return None, []
|
67 |
|
68 |
try:
|
69 |
config = data["config"]
|
@@ -87,17 +90,24 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
87 |
else:
|
88 |
org = model_split[0]
|
89 |
model = model_split[1]
|
90 |
-
result_key =
|
91 |
|
92 |
eval_results = []
|
93 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
94 |
-
accs = np.array([v.get(metric,
|
95 |
-
if accs.size == 0:
|
96 |
continue
|
97 |
mean_acc = np.mean(accs) * 100.0
|
98 |
-
eval_results.append(
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
return result_key, eval_results
|
103 |
|
@@ -113,11 +123,11 @@ def get_eval_results() -> List[EvalResult]:
|
|
113 |
# Sort the files by date
|
114 |
# store results by precision maybe?
|
115 |
try:
|
116 |
-
files.sort(key=lambda x:
|
117 |
except dateutil.parser._parser.ParserError:
|
118 |
files = [files[-1]]
|
119 |
|
120 |
-
#up_to_date = files[-1]
|
121 |
for file in files:
|
122 |
json_filepaths.append(os.path.join(root, file))
|
123 |
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
from dataclasses import dataclass
|
4 |
from typing import Dict, List, Tuple
|
|
|
5 |
|
6 |
+
import dateutil
|
7 |
import numpy as np
|
8 |
|
9 |
+
from src.display_models.utils import AutoEvalColumn, make_clickable_model
|
10 |
+
|
11 |
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
|
12 |
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
|
13 |
BENCH_TO_NAME = {
|
|
|
30 |
weight_type: str = ""
|
31 |
|
32 |
def to_dict(self):
|
33 |
+
from src.load_from_hub import is_model_on_hub
|
34 |
+
|
35 |
if self.org is not None:
|
36 |
base_model = f"{self.org}/{self.model}"
|
37 |
else:
|
38 |
base_model = f"{self.model}"
|
39 |
data_dict = {}
|
40 |
|
41 |
+
data_dict["eval_name"] = self.eval_name # not a column, just a save name
|
42 |
data_dict["weight_type"] = self.weight_type # not a column, just a save name
|
43 |
data_dict[AutoEvalColumn.precision.name] = self.precision
|
44 |
data_dict[AutoEvalColumn.model_type.name] = self.model_type
|
|
|
46 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
47 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
48 |
data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
|
49 |
+
data_dict[AutoEvalColumn.still_on_hub.name] = (
|
50 |
+
is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
|
51 |
+
)
|
52 |
|
53 |
for benchmark in BENCHMARKS:
|
54 |
if benchmark not in self.results.keys():
|
|
|
64 |
with open(json_filepath) as fp:
|
65 |
data = json.load(fp)
|
66 |
|
|
|
67 |
for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
|
68 |
if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
|
69 |
+
return None, [] # we skip models with the wrong version
|
70 |
|
71 |
try:
|
72 |
config = data["config"]
|
|
|
90 |
else:
|
91 |
org = model_split[0]
|
92 |
model = model_split[1]
|
93 |
+
result_key = f"{org}_{model}_{model_sha}_{precision}"
|
94 |
|
95 |
eval_results = []
|
96 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
97 |
+
accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
|
98 |
+
if accs.size == 0 or any([acc is None for acc in accs]):
|
99 |
continue
|
100 |
mean_acc = np.mean(accs) * 100.0
|
101 |
+
eval_results.append(
|
102 |
+
EvalResult(
|
103 |
+
eval_name=result_key,
|
104 |
+
org=org,
|
105 |
+
model=model,
|
106 |
+
revision=model_sha,
|
107 |
+
results={benchmark: mean_acc},
|
108 |
+
precision=precision, # todo model_type=, weight_type=
|
109 |
+
)
|
110 |
+
)
|
111 |
|
112 |
return result_key, eval_results
|
113 |
|
|
|
123 |
# Sort the files by date
|
124 |
# store results by precision maybe?
|
125 |
try:
|
126 |
+
files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
|
127 |
except dateutil.parser._parser.ParserError:
|
128 |
files = [files[-1]]
|
129 |
|
130 |
+
# up_to_date = files[-1]
|
131 |
for file in files:
|
132 |
json_filepaths.append(os.path.join(root, file))
|
133 |
|
src/{utils_display.py → display_models/utils.py}
RENAMED
@@ -1,24 +1,27 @@
|
|
1 |
import os
|
2 |
from dataclasses import dataclass
|
|
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
API = HfApi()
|
6 |
|
7 |
|
8 |
-
# These classes are for user facing column names, to avoid having to change them
|
9 |
-
# all around the code when a modif is needed
|
10 |
@dataclass
|
11 |
class ColumnContent:
|
12 |
name: str
|
13 |
-
type: str
|
14 |
-
displayed_by_default: bool
|
15 |
hidden: bool = False
|
16 |
|
|
|
17 |
def fields(raw_class):
|
18 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
19 |
|
|
|
20 |
@dataclass(frozen=True)
|
21 |
-
class AutoEvalColumn:
|
22 |
model_type_symbol = ColumnContent("T", "str", True)
|
23 |
model = ColumnContent("Model", "markdown", True)
|
24 |
average = ColumnContent("Average ⬆️", "number", True)
|
@@ -27,15 +30,19 @@ class AutoEvalColumn: # Auto evals column
|
|
27 |
mmlu = ColumnContent("MMLU", "number", True)
|
28 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
29 |
model_type = ColumnContent("Type", "str", False)
|
30 |
-
precision = ColumnContent("Precision", "str", False)
|
31 |
license = ColumnContent("Hub License", "str", False)
|
32 |
params = ColumnContent("#Params (B)", "number", False)
|
33 |
likes = ColumnContent("Hub ❤️", "number", False)
|
|
|
34 |
revision = ColumnContent("Model sha", "str", False, False)
|
35 |
-
dummy = ColumnContent(
|
|
|
|
|
|
|
36 |
|
37 |
@dataclass(frozen=True)
|
38 |
-
class EloEvalColumn:
|
39 |
model = ColumnContent("Model", "markdown", True)
|
40 |
gpt4 = ColumnContent("GPT-4 (all)", "number", True)
|
41 |
human_all = ColumnContent("Human (all)", "number", True)
|
@@ -44,7 +51,7 @@ class EloEvalColumn: # Elo evals column
|
|
44 |
|
45 |
|
46 |
@dataclass(frozen=True)
|
47 |
-
class EvalQueueColumn:
|
48 |
model = ColumnContent("model", "markdown", True)
|
49 |
revision = ColumnContent("revision", "str", True)
|
50 |
private = ColumnContent("private", "bool", True)
|
@@ -52,7 +59,13 @@ class EvalQueueColumn: # Queue column
|
|
52 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
53 |
status = ColumnContent("status", "str", True)
|
54 |
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
|
58 |
KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
|
@@ -90,29 +103,44 @@ def make_clickable_model(model_name):
|
|
90 |
elif model_name == "oasst-12b":
|
91 |
link = OASST_LINK
|
92 |
|
93 |
-
details_model_name = model_name.replace(
|
94 |
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
|
95 |
|
96 |
if not bool(os.getenv("DEBUG", "False")):
|
97 |
# We only add these checks when not debugging, as they are extremely slow
|
98 |
print(f"details_link: {details_link}")
|
99 |
try:
|
100 |
-
check_path = list(
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
103 |
print(f"check_path: {check_path}")
|
104 |
except Exception as err:
|
105 |
# No details repo for this model
|
106 |
print(f"No details repo for this model: {err}")
|
107 |
return model_hyperlink(link, model_name)
|
108 |
|
109 |
-
return model_hyperlink(link, model_name) +
|
|
|
110 |
|
111 |
def styled_error(error):
|
112 |
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
113 |
|
|
|
114 |
def styled_warning(warn):
|
115 |
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
|
116 |
|
|
|
117 |
def styled_message(message):
|
118 |
-
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
from dataclasses import dataclass
|
3 |
+
|
4 |
from huggingface_hub import HfApi
|
5 |
|
6 |
API = HfApi()
|
7 |
|
8 |
|
9 |
+
# These classes hold the user-facing column names, so that a rename only has
# to be made in one place instead of everywhere the columns are referenced.
@dataclass
class ColumnContent:
    """Describe a single column of one of the displayed tables."""

    # Header text shown for the column.
    name: str
    # Type tag of the column's values; the tags used in this file are
    # "str", "number", "markdown" and "bool".
    type: str
    # Presumably whether the column is shown before the user selects it
    # explicitly -- confirm against the UI code that consumes this flag.
    displayed_by_default: bool
    # Optional flag, off unless stated; NOTE(review): its exact effect is
    # decided by the caller and is not visible here.
    hidden: bool = False
|
17 |
|
18 |
+
|
19 |
def fields(raw_class):
    """Return the class-level attribute values of *raw_class*.

    Attributes are read from the class's own ``__dict__`` (inherited ones are
    not included) in definition order. Any attribute whose name starts or ends
    with a double underscore is left out.
    """
    return [
        value
        for key, value in vars(raw_class).items()
        if not (key.startswith("__") or key.endswith("__"))
    ]
|
21 |
|
22 |
+
|
23 |
@dataclass(frozen=True)
|
24 |
+
class AutoEvalColumn: # Auto evals column
|
25 |
model_type_symbol = ColumnContent("T", "str", True)
|
26 |
model = ColumnContent("Model", "markdown", True)
|
27 |
average = ColumnContent("Average ⬆️", "number", True)
|
|
|
30 |
mmlu = ColumnContent("MMLU", "number", True)
|
31 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
32 |
model_type = ColumnContent("Type", "str", False)
|
33 |
+
precision = ColumnContent("Precision", "str", False) # , True)
|
34 |
license = ColumnContent("Hub License", "str", False)
|
35 |
params = ColumnContent("#Params (B)", "number", False)
|
36 |
likes = ColumnContent("Hub ❤️", "number", False)
|
37 |
+
still_on_hub = ColumnContent("Available on the hub", "bool", False)
|
38 |
revision = ColumnContent("Model sha", "str", False, False)
|
39 |
+
dummy = ColumnContent(
|
40 |
+
"model_name_for_query", "str", True
|
41 |
+
) # dummy col to implement search bar (hidden by custom CSS)
|
42 |
+
|
43 |
|
44 |
@dataclass(frozen=True)
|
45 |
+
class EloEvalColumn: # Elo evals column
|
46 |
model = ColumnContent("Model", "markdown", True)
|
47 |
gpt4 = ColumnContent("GPT-4 (all)", "number", True)
|
48 |
human_all = ColumnContent("Human (all)", "number", True)
|
|
|
51 |
|
52 |
|
53 |
@dataclass(frozen=True)
|
54 |
+
class EvalQueueColumn: # Queue column
|
55 |
model = ColumnContent("model", "markdown", True)
|
56 |
revision = ColumnContent("revision", "str", True)
|
57 |
private = ColumnContent("private", "bool", True)
|
|
|
59 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
60 |
status = ColumnContent("status", "str", True)
|
61 |
|
62 |
+
|
63 |
+
LLAMAS = [
|
64 |
+
"huggingface/llama-7b",
|
65 |
+
"huggingface/llama-13b",
|
66 |
+
"huggingface/llama-30b",
|
67 |
+
"huggingface/llama-65b",
|
68 |
+
]
|
69 |
|
70 |
|
71 |
KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
|
|
|
103 |
elif model_name == "oasst-12b":
|
104 |
link = OASST_LINK
|
105 |
|
106 |
+
details_model_name = model_name.replace("/", "__")
|
107 |
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
|
108 |
|
109 |
if not bool(os.getenv("DEBUG", "False")):
|
110 |
# We only add these checks when not debugging, as they are extremely slow
|
111 |
print(f"details_link: {details_link}")
|
112 |
try:
|
113 |
+
check_path = list(
|
114 |
+
API.list_files_info(
|
115 |
+
repo_id=f"open-llm-leaderboard/details_{details_model_name}",
|
116 |
+
paths="README.md",
|
117 |
+
repo_type="dataset",
|
118 |
+
)
|
119 |
+
)
|
120 |
print(f"check_path: {check_path}")
|
121 |
except Exception as err:
|
122 |
# No details repo for this model
|
123 |
print(f"No details repo for this model: {err}")
|
124 |
return model_hyperlink(link, model_name)
|
125 |
|
126 |
+
return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
|
127 |
+
|
128 |
|
129 |
def styled_error(error):
    """Wrap *error* in a centered, red, 20px HTML paragraph and return the markup."""
    html = f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
    return html
|
131 |
|
132 |
+
|
133 |
def styled_warning(warn):
    """Wrap *warn* in a centered, orange, 20px HTML paragraph and return the markup."""
    html = f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
    return html
|
135 |
|
136 |
+
|
137 |
def styled_message(message):
    """Wrap *message* in a centered, green, 20px HTML paragraph and return the markup."""
    html = f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
    return html
|
139 |
+
|
140 |
+
|
141 |
+
def has_no_nan_values(df, columns):
    """Return a per-row boolean mask that is True where none of *columns* is missing.

    *df* is indexed with *columns* and the result is reduced along ``axis=1``,
    so *columns* is expected to select several columns (e.g. a list of names).
    """
    present = df[columns].notna()
    return present.all(axis=1)
|
143 |
+
|
144 |
+
|
145 |
+
def has_nan_values(df, columns):
    """Return a per-row boolean mask that is True where any of *columns* is missing.

    *df* is indexed with *columns* and the result is reduced along ``axis=1``,
    so *columns* is expected to select several columns (e.g. a list of names).
    """
    missing = df[columns].isna()
    return missing.any(axis=1)
|
src/init.py
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from huggingface_hub import Repository
|
3 |
-
|
4 |
-
|
5 |
-
def get_all_requested_models(requested_models_dir):
|
6 |
-
depth = 1
|
7 |
-
file_names = []
|
8 |
-
|
9 |
-
for root, dirs, files in os.walk(requested_models_dir):
|
10 |
-
current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
|
11 |
-
if current_depth == depth:
|
12 |
-
file_names.extend([os.path.join(root, file) for file in files])
|
13 |
-
|
14 |
-
return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
|
15 |
-
|
16 |
-
def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
|
17 |
-
eval_queue_repo = None
|
18 |
-
eval_results_repo = None
|
19 |
-
requested_models = None
|
20 |
-
|
21 |
-
print("Pulling evaluation requests and results.")
|
22 |
-
|
23 |
-
eval_queue_repo = Repository(
|
24 |
-
local_dir=QUEUE_PATH,
|
25 |
-
clone_from=QUEUE_REPO,
|
26 |
-
repo_type="dataset",
|
27 |
-
)
|
28 |
-
eval_queue_repo.git_pull()
|
29 |
-
|
30 |
-
eval_results_repo = Repository(
|
31 |
-
local_dir=RESULTS_PATH,
|
32 |
-
clone_from=RESULTS_REPO,
|
33 |
-
repo_type="dataset",
|
34 |
-
)
|
35 |
-
eval_results_repo.git_pull()
|
36 |
-
|
37 |
-
requested_models = get_all_requested_models("eval-queue")
|
38 |
-
|
39 |
-
return eval_queue_repo, requested_models, eval_results_repo
|
40 |
-
|
41 |
-
|
42 |
-
#def load_results(model, benchmark, metric):
|
43 |
-
# file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
|
44 |
-
# if not os.path.exists(file_path):
|
45 |
-
# return 0.0, None
|
46 |
-
|
47 |
-
# with open(file_path) as fp:
|
48 |
-
# data = json.load(fp)
|
49 |
-
# accs = np.array([v[metric] for k, v in data["results"].items()])
|
50 |
-
# mean_acc = np.mean(accs)
|
51 |
-
# return mean_acc, data["config"]["model_args"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/load_from_hub.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from huggingface_hub import Repository
|
6 |
+
from transformers import AutoConfig
|
7 |
+
|
8 |
+
from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
|
9 |
+
from src.display_models.get_model_metadata import apply_metadata
|
10 |
+
from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
|
11 |
+
from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
|
12 |
+
|
13 |
+
# Flag read from the IS_PUBLIC environment variable. Environment values are
# always strings, and bool("False") is True, so the value is compared against
# the usual "off" spellings instead of being passed to bool(). An unset
# variable keeps the previous default of True; an empty string stays False.
IS_PUBLIC = os.environ.get("IS_PUBLIC", "True").strip().lower() not in {"", "0", "false", "no", "off"}
|
14 |
+
|
15 |
+
|
16 |
+
def get_all_requested_models(requested_models_dir: str) -> set[str]:
    """Collect the files stored exactly one directory below *requested_models_dir*.

    Args:
        requested_models_dir: Directory to walk.

    Returns:
        The set of lower-cased file paths, each reduced to the part that
        follows the first ``"eval-queue/"`` in the path.

    Raises:
        IndexError: If a collected path does not contain ``"eval-queue/"``
            once lower-cased.
    """
    # Depth is measured in path separators relative to the starting directory.
    base_depth = requested_models_dir.count(os.sep)
    file_names = []

    for root, _, files in os.walk(requested_models_dir):
        # Keep only files sitting in the direct sub-directories (depth 1).
        if root.count(os.sep) - base_depth == 1:
            file_names.extend(os.path.join(root, file) for file in files)

    # NOTE(review): the marker is hard-coded and uses "/", so this relies on
    # the directory being named "eval-queue" and on a POSIX path separator.
    return {file_name.lower().split("eval-queue/")[1] for file_name in file_names}
|
26 |
+
|
27 |
+
|
28 |
+
def load_all_info_from_hub(
    QUEUE_REPO: str, RESULTS_REPO: str, QUEUE_PATH: str, RESULTS_PATH: str
) -> tuple[Repository, set[str], Repository]:
    """Set up the local request/result repositories and list the requested models.

    Args:
        QUEUE_REPO: Hub repo id of the evaluation-request dataset.
        RESULTS_REPO: Hub repo id of the evaluation-results dataset.
        QUEUE_PATH: Local directory used for the request repository.
        RESULTS_PATH: Local directory used for the results repository.

    Returns:
        A 3-tuple ``(eval_queue_repo, requested_models, eval_results_repo)``;
        note the order: the set of requested models sits between the two
        ``Repository`` objects.
    """
    print("Pulling evaluation requests and results.")

    # NOTE(review): no explicit git_pull() is issued after construction, so
    # freshness depends on what Repository(clone_from=...) does -- confirm.
    eval_queue_repo = Repository(
        local_dir=QUEUE_PATH,
        clone_from=QUEUE_REPO,
        repo_type="dataset",
    )

    eval_results_repo = Repository(
        local_dir=RESULTS_PATH,
        clone_from=RESULTS_REPO,
        repo_type="dataset",
    )

    # NOTE(review): the directory name is hard-coded rather than taken from
    # QUEUE_PATH; get_all_requested_models also keys on "eval-queue/".
    requested_models = get_all_requested_models("eval-queue")

    return eval_queue_repo, requested_models, eval_results_repo
|
52 |
+
|
53 |
+
|
54 |
+
def get_leaderboard_df(
    eval_results: Repository, eval_results_private: Repository, cols: list, benchmark_cols: list
) -> pd.DataFrame:
    """Build the leaderboard table from the evaluation results.

    Args:
        eval_results: Results repository; only its truthiness is used here.
        eval_results_private: Private results repository; only its truthiness
            is used here.
        cols: Columns to keep in the returned frame, in order.
        benchmark_cols: Columns that must all be non-missing for a row to be kept.

    Returns:
        The rows sorted by the average column (highest first), restricted to
        *cols*, rounded to two decimals, without rows lacking a benchmark value.
    """
    # Only a log line is emitted per available repository; nothing is pulled here.
    for repo in (eval_results, eval_results_private):
        if repo:
            print("Pulling evaluation results for the leaderboard.")

    records = get_eval_results_dicts()

    # Hard-coded reference rows are added before the metadata pass.
    for hardcoded_row in (gpt4_values, gpt35_values, baseline):
        records.append(hardcoded_row)
    apply_metadata(records)  # Populate model type based on known hardcoded values in `metadata.py`

    leaderboard = pd.DataFrame.from_records(records)
    leaderboard = leaderboard.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    leaderboard = leaderboard[cols].round(decimals=2)

    # Drop every row for which at least one benchmark has not been produced.
    complete_rows = has_no_nan_values(leaderboard, benchmark_cols)
    return leaderboard[complete_rows]
|
80 |
+
|
81 |
+
|
82 |
+
def _read_eval_request(file_path: str) -> dict:
    """Load one request JSON file and fill in its model-link and revision columns."""
    with open(file_path) as fp:
        data = json.load(fp)

    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
    # Requests without an explicit revision are shown as "main".
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(
    eval_queue: Repository, eval_queue_private: Repository, save_path: str, cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read every evaluation request under *save_path* and split them by status.

    Args:
        eval_queue: Request repository; only its truthiness is used here.
        eval_queue_private: Private request repository; only its truthiness
            is used here.
        save_path: Directory holding request files, either directly or inside
            one level of sub-directories.
        cols: Columns of the returned frames, in order.

    Returns:
        A 3-tuple ``(finished, running, pending)`` of data frames. "pending"
        covers the PENDING and RERUN statuses; "finished" covers every status
        starting with FINISHED.

    Raises:
        KeyError: If a request file has no ``"model"`` or ``"status"`` key.
    """
    # Only a log line is emitted per available repository; nothing is pulled here.
    if eval_queue:
        print("Pulling changes for the evaluation queue.")
    if eval_queue_private:
        print("Pulling changes for the evaluation queue.")

    all_evals = []
    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue
        entry_path = os.path.join(save_path, entry)

        if ".json" in entry:
            all_evals.append(_read_eval_request(entry_path))
        elif ".md" not in entry and os.path.isdir(entry_path):
            # A sub-directory of requests. The isdir() guard keeps a stray
            # non-JSON file from crashing os.listdir().
            for sub_entry in os.listdir(entry_path):
                if not sub_entry.startswith("."):
                    all_evals.append(_read_eval_request(os.path.join(entry_path, sub_entry)))

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
|
124 |
+
|
125 |
+
|
126 |
+
def is_model_on_hub(model_name: str, revision: str) -> "tuple[bool, str | None]":
    """Check whether the config of *model_name* at *revision* can be loaded.

    The config is requested with ``trust_remote_code=False``.

    Args:
        model_name: Model identifier passed to ``AutoConfig.from_pretrained``.
        revision: Revision passed to ``AutoConfig.from_pretrained``.

    Returns:
        A pair ``(ok, message)``. ``(True, None)`` when the config loads;
        otherwise ``False`` together with a sentence fragment explaining why.
        No exception is propagated to the caller.
    """
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)

    except ValueError:
        # NOTE(review): every ValueError is reported as the
        # "requires trust_remote_code" case -- confirm no other cause matters.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
        )

    except Exception as e:
        # Any other failure is logged and reported as "not found".
        print(f"Could not get the model config from the hub.: {e}")
        return False, "was not found on hub!"

    return True, None
|