Sai Vinay G committed
Commit 010b2a5
1 Parent(s): e16ecd0
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+default_language_version:
+  python: python3
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-case-conflict
+      - id: detect-private-key
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: requirements-txt-fixer
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: Format imports
+
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
+    hooks:
+      - id: black
+        name: Format code
+        additional_dependencies: ['click==8.0.2']
+
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    # Ruff version.
+    rev: 'v0.0.267'
+    hooks:
+      - id: ruff
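Note: once this config lands, contributors enable the hooks locally through pre-commit's standard workflow. A minimal sketch (not part of this commit; assumes `pre-commit` is installed, e.g. `pip install pre-commit`):

```python
# Sketch: register the git hook once, then run every configured hook
# (check-yaml, isort, black, ruff, ...) against the whole repository.
import subprocess

subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```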
Makefile ADDED
@@ -0,0 +1,13 @@
+.PHONY: style format
+
+
+style:
+	python -m black --line-length 119 .
+	python -m isort .
+	ruff check --fix .
+
+
+quality:
+	python -m black --check --line-length 119 .
+	python -m isort --check-only .
+	ruff check .
app.py CHANGED
@@ -2,23 +2,31 @@ import json
 import os
 from datetime import datetime, timezone
 
-
 import gradio as gr
-import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
-from transformers import AutoConfig
-
-from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
-from src.assets.text_content import *
-from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
-from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
-from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
-from src.init import get_all_requested_models, load_all_info_from_hub
-
-pd.set_option('display.precision', 1)
+from src.assets.text_content import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
+from src.display_models.utils import (
+    AutoEvalColumn,
+    EvalQueueColumn,
+    fields,
+    styled_error,
+    styled_message,
+    styled_warning,
+)
+from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
+
+pd.set_option("display.precision", 1)
 
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
@@ -37,20 +45,16 @@ EVAL_RESULTS_PATH = "eval-results"
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
-api = HfApi()
+api = HfApi(token=H4_TOKEN)
+
 
 def restart_space():
     api.restart_space(
         repo_id="gsaivinay/open_llm_leaderboard", token=H4_TOKEN
    )
 
-eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
-
-if not IS_PUBLIC:
-    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
-else:
-    eval_queue_private, eval_results_private = None, None
 
+# Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -63,116 +67,41 @@ if not IS_PUBLIC:
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
-
-
-def get_leaderboard_df():
-    if eval_results:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results.git_pull()
-    if eval_results_private:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results_private.git_pull()
-
-    all_data = get_eval_results_dicts()
-
-    # if not IS_PUBLIC:
-    all_data.append(gpt4_values)
-    all_data.append(gpt35_values)
-
-    all_data.append(baseline)
-    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
-
-    df = pd.DataFrame.from_records(all_data)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[COLS].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
-    return df
-
-
-def get_evaluation_queue_df():
-    if eval_queue:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue.git_pull()
-    if eval_queue_private:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue_private.git_pull()
-
-    entries = [
-        entry
-        for entry in os.listdir(EVAL_REQUESTS_PATH)
-        if not entry.startswith(".")
-    ]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data["# params"] = "unknown"
-            data["model"] = make_clickable_model(data["model"])
-            data["revision"] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [
-                e
-                for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
-                if not e.startswith(".")
-            ]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                # data["# params"] = get_n_params(data["model"])
-                data["model"] = make_clickable_model(data["model"])
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
-    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
-    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
-
+BENCHMARK_COLS = [
+    c.name
+    for c in [
+        AutoEvalColumn.arc,
+        AutoEvalColumn.hellaswag,
+        AutoEvalColumn.mmlu,
+        AutoEvalColumn.truthfulqa,
+    ]
+]
 
+## LOAD INFO FROM HUB
+eval_queue, requested_models, eval_results = load_all_info_from_hub(
+    QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
+)
 
-original_df = get_leaderboard_df()
+if not IS_PUBLIC:
+    (eval_queue_private, requested_models_private, eval_results_private,) = load_all_info_from_hub(
+        PRIVATE_QUEUE_REPO,
+        PRIVATE_RESULTS_REPO,
+        EVAL_REQUESTS_PATH_PRIVATE,
+        EVAL_RESULTS_PATH_PRIVATE,
+    )
+else:
+    eval_queue_private, eval_results_private = None, None
+
+original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
-) = get_evaluation_queue_df()
-
-def is_model_on_hub(model_name, revision) -> bool:
-    try:
-        AutoConfig.from_pretrained(model_name, revision=revision)
-        return True, None
-
-    except ValueError as e:
-        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
-
-    except Exception as e:
-        print(f"Could not get the model config from the hub.: {e}")
-        return False, "was not found on hub!"
+) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
+## INTERACTION FUNCTIONS
 def add_new_eval(
     model: str,
     base_model: str,
@@ -196,13 +125,12 @@ def add_new_eval(
         base_model_on_hub, error = is_model_on_hub(base_model, revision)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
-
 
     if not weight_type == "Adapter":
         model_on_hub, error = is_model_on_hub(model, revision)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
-
+
     print("adding new eval")
 
     eval_entry = {
@@ -233,7 +161,7 @@
 
     # Check for duplicate submission
     if out_path.split("eval-queue/")[1].lower() in requested_models:
-        return styled_warning("This model has been already submitted.")
+        return styled_warning("This model has been already submitted.")
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -242,7 +170,6 @@
         path_or_fileobj=out_path,
         path_in_repo=out_path.split("eval-queue/")[1],
        repo_id=QUEUE_REPO,
-        token=H4_TOKEN,
        repo_type="dataset",
         commit_message=f"Add {model} to eval queue",
     )
@@ -250,16 +177,19 @@
     # remove the local file
     os.remove(out_path)
 
-    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+    )
 
 
-def refresh():
-    leaderboard_df = get_leaderboard_df()
+# Basics
+def refresh() -> list[pd.DataFrame]:
+    leaderboard_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
     (
         finished_eval_queue_df,
         running_eval_queue_df,
         pending_eval_queue_df,
-    ) = get_evaluation_queue_df()
+    ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, COLS)
     return (
         leaderboard_df,
         finished_eval_queue_df,
@@ -268,74 +198,72 @@ def refresh():
     )
 
 
-def search_table(df, leaderboard_table, query):
-    if AutoEvalColumn.model_type.name in leaderboard_table.columns:
+def change_tab(query_param: str):
+    query_param = query_param.replace("'", '"')
+    query_param = json.loads(query_param)
+
+    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
+        return gr.Tabs.update(selected=1)
+    else:
+        return gr.Tabs.update(selected=0)
+
+
+# Searching and filtering
+def search_table(df: pd.DataFrame, current_columns_df: pd.DataFrame, query: str) -> pd.DataFrame:
+    current_columns = current_columns_df.columns
+    if AutoEvalColumn.model_type.name in current_columns:
         filtered_df = df[
             (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
             | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
-    ]
+        ]
     else:
         filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-    return filtered_df[leaderboard_table.columns]
+    return filtered_df[current_columns]
 
 
-def select_columns(df, columns):
-    always_here_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
-    # We use COLS to maintain sorting
-    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]]
+def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        AutoEvalColumn.model_type_symbol.name,
+        AutoEvalColumn.model.name,
+    ]
+    # We use COLS to maintain sorting
+    filtered_df = df[
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+    ]
     return filtered_df
 
-#TODO allow this to filter by values of any columns
-def filter_items(df, leaderboard_table, query):
-    if query == "all":
-        return df[leaderboard_table.columns]
-    else:
-        query = query[0]  # take only the emoji character
-    if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
-        filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
-    else:
-        return filtered_df[leaderboard_table.columns]
-    return filtered_df[leaderboard_table.columns]
-
-def filter_items_size(df, leaderboard_table, query):
-    numeric_intervals = {
-        "all": None,
-        "< 1B": (0, 1),
-        "~3B": (1, 5),
-        "~7B": (6, 11),
-        "~13B": (12, 15),
-        "~35B": (16, 55),
-        "60B+": (55, 1000)
-    }
-
-    if query == "all":
-        return df[leaderboard_table.columns]
-
-    numeric_interval = numeric_intervals[query]
-
-    if AutoEvalColumn.params.name in leaderboard_table.columns:
-        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors='coerce')
-        filtered_df = df[params_column.between(*numeric_interval)]
-    else:
-        return filtered_df[leaderboard_table.columns]
-    return filtered_df[leaderboard_table.columns]
 
-def change_tab(query_param):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-
-    if (
-        isinstance(query_param, dict)
-        and "tab" in query_param
-        and query_param["tab"] == "evaluation"
-    ):
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
+def filter_models(
+    df: pd.DataFrame, current_columns_df: pd.DataFrame, type_query: str, size_query: str, show_deleted: bool
+) -> pd.DataFrame:
+    current_columns = current_columns_df.columns
+
+    # Show all models
+    if show_deleted:
+        filtered_df = df[current_columns]
+    else:  # Show only still on the hub models
+        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True][current_columns]
+
+    if type_query != "all":
+        type_emoji = type_query[0]
+        filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
+
+    if size_query != "all":
+        numeric_intervals = {
+            "all": (0, 10000),
+            "< 1B": (0, 1),
+            "~3B": (1, 5),
+            "~7B": (6, 11),
+            "~13B": (12, 15),
+            "~35B": (16, 55),
+            "60B+": (55, 10000),
+        }
+        numeric_interval = numeric_intervals[size_query]
+        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+
+        filtered_df = filtered_df[params_column.between(*numeric_interval)]
 
-def update_filter_type(input_type, shown_columns):
-    shown_columns.append(AutoEvalColumn.params.name)
-    return gr.update(visible=(input_type == 'types')), gr.update(visible=(input_type == 'sizes')), shown_columns
+    return filtered_df
 
 
 demo = gr.Blocks(css=custom_css)
@@ -346,13 +274,39 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
-                shown_columns = gr.CheckboxGroup(
-                    choices = [c for c in COLS if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
-                    value = [c for c in COLS_LITE if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
-                    label="Select columns to show",
-                    elem_id="column-select",
-                    interactive=True,
-                )
+                with gr.Column():
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[
+                                c
+                                for c in COLS
+                                if c
+                                not in [
+                                    AutoEvalColumn.dummy.name,
+                                    AutoEvalColumn.model.name,
+                                    AutoEvalColumn.model_type_symbol.name,
+                                    AutoEvalColumn.still_on_hub.name,
+                                ]
+                            ],
+                            value=[
+                                c
+                                for c in COLS_LITE
+                                if c
+                                not in [
+                                    AutoEvalColumn.dummy.name,
+                                    AutoEvalColumn.model.name,
+                                    AutoEvalColumn.model_type_symbol.name,
+                                    AutoEvalColumn.still_on_hub.name,
+                                ]
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        deleted_models_visibility = gr.Checkbox(
+                            value=True, label="Show models removed from the hub", interactive=True
+                        )
                 with gr.Column(min_width=320):
                     search_bar = gr.Textbox(
                         placeholder="🔍 Search for your model and press ENTER...",
@@ -360,46 +314,47 @@ with demo:
                         elem_id="search-bar",
                     )
                 with gr.Box(elem_id="box-filter"):
-                    filter_type = gr.Dropdown(
-                        label="⏚ Filter model",
-                        choices=["types", "sizes"], value="types",
-                        interactive=True,
-                        elem_id="filter_type"
-                    )
-                    filter_columns = gr.Radio(
+                    filter_columns_type = gr.Radio(
                         label="⏚ Filter model types",
-                        show_label=False,
-                        choices = [
-                            "all",
+                        choices=[
+                            "all",
                             ModelType.PT.to_str(),
                             ModelType.FT.to_str(),
                             ModelType.IFT.to_str(),
-                            ModelType.RL.to_str(),
+                            ModelType.RL.to_str(),
                         ],
                         value="all",
-                        elem_id="filter-columns"
+                        interactive=True,
+                        elem_id="filter-columns-type",
                     )
                     filter_columns_size = gr.Radio(
                         label="⏚ Filter model sizes",
-                        show_label=False,
-                        choices = [
+                        choices=[
                             "all",
                             "< 1B",
                             "~3B",
                             "~7B",
                             "~13B",
                             "~35B",
-                            "60B+"
+                            "60B+",
                         ],
                         value="all",
-                        visible=False,
                         interactive=True,
-                        elem_id="filter-columns-size"
+                        elem_id="filter-columns-size",
                     )
-
+
                 leaderboard_table = gr.components.Dataframe(
-                    value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name]],
-                    headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
+                    value=leaderboard_df[
+                        [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+                        + shown_columns.value
+                        + [AutoEvalColumn.dummy.name]
+                    ],
+                    headers=[
+                        AutoEvalColumn.model_type_symbol.name,
+                        AutoEvalColumn.model.name,
+                    ]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name],
                     datatype=TYPES,
                     max_rows=None,
                     elem_id="leaderboard-table",
@@ -417,14 +372,55 @@
                 )
                 search_bar.submit(
                     search_table,
-                    [hidden_leaderboard_table_for_search, leaderboard_table, search_bar],
+                    [
+                        hidden_leaderboard_table_for_search,
+                        leaderboard_table,
+                        search_bar,
+                    ],
                     leaderboard_table,
                 )
-
-                filter_type.change(update_filter_type, inputs=[filter_type, shown_columns], outputs=[filter_columns, filter_columns_size, shown_columns], queue=False).then(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
-                shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table, queue=False)
-                filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table, queue=False)
-                filter_columns_size.change(filter_items_size, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns_size], leaderboard_table, queue=False)
+                shown_columns.change(
+                    select_columns,
+                    [hidden_leaderboard_table_for_search, shown_columns],
+                    leaderboard_table,
+                    queue=False,
+                )
+                filter_columns_type.change(
+                    filter_models,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        leaderboard_table,
+                        filter_columns_type,
+                        filter_columns_size,
+                        deleted_models_visibility,
+                    ],
+                    leaderboard_table,
+                    queue=False,
+                )
+                filter_columns_size.change(
+                    filter_models,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        leaderboard_table,
+                        filter_columns_type,
+                        filter_columns_size,
+                        deleted_models_visibility,
+                    ],
+                    leaderboard_table,
+                    queue=False,
+                )
+                deleted_models_visibility.change(
+                    filter_models,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        leaderboard_table,
+                        filter_columns_type,
+                        filter_columns_size,
+                        deleted_models_visibility,
+                    ],
+                    leaderboard_table,
+                    queue=False,
+                )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -434,7 +430,10 @@ with demo:
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
         with gr.Column():
-            with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
+            with gr.Accordion(
+                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                open=False,
+            ):
                 with gr.Row():
                     finished_eval_table = gr.components.Dataframe(
                         value=finished_eval_queue_df,
@@ -442,7 +441,10 @@ with demo:
                         datatype=EVAL_TYPES,
                         max_rows=5,
                    )
-            with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
+            with gr.Accordion(
+                f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                open=False,
+            ):
                 with gr.Row():
                     running_eval_table = gr.components.Dataframe(
                         value=running_eval_queue_df,
@@ -451,7 +453,10 @@ with demo:
                         max_rows=5,
                     )
 
-            with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
+            with gr.Accordion(
+                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                open=False,
+            ):
                 with gr.Row():
                     pending_eval_table = gr.components.Dataframe(
                         value=pending_eval_queue_df,
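Note: the new `filter_models` helper collapses the old `filter_items`/`filter_items_size` callbacks into one function that applies the hub-deletion mask, the type-emoji match, and the parameter-size interval in turn. A minimal, self-contained sketch of the numeric part (illustrative column names only): `pd.to_numeric(..., errors="coerce")` turns non-numeric parameter counts into NaN, which `Series.between` then excludes.

```python
import pandas as pd

df = pd.DataFrame({"model": ["a", "b", "c"], "params_b": ["7", "13", "unknown"]})
params = pd.to_numeric(df["params_b"], errors="coerce")  # "unknown" -> NaN
print(df[params.between(6, 11)])  # keeps only the ~7B row; NaN compares False
```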
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+[tool.ruff]
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E501"]  # line too long (black is taking care of this)
+line-length = 119
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+
+[tool.isort]
+profile = "black"
+line_length = 119
+
+[tool.black]
+line-length = 119
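Note: `select = ["E", "F"]` restricts ruff to pycodestyle and Pyflakes diagnostics, with line length (E501) left to black. A hypothetical snippet (not from the repo) showing one code from each family that this config would flag:

```python
import os  # F401: `os` imported but unused (a Pyflakes "F" code)

x=1  # E225: missing whitespace around operator (a pycodestyle "E" code)
```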
requirements.txt CHANGED
@@ -24,7 +24,7 @@ gradio_client==0.1.3
 h11==0.14.0
 httpcore==0.17.0
 httpx==0.24.0
-huggingface-hub==0.13.4
+huggingface-hub==0.16.4
 idna==3.4
 Jinja2==3.1.2
 jsonschema==4.17.3
@@ -59,7 +59,7 @@ sniffio==1.3.0
 starlette==0.26.1
 toolz==0.12.0
 tqdm==4.65.0
-transformers==4.28.1
+transformers==4.32.0
 typing_extensions==4.5.0
 tzdata==2023.3
 tzlocal==4.3
src/assets/css_html_js.py CHANGED
@@ -89,13 +89,13 @@ table th:first-child {
 #filter_type label > .wrap .wrap-inner input{
     width: 1px
 }
-#filter-columns{
+#filter-columns-type{
     border:0;
-    padding:0;
+    padding:0.5;
 }
 #filter-columns-size{
     border:0;
-    padding:0;
+    padding:0.5;
 }
 #box-filter > .form{
     border: 0
@@ -108,4 +108,4 @@ get_window_url_params = """
     url_params = Object.fromEntries(params);
     return url_params;
     }
-    """
+    """
src/assets/hardcoded_evals.py CHANGED
@@ -1,4 +1,4 @@
-from src.utils_display import AutoEvalColumn, model_hyperlink
+from src.display_models.utils import AutoEvalColumn, model_hyperlink
 
 gpt4_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
@@ -6,9 +6,9 @@ gpt4_values = {
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 84.3,
     AutoEvalColumn.arc.name: 96.3,
-    AutoEvalColumn.hellaswag.name: 95.3,
-    AutoEvalColumn.mmlu.name: 86.4,
-    AutoEvalColumn.truthfulqa.name: 59.0,
+    AutoEvalColumn.hellaswag.name: 95.3,
+    AutoEvalColumn.mmlu.name: 86.4,
+    AutoEvalColumn.truthfulqa.name: 59.0,
     AutoEvalColumn.dummy.name: "GPT-4",
     AutoEvalColumn.model_type.name: "",
 }
@@ -19,9 +19,9 @@ gpt35_values = {
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 71.9,
     AutoEvalColumn.arc.name: 85.2,
-    AutoEvalColumn.hellaswag.name: 85.5,
-    AutoEvalColumn.mmlu.name: 70.0,
-    AutoEvalColumn.truthfulqa.name: 47.0,
+    AutoEvalColumn.hellaswag.name: 85.5,
+    AutoEvalColumn.mmlu.name: 70.0,
+    AutoEvalColumn.truthfulqa.name: 47.0,
     AutoEvalColumn.dummy.name: "GPT-3.5",
     AutoEvalColumn.model_type.name: "",
 }
@@ -32,10 +32,9 @@ baseline = {
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 25.0,
     AutoEvalColumn.arc.name: 25.0,
-    AutoEvalColumn.hellaswag.name: 25.0,
-    AutoEvalColumn.mmlu.name: 25.0,
-    AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.hellaswag.name: 25.0,
+    AutoEvalColumn.mmlu.name: 25.0,
+    AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
-
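Note: these hardcoded reference rows (GPT-4, GPT-3.5, and the random baseline) are appended to the harvested results before the leaderboard DataFrame is built. A minimal sketch of that step, with simplified column names (the real code keys the dicts on `AutoEvalColumn.*.name`):

```python
import pandas as pd

all_data = [
    {"Model": "gpt4", "Average": 84.3},      # hardcoded reference row
    {"Model": "baseline", "Average": 25.0},  # random-chance baseline
]
df = pd.DataFrame.from_records(all_data)
df = df.sort_values(by="Average", ascending=False).round(decimals=2)
```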
 
src/assets/text_content.py CHANGED
@@ -1,17 +1,17 @@
-from ..auto_leaderboard.model_metadata_type import ModelType
+from src.display_models.model_metadata_type import ModelType
 
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
 
-INTRODUCTION_TEXT = f"""
+INTRODUCTION_TEXT = """
 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
 
-🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
+🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 """
 
 LLM_BENCHMARKS_TEXT = f"""
 # Context
-With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
+With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
 ## Icons
 {ModelType.PT.to_str(" : ")} model
@@ -25,14 +25,14 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 
 ## How it works
 
-📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
+📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
 - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
 
-For all these evaluations, a higher score is a better score.
+For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 
 ## Details and logs
@@ -46,7 +46,7 @@ To reproduce our results, here is the commands you can run, using [this version]
 `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
 ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
 
-The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
+The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
 *You can expect results to vary slightly for different batch sizes because of padding.*
 
 The tasks and few shots parameters are:
@@ -65,7 +65,7 @@ If you still have questions, you can check our FAQ [here](https://huggingface.co
 We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
 """
 
-EVALUATION_QUEUE_TEXT = f"""
+EVALUATION_QUEUE_TEXT = """
 # Evaluation Queue for the 🤗 Open LLM Leaderboard
 
 Models added here will be automatically evaluated on the 🤗 cluster.
@@ -79,7 +79,7 @@ config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
 
 Note: make sure your model is public!
 Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
@@ -94,8 +94,8 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 ## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
@@ -135,7 +135,7 @@ CITATION_BUTTON_TEXT = r"""
     url = {https://doi.org/10.5281/zenodo.5371628}
 }
 @misc{clark2018think,
-    title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+    title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
     author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
     year={2018},
     eprint={1803.05457},
@@ -143,7 +143,7 @@ CITATION_BUTTON_TEXT = r"""
     primaryClass={cs.AI}
 }
 @misc{zellers2019hellaswag,
-    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
     author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
     year={2019},
     eprint={1905.07830},
@@ -151,7 +151,7 @@ CITATION_BUTTON_TEXT = r"""
     primaryClass={cs.CL}
 }
 @misc{hendrycks2021measuring,
-    title={Measuring Massive Multitask Language Understanding},
+    title={Measuring Massive Multitask Language Understanding},
     author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
     year={2021},
     eprint={2009.03300},
@@ -159,7 +159,7 @@ CITATION_BUTTON_TEXT = r"""
     primaryClass={cs.CY}
 }
 @misc{lin2022truthfulqa,
-    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
     author={Stephanie Lin and Jacob Hilton and Owain Evans},
     year={2022},
     eprint={2109.07958},
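Note: `LLM_BENCHMARKS_TEXT` stays an f-string because its Icons section interpolates `ModelType.PT.to_str(" : ")` and friends, while the other constants drop the needless `f` prefix. A self-contained sketch of that helper (a trimmed copy of the enum now living under src/display_models/):

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelInfo:
    name: str
    symbol: str  # emoji


class ModelType(Enum):
    PT = ModelInfo(name="pretrained", symbol="🟢")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"


print(ModelType.PT.to_str(" : "))  # -> 🟢 : pretrained
```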
src/auto_leaderboard/model_metadata_type.py DELETED
@@ -1,551 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict
-
-
-@dataclass
-class ModelInfo:
-    name: str
-    symbol: str  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelInfo(name="pretrained", symbol="🟢")
-    FT = ModelInfo(name="fine-tuned", symbol="🔶")
-    IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
-    RL = ModelInfo(name="RL-tuned", symbol="🟦")
-    Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
-
-    def to_str(self, separator = " "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-
-MODEL_TYPE_METADATA: Dict[str, ModelType] = {
-    'notstoic/PygmalionCoT-7b': ModelType.IFT,
-    'aisquared/dlite-v1-355m': ModelType.IFT,
-    'aisquared/dlite-v1-1_5b': ModelType.IFT,
-    'aisquared/dlite-v1-774m': ModelType.IFT,
-    'aisquared/dlite-v1-124m': ModelType.IFT,
-    'aisquared/chopt-2_7b': ModelType.IFT,
-    'aisquared/dlite-v2-124m': ModelType.IFT,
-    'aisquared/dlite-v2-774m': ModelType.IFT,
-    'aisquared/dlite-v2-1_5b': ModelType.IFT,
-    'aisquared/chopt-1_3b': ModelType.IFT,
-    'aisquared/dlite-v2-355m': ModelType.IFT,
-    'augtoma/qCammel-13': ModelType.IFT,
-    'Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload': ModelType.IFT,
-    'Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload': ModelType.IFT,
-    'TheBloke/alpaca-lora-65B-HF': ModelType.FT,
-    'TheBloke/tulu-7B-fp16': ModelType.IFT,
-    'TheBloke/guanaco-7B-HF': ModelType.FT,
-    'TheBloke/koala-7B-HF': ModelType.FT,
-    'TheBloke/wizardLM-7B-HF': ModelType.IFT,
-    'TheBloke/airoboros-13B-HF': ModelType.IFT,
-    'TheBloke/koala-13B-HF': ModelType.FT,
-    'TheBloke/Wizard-Vicuna-7B-Uncensored-HF': ModelType.FT,
-    'TheBloke/dromedary-65b-lora-HF': ModelType.IFT,
-    'TheBloke/wizardLM-13B-1.0-fp16': ModelType.IFT,
-    'TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16': ModelType.FT,
-    'TheBloke/Wizard-Vicuna-30B-Uncensored-fp16': ModelType.FT,
-    'TheBloke/wizard-vicuna-13B-HF': ModelType.IFT,
-    'TheBloke/UltraLM-13B-fp16': ModelType.IFT,
-    'TheBloke/OpenAssistant-FT-7-Llama-30B-HF': ModelType.FT,
-    'TheBloke/vicuna-13B-1.1-HF': ModelType.IFT,
-    'TheBloke/guanaco-13B-HF': ModelType.FT,
-    'TheBloke/guanaco-65B-HF': ModelType.FT,
-    'TheBloke/airoboros-7b-gpt4-fp16': ModelType.IFT,
-    'TheBloke/llama-30b-supercot-SuperHOT-8K-fp16': ModelType.IFT,
-    'TheBloke/Llama-2-13B-fp16': ModelType.PT,
-    'TheBloke/llama-2-70b-Guanaco-QLoRA-fp16': ModelType.FT,
-    'TheBloke/landmark-attention-llama7b-fp16': ModelType.IFT,
-    'TheBloke/Planner-7B-fp16': ModelType.IFT,
-    'TheBloke/Wizard-Vicuna-13B-Uncensored-HF': ModelType.FT,
-    'TheBloke/gpt4-alpaca-lora-13B-HF': ModelType.IFT,
-    'TheBloke/gpt4-x-vicuna-13B-HF': ModelType.IFT,
-    'TheBloke/gpt4-alpaca-lora_mlp-65B-HF': ModelType.IFT,
-    'TheBloke/tulu-13B-fp16': ModelType.IFT,
-    'TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16': ModelType.IFT,
-    'TheBloke/Llama-2-70B-fp16': ModelType.IFT,
-    'TheBloke/WizardLM-30B-fp16': ModelType.IFT,
-    'TheBloke/robin-13B-v2-fp16': ModelType.FT,
-    'TheBloke/robin-33B-v2-fp16': ModelType.FT,
-    'TheBloke/Vicuna-13B-CoT-fp16': ModelType.IFT,
-    'TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16': ModelType.IFT,
-    'TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16': ModelType.FT,
-    'TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16': ModelType.IFT,
-    'TheBloke/GPlatty-30B-SuperHOT-8K-fp16': ModelType.FT,
-    'TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16': ModelType.IFT,
-    'TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16': ModelType.IFT,
-    'jphme/orca_mini_v2_ger_7b': ModelType.IFT,
-    'Ejafa/vicuna_7B_vanilla_1.1': ModelType.FT,
-    'kevinpro/Vicuna-13B-CoT': ModelType.IFT,
-    'AlekseyKorshuk/pygmalion-6b-vicuna-chatml': ModelType.FT,
-    'AlekseyKorshuk/chatml-pyg-v1': ModelType.FT,
-    'concedo/Vicuzard-30B-Uncensored': ModelType.FT,
-    'concedo/OPT-19M-ChatSalad': ModelType.FT,
-    'concedo/Pythia-70M-ChatSalad': ModelType.FT,
-    'digitous/13B-HyperMantis': ModelType.IFT,
-    'digitous/Adventien-GPTJ': ModelType.FT,
-    'digitous/Alpacino13b': ModelType.IFT,
-    'digitous/GPT-R': ModelType.IFT,
-    'digitous/Javelin-R': ModelType.IFT,
-    'digitous/Javalion-GPTJ': ModelType.IFT,
-    'digitous/Javalion-R': ModelType.IFT,
-    'digitous/Skegma-GPTJ': ModelType.FT,
-    'digitous/Alpacino30b': ModelType.IFT,
-    'digitous/Janin-GPTJ': ModelType.FT,
-    'digitous/Janin-R': ModelType.FT,
-    'digitous/Javelin-GPTJ': ModelType.FT,
-    'SaylorTwift/gpt2_test': ModelType.PT,
-    'anton-l/gpt-j-tiny-random': ModelType.FT,
-    'Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca': ModelType.FT,
-    'Lazycuber/pyg-instruct-wizardlm': ModelType.FT,
-    'Lazycuber/Janemalion-6B': ModelType.FT,
-    'IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1': ModelType.FT,
-    'IDEA-CCNL/Ziya-LLaMA-13B-v1': ModelType.IFT,
-    'dsvv-cair/alpaca-cleaned-llama-30b-bf16': ModelType.FT,
-    'gpt2-medium': ModelType.PT,
-    'camel-ai/CAMEL-13B-Combined-Data': ModelType.IFT,
-    'camel-ai/CAMEL-13B-Role-Playing-Data': ModelType.FT,
-    'camel-ai/CAMEL-33B-Combined-Data': ModelType.IFT,
-    'PygmalionAI/pygmalion-6b': ModelType.FT,
-    'PygmalionAI/metharme-1.3b': ModelType.IFT,
-    'PygmalionAI/pygmalion-1.3b': ModelType.FT,
-    'PygmalionAI/pygmalion-350m': ModelType.FT,
-    'PygmalionAI/pygmalion-2.7b': ModelType.FT,
-    'medalpaca/medalpaca-7b': ModelType.FT,
-    'lilloukas/Platypus-30B': ModelType.IFT,
-    'lilloukas/GPlatty-30B': ModelType.FT,
-    'mncai/chatdoctor': ModelType.FT,
-    'chaoyi-wu/MedLLaMA_13B': ModelType.FT,
-    'LoupGarou/WizardCoder-Guanaco-15B-V1.0': ModelType.IFT,
-    'LoupGarou/WizardCoder-Guanaco-15B-V1.1': ModelType.FT,
-    'hakurei/instruct-12b': ModelType.IFT,
-    'hakurei/lotus-12B': ModelType.FT,
-    'shibing624/chinese-llama-plus-13b-hf': ModelType.IFT,
-    'shibing624/chinese-alpaca-plus-7b-hf': ModelType.IFT,
-    'shibing624/chinese-alpaca-plus-13b-hf': ModelType.IFT,
-    'mosaicml/mpt-7b-instruct': ModelType.IFT,
-    'mosaicml/mpt-30b-chat': ModelType.IFT,
-    'mosaicml/mpt-7b-storywriter': ModelType.FT,
-    'mosaicml/mpt-30b-instruct': ModelType.IFT,
-    'mosaicml/mpt-7b-chat': ModelType.IFT,
-    'mosaicml/mpt-30b': ModelType.PT,
-    'Corianas/111m': ModelType.IFT,
-    'Corianas/Quokka_1.3b': ModelType.IFT,
-    'Corianas/256_5epoch': ModelType.FT,
-    'Corianas/Quokka_256m': ModelType.IFT,
-    'Corianas/Quokka_590m': ModelType.IFT,
-    'Corianas/gpt-j-6B-Dolly': ModelType.FT,
-    'Corianas/Quokka_2.7b': ModelType.IFT,
-    'cyberagent/open-calm-7b': ModelType.FT,
-    'Aspik101/Nous-Hermes-13b-pl-lora_unload': ModelType.IFT,
-    'THUDM/chatglm2-6b': ModelType.IFT,
-    'MetaIX/GPT4-X-Alpasta-30b': ModelType.IFT,
-    'NYTK/PULI-GPTrio': ModelType.PT,
-    'EleutherAI/pythia-1.3b': ModelType.PT,
-    'EleutherAI/pythia-2.8b-deduped': ModelType.PT,
-    'EleutherAI/gpt-neo-125m': ModelType.PT,
-    'EleutherAI/pythia-160m': ModelType.PT,
-    'EleutherAI/gpt-neo-2.7B': ModelType.PT,
-    'EleutherAI/pythia-1b-deduped': ModelType.PT,
-    'EleutherAI/pythia-6.7b': ModelType.PT,
-    'EleutherAI/pythia-70m-deduped': ModelType.PT,
-    'EleutherAI/gpt-neox-20b': ModelType.PT,
-    'EleutherAI/pythia-1.4b-deduped': ModelType.PT,
-    'EleutherAI/pythia-2.7b': ModelType.PT,
-    'EleutherAI/pythia-6.9b-deduped': ModelType.PT,
-    'EleutherAI/pythia-70m': ModelType.PT,
-    'EleutherAI/gpt-j-6b': ModelType.PT,
-    'EleutherAI/pythia-12b-deduped': ModelType.PT,
-    'EleutherAI/gpt-neo-1.3B': ModelType.PT,
-    'EleutherAI/pythia-410m-deduped': ModelType.PT,
-    'EleutherAI/pythia-160m-deduped': ModelType.PT,
-    'EleutherAI/polyglot-ko-12.8b': ModelType.PT,
-    'EleutherAI/pythia-12b': ModelType.PT,
-    'roneneldan/TinyStories-33M': ModelType.PT,
-    'roneneldan/TinyStories-28M': ModelType.PT,
-    'roneneldan/TinyStories-1M': ModelType.PT,
-    'roneneldan/TinyStories-8M': ModelType.PT,
-    'roneneldan/TinyStories-3M': ModelType.PT,
-    'jerryjalapeno/nart-100k-7b': ModelType.FT,
-    'lmsys/vicuna-13b-v1.3': ModelType.IFT,
-    'lmsys/vicuna-7b-v1.3': ModelType.IFT,
-    'lmsys/vicuna-13b-v1.1': ModelType.IFT,
-    'lmsys/vicuna-13b-delta-v1.1': ModelType.IFT,
-    'lmsys/vicuna-7b-delta-v1.1': ModelType.IFT,
-    'abhiramtirumala/DialoGPT-sarcastic-medium': ModelType.FT,
-    'haonan-li/bactrian-x-llama-13b-merged': ModelType.IFT,
-    'Gryphe/MythoLogic-13b': ModelType.IFT,
-    'Gryphe/MythoBoros-13b': ModelType.IFT,
-    'pillowtalks-ai/delta13b': ModelType.FT,
-    'wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard': ModelType.FT,
-    'bigscience/bloom-7b1': ModelType.PT,
-    'bigcode/tiny_starcoder_py': ModelType.PT,
-    'bigcode/starcoderplus': ModelType.FT,
-    'bigcode/gpt_bigcode-santacoder': ModelType.PT,
-    'bigcode/starcoder': ModelType.PT,
-    'Open-Orca/OpenOrca-Preview1-13B': ModelType.IFT,
-    'microsoft/DialoGPT-large': ModelType.FT,
-    'microsoft/DialoGPT-small': ModelType.FT,
-    'microsoft/DialoGPT-medium': ModelType.FT,
-    'microsoft/CodeGPT-small-py': ModelType.FT,
-    'Tincando/fiction_story_generator': ModelType.FT,
-    'Pirr/pythia-13b-deduped-green_devil': ModelType.FT,
-    'Aeala/GPT4-x-AlpacaDente2-30b': ModelType.FT,
-    'Aeala/GPT4-x-AlpacaDente-30b': ModelType.FT,
-    'Aeala/GPT4-x-Alpasta-13b': ModelType.FT,
-    'Aeala/VicUnlocked-alpaca-30b': ModelType.IFT,
-    'Tap-M/Luna-AI-Llama2-Uncensored': ModelType.FT,
-    'illuin/test-custom-llama': ModelType.FT,
-    'dvruette/oasst-llama-13b-2-epochs': ModelType.FT,
-    'dvruette/oasst-gpt-neox-20b-1000-steps': ModelType.FT,
-    'dvruette/llama-13b-pretrained-dropout': ModelType.PT,
-    'dvruette/llama-13b-pretrained': ModelType.PT,
-    'dvruette/llama-13b-pretrained-sft-epoch-1': ModelType.FT,
-    'dvruette/llama-13b-pretrained-sft-do2': ModelType.FT,
-    'dvruette/oasst-gpt-neox-20b-3000-steps': ModelType.FT,
-    'dvruette/oasst-pythia-12b-pretrained-sft': ModelType.FT,
-    'dvruette/oasst-pythia-6.9b-4000-steps': ModelType.FT,
-    'dvruette/gpt-neox-20b-full-precision': ModelType.FT,
-    'dvruette/oasst-llama-13b-1000-steps': ModelType.FT,
-    'openlm-research/open_llama_7b_700bt_preview': ModelType.PT,
-    'openlm-research/open_llama_7b': ModelType.PT,
-    'openlm-research/open_llama_7b_v2': ModelType.PT,
-    'openlm-research/open_llama_3b': ModelType.PT,
-    'openlm-research/open_llama_13b': ModelType.PT,
-    'openlm-research/open_llama_3b_v2': ModelType.PT,
-    'PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged': ModelType.IFT,
-    'GeorgiaTechResearchInstitute/galpaca-30b': ModelType.IFT,
-    'GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct': ModelType.IFT,
-    'databricks/dolly-v2-7b': ModelType.IFT,
-    'databricks/dolly-v2-3b': ModelType.IFT,
-    'databricks/dolly-v2-12b': ModelType.IFT,
-    'Rachneet/gpt2-xl-alpaca': ModelType.FT,
-    'Locutusque/gpt2-conversational-or-qa': ModelType.FT,
-    'psyche/kogpt': ModelType.FT,
-    'NbAiLab/nb-gpt-j-6B-alpaca': ModelType.IFT,
-    'Mikael110/llama-2-7b-guanaco-fp16': ModelType.FT,
-    'Mikael110/llama-2-13b-guanaco-fp16': ModelType.FT,
-    'Fredithefish/CrimsonPajama': ModelType.IFT,
-    'Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K': ModelType.FT,
-    'Fredithefish/ScarletPajama-3B-HF': ModelType.FT,
-    'Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4': ModelType.IFT,
-    'acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1': ModelType.IFT,
-    'eachadea/vicuna-13b-1.1': ModelType.FT,
-    'eachadea/vicuna-7b-1.1': ModelType.FT,
-    'eachadea/vicuna-13b': ModelType.FT,
-    'openaccess-ai-collective/wizard-mega-13b': ModelType.IFT,
-    'openaccess-ai-collective/manticore-13b': ModelType.IFT,
-    'openaccess-ai-collective/manticore-30b-chat-pyg-alpha': ModelType.IFT,
-    'openaccess-ai-collective/minotaur-13b': ModelType.IFT,
-    'openaccess-ai-collective/minotaur-13b-fixed': ModelType.IFT,
-    'openaccess-ai-collective/hippogriff-30b-chat': ModelType.IFT,
-    'openaccess-ai-collective/manticore-13b-chat-pyg': ModelType.IFT,
-    'pythainlp/wangchanglm-7.5B-sft-enth': ModelType.IFT,
-    'pythainlp/wangchanglm-7.5B-sft-en-sharded': ModelType.IFT,
-    'euclaise/gpt-neox-122m-minipile-digits': ModelType.FT,
-    'stabilityai/StableBeluga1-Delta': ModelType.IFT,
-    'stabilityai/stablelm-tuned-alpha-7b': ModelType.IFT,
-    'stabilityai/StableBeluga2': ModelType.IFT,
-    'stabilityai/StableBeluga-13B': ModelType.IFT,
-    'stabilityai/StableBeluga-7B': ModelType.IFT,
-    'stabilityai/stablelm-base-alpha-7b': ModelType.PT,
-    'stabilityai/stablelm-base-alpha-3b': ModelType.PT,
-    'stabilityai/stablelm-tuned-alpha-3b': ModelType.IFT,
-    'alibidaran/medical_transcription_generator': ModelType.FT,
-    'CalderaAI/30B-Lazarus': ModelType.IFT,
-    'CalderaAI/13B-BlueMethod': ModelType.IFT,
-    'CalderaAI/13B-Ouroboros': ModelType.IFT,
-    'KoboldAI/OPT-13B-Erebus': ModelType.FT,
-    'KoboldAI/GPT-J-6B-Janeway': ModelType.FT,
-    'KoboldAI/GPT-J-6B-Shinen': ModelType.FT,
-    'KoboldAI/fairseq-dense-2.7B': ModelType.PT,
-    'KoboldAI/OPT-6B-nerys-v2': ModelType.FT,
-    'KoboldAI/GPT-NeoX-20B-Skein': ModelType.FT,
-    'KoboldAI/PPO_Pygway-6b-Mix': ModelType.FT,
-    'KoboldAI/fairseq-dense-6.7B': ModelType.PT,
-    'KoboldAI/fairseq-dense-125M': ModelType.PT,
-    'KoboldAI/OPT-13B-Nerybus-Mix': ModelType.FT,
-    'KoboldAI/OPT-2.7B-Erebus': ModelType.FT,
-    'KoboldAI/OPT-350M-Nerys-v2': ModelType.FT,
-    'KoboldAI/OPT-2.7B-Nerys-v2': ModelType.FT,
-    'KoboldAI/OPT-2.7B-Nerybus-Mix': ModelType.FT,
-    'KoboldAI/OPT-13B-Nerys-v2': ModelType.FT,
-    'KoboldAI/GPT-NeoX-20B-Erebus': ModelType.FT,
-    'KoboldAI/OPT-6.7B-Erebus': ModelType.FT,
-    'KoboldAI/fairseq-dense-355M': ModelType.PT,
-    'KoboldAI/OPT-6.7B-Nerybus-Mix': ModelType.FT,
-    'KoboldAI/GPT-J-6B-Adventure': ModelType.FT,
-    'KoboldAI/OPT-350M-Erebus': ModelType.FT,
-    'KoboldAI/GPT-J-6B-Skein': ModelType.FT,
-    'KoboldAI/OPT-30B-Erebus': ModelType.FT,
-    'klosax/pythia-160m-deduped-step92k-193bt': ModelType.PT,
-    'klosax/open_llama_3b_350bt_preview': ModelType.PT,
-    'klosax/openllama-3b-350bt': ModelType.PT,
-    'klosax/pythia-70m-deduped-step44k-92bt': ModelType.PT,
-    'klosax/open_llama_13b_600bt_preview': ModelType.PT,
-    'klosax/open_llama_7b_400bt_preview': ModelType.PT,
-    'kfkas/Llama-2-ko-7b-Chat': ModelType.IFT,
-    'WeOpenML/Alpaca-7B-v1': ModelType.IFT,
-    'WeOpenML/PandaLM-Alpaca-7B-v1': ModelType.IFT,
-    'TFLai/gpt2-turkish-uncased': ModelType.FT,
-    'ehartford/WizardLM-13B-Uncensored': ModelType.IFT,
-    'ehartford/dolphin-llama-13b': ModelType.IFT,
-    'ehartford/Wizard-Vicuna-30B-Uncensored': ModelType.FT,
-    'ehartford/WizardLM-30B-Uncensored': ModelType.IFT,
-    'ehartford/Wizard-Vicuna-13B-Uncensored': ModelType.FT,
-    'ehartford/WizardLM-7B-Uncensored': ModelType.IFT,
299
- 'ehartford/based-30b': ModelType.FT,
300
- 'ehartford/Wizard-Vicuna-7B-Uncensored': ModelType.FT,
301
- 'wahaha1987/llama_7b_sharegpt94k_fastchat': ModelType.FT,
302
- 'wahaha1987/llama_13b_sharegpt94k_fastchat': ModelType.FT,
303
- 'OpenAssistant/oasst-sft-1-pythia-12b': ModelType.FT,
304
- 'OpenAssistant/stablelm-7b-sft-v7-epoch-3': ModelType.IFT,
305
- 'OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5': ModelType.FT,
306
- 'OpenAssistant/pythia-12b-sft-v8-2.5k-steps': ModelType.IFT,
307
- 'OpenAssistant/pythia-12b-sft-v8-7k-steps': ModelType.IFT,
308
- 'OpenAssistant/pythia-12b-pre-v8-12.5k-steps': ModelType.IFT,
309
- 'OpenAssistant/llama2-13b-orca-8k-3319': ModelType.IFT,
310
- 'junelee/wizard-vicuna-13b': ModelType.FT,
311
- 'BreadAi/gpt-YA-1-1_160M': ModelType.PT,
312
- 'BreadAi/MuseCan': ModelType.PT,
313
- 'BreadAi/MusePy-1-2': ModelType.PT,
314
- 'BreadAi/DiscordPy': ModelType.PT,
315
- 'BreadAi/PM_modelV2': ModelType.PT,
316
- 'BreadAi/gpt-Youtube': ModelType.PT,
317
- 'BreadAi/StoryPy': ModelType.FT,
318
- 'julianweng/Llama-2-7b-chat-orcah': ModelType.FT,
319
- 'AGI-inc/lora_moe_7b_baseline': ModelType.FT,
320
- 'AGI-inc/lora_moe_7b': ModelType.FT,
321
- 'togethercomputer/GPT-NeoXT-Chat-Base-20B': ModelType.IFT,
322
- 'togethercomputer/RedPajama-INCITE-Chat-7B-v0.1': ModelType.IFT,
323
- 'togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1': ModelType.IFT,
324
- 'togethercomputer/RedPajama-INCITE-7B-Base': ModelType.PT,
325
- 'togethercomputer/RedPajama-INCITE-7B-Instruct': ModelType.IFT,
326
- 'togethercomputer/RedPajama-INCITE-Base-3B-v1': ModelType.PT,
327
- 'togethercomputer/Pythia-Chat-Base-7B': ModelType.IFT,
328
- 'togethercomputer/RedPajama-INCITE-Base-7B-v0.1': ModelType.PT,
329
- 'togethercomputer/GPT-JT-6B-v1': ModelType.IFT,
330
- 'togethercomputer/GPT-JT-6B-v0': ModelType.IFT,
331
- 'togethercomputer/RedPajama-INCITE-Chat-3B-v1': ModelType.IFT,
332
- 'togethercomputer/RedPajama-INCITE-7B-Chat': ModelType.IFT,
333
- 'togethercomputer/RedPajama-INCITE-Instruct-3B-v1': ModelType.IFT,
334
- 'Writer/camel-5b-hf': ModelType.IFT,
335
- 'Writer/palmyra-base': ModelType.PT,
336
- 'MBZUAI/LaMini-GPT-1.5B': ModelType.IFT,
337
- 'MBZUAI/lamini-cerebras-111m': ModelType.IFT,
338
- 'MBZUAI/lamini-neo-1.3b': ModelType.IFT,
339
- 'MBZUAI/lamini-cerebras-1.3b': ModelType.IFT,
340
- 'MBZUAI/lamini-cerebras-256m': ModelType.IFT,
341
- 'MBZUAI/LaMini-GPT-124M': ModelType.IFT,
342
- 'MBZUAI/lamini-neo-125m': ModelType.IFT,
343
- 'TehVenom/DiffMerge-DollyGPT-Pygmalion': ModelType.FT,
344
- 'TehVenom/PPO_Shygmalion-6b': ModelType.FT,
345
- 'TehVenom/Dolly_Shygmalion-6b-Dev_V8P2': ModelType.FT,
346
- 'TehVenom/Pygmalion_AlpacaLora-7b': ModelType.FT,
347
- 'TehVenom/PPO_Pygway-V8p4_Dev-6b': ModelType.FT,
348
- 'TehVenom/Dolly_Malion-6b': ModelType.FT,
349
- 'TehVenom/PPO_Shygmalion-V8p4_Dev-6b': ModelType.FT,
350
- 'TehVenom/ChanMalion': ModelType.FT,
351
- 'TehVenom/GPT-J-Pyg_PPO-6B': ModelType.IFT,
352
- 'TehVenom/Pygmalion-13b-Merged': ModelType.FT,
353
- 'TehVenom/Metharme-13b-Merged': ModelType.IFT,
354
- 'TehVenom/Dolly_Shygmalion-6b': ModelType.FT,
355
- 'TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4': ModelType.IFT,
356
- 'georgesung/llama2_7b_chat_uncensored': ModelType.FT,
357
- 'vicgalle/gpt2-alpaca': ModelType.IFT,
358
- 'vicgalle/alpaca-7b': ModelType.FT,
359
- 'vicgalle/gpt2-alpaca-gpt4': ModelType.IFT,
360
- 'facebook/opt-350m': ModelType.PT,
361
- 'facebook/opt-125m': ModelType.PT,
362
- 'facebook/xglm-4.5B': ModelType.PT,
363
- 'facebook/opt-2.7b': ModelType.PT,
364
- 'facebook/opt-6.7b': ModelType.PT,
365
- 'facebook/galactica-30b': ModelType.PT,
366
- 'facebook/opt-13b': ModelType.PT,
367
- 'facebook/opt-66b': ModelType.PT,
368
- 'facebook/xglm-7.5B': ModelType.PT,
369
- 'facebook/xglm-564M': ModelType.PT,
370
- 'facebook/opt-30b': ModelType.PT,
371
- 'golaxy/gogpt-7b': ModelType.FT,
372
- 'golaxy/gogpt2-7b': ModelType.FT,
373
- 'golaxy/gogpt-7b-bloom': ModelType.FT,
374
- 'golaxy/gogpt-3b-bloom': ModelType.FT,
375
- 'psmathur/orca_mini_v2_7b': ModelType.IFT,
376
- 'psmathur/orca_mini_7b': ModelType.IFT,
377
- 'psmathur/orca_mini_3b': ModelType.IFT,
378
- 'psmathur/orca_mini_v2_13b': ModelType.IFT,
379
- 'gpt2-xl': ModelType.PT,
380
- 'lxe/Cerebras-GPT-2.7B-Alpaca-SP': ModelType.FT,
381
- 'Monero/Manticore-13b-Chat-Pyg-Guanaco': ModelType.FT,
382
- 'Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b': ModelType.IFT,
383
- 'Monero/WizardLM-13b-OpenAssistant-Uncensored': ModelType.IFT,
384
- 'Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b': ModelType.IFT,
385
- 'jzjiao/opt-1.3b-rlhf': ModelType.FT,
386
- 'HuggingFaceH4/starchat-beta': ModelType.IFT,
387
- 'KnutJaegersberg/gpt-2-xl-EvolInstruct': ModelType.IFT,
388
- 'KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct': ModelType.IFT,
389
- 'KnutJaegersberg/galactica-orca-wizardlm-1.3b': ModelType.IFT,
390
- 'openchat/openchat_8192': ModelType.IFT,
391
- 'openchat/openchat_v2': ModelType.IFT,
392
- 'openchat/openchat_v2_w': ModelType.IFT,
393
- 'ausboss/llama-13b-supercot': ModelType.IFT,
394
- 'ausboss/llama-30b-supercot': ModelType.IFT,
395
- 'Neko-Institute-of-Science/metharme-7b': ModelType.IFT,
396
- 'Neko-Institute-of-Science/pygmalion-7b': ModelType.FT,
397
- 'SebastianSchramm/Cerebras-GPT-111M-instruction': ModelType.IFT,
398
- 'victor123/WizardLM-13B-1.0': ModelType.IFT,
399
- 'OpenBuddy/openbuddy-openllama-13b-v7-fp16': ModelType.FT,
400
- 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16': ModelType.FT,
401
- 'OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16': ModelType.FT,
402
- 'baichuan-inc/Baichuan-7B': ModelType.PT,
403
- 'tiiuae/falcon-40b-instruct': ModelType.IFT,
404
- 'tiiuae/falcon-40b': ModelType.PT,
405
- 'tiiuae/falcon-7b': ModelType.PT,
406
- 'YeungNLP/firefly-llama-13b': ModelType.FT,
407
- 'YeungNLP/firefly-llama-13b-v1.2': ModelType.FT,
408
- 'YeungNLP/firefly-llama2-13b': ModelType.FT,
409
- 'YeungNLP/firefly-ziya-13b': ModelType.FT,
410
- 'shaohang/Sparse0.5_OPT-1.3': ModelType.FT,
411
- 'xzuyn/Alpacino-SuperCOT-13B': ModelType.IFT,
412
- 'xzuyn/MedicWizard-7B': ModelType.FT,
413
- 'xDAN-AI/xDAN_13b_l2_lora': ModelType.FT,
414
- 'beomi/KoAlpaca-Polyglot-5.8B': ModelType.FT,
415
- 'beomi/llama-2-ko-7b': ModelType.IFT,
416
- 'Salesforce/codegen-6B-multi': ModelType.PT,
417
- 'Salesforce/codegen-16B-nl': ModelType.PT,
418
- 'Salesforce/codegen-6B-nl': ModelType.PT,
419
- 'ai-forever/rugpt3large_based_on_gpt2': ModelType.FT,
420
- 'gpt2-large': ModelType.PT,
421
- 'frank098/orca_mini_3b_juniper': ModelType.FT,
422
- 'frank098/WizardLM_13B_juniper': ModelType.FT,
423
- 'FPHam/Free_Sydney_13b_HF': ModelType.FT,
424
- 'huggingface/llama-13b': ModelType.PT,
425
- 'huggingface/llama-7b': ModelType.PT,
426
- 'huggingface/llama-65b': ModelType.PT,
427
- 'huggingface/llama-30b': ModelType.PT,
428
- 'Henk717/chronoboros-33B': ModelType.IFT,
429
- 'jondurbin/airoboros-13b-gpt4-1.4': ModelType.IFT,
430
- 'jondurbin/airoboros-7b': ModelType.IFT,
431
- 'jondurbin/airoboros-7b-gpt4': ModelType.IFT,
432
- 'jondurbin/airoboros-7b-gpt4-1.1': ModelType.IFT,
433
- 'jondurbin/airoboros-7b-gpt4-1.2': ModelType.IFT,
434
- 'jondurbin/airoboros-7b-gpt4-1.3': ModelType.IFT,
435
- 'jondurbin/airoboros-7b-gpt4-1.4': ModelType.IFT,
436
- 'jondurbin/airoboros-l2-7b-gpt4-1.4.1': ModelType.IFT,
437
- 'jondurbin/airoboros-l2-13b-gpt4-1.4.1': ModelType.IFT,
438
- 'jondurbin/airoboros-l2-70b-gpt4-1.4.1': ModelType.IFT,
439
- 'jondurbin/airoboros-13b': ModelType.IFT,
440
- 'jondurbin/airoboros-33b-gpt4-1.4': ModelType.IFT,
441
- 'jondurbin/airoboros-33b-gpt4-1.2': ModelType.IFT,
442
- 'jondurbin/airoboros-65b-gpt4-1.2': ModelType.IFT,
443
- 'ariellee/SuperPlatty-30B': ModelType.IFT,
444
- 'danielhanchen/open_llama_3b_600bt_preview': ModelType.FT,
445
- 'cerebras/Cerebras-GPT-256M': ModelType.PT,
446
- 'cerebras/Cerebras-GPT-1.3B': ModelType.PT,
447
- 'cerebras/Cerebras-GPT-13B': ModelType.PT,
448
- 'cerebras/Cerebras-GPT-2.7B': ModelType.PT,
449
- 'cerebras/Cerebras-GPT-111M': ModelType.PT,
450
- 'cerebras/Cerebras-GPT-6.7B': ModelType.PT,
451
- 'Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf': ModelType.RL,
452
- 'Yhyu13/llama-30B-hf-openassitant': ModelType.FT,
453
- 'NousResearch/Nous-Hermes-Llama2-13b': ModelType.IFT,
454
- 'NousResearch/Nous-Hermes-llama-2-7b': ModelType.IFT,
455
- 'NousResearch/Redmond-Puffin-13B': ModelType.IFT,
456
- 'NousResearch/Nous-Hermes-13b': ModelType.IFT,
457
- 'project-baize/baize-v2-7b': ModelType.IFT,
458
- 'project-baize/baize-v2-13b': ModelType.IFT,
459
- 'LLMs/WizardLM-13B-V1.0': ModelType.FT,
460
- 'LLMs/AlpacaGPT4-7B-elina': ModelType.FT,
461
- 'wenge-research/yayi-7b': ModelType.FT,
462
- 'wenge-research/yayi-7b-llama2': ModelType.FT,
463
- 'wenge-research/yayi-13b-llama2': ModelType.FT,
464
- 'yhyhy3/open_llama_7b_v2_med_instruct': ModelType.IFT,
465
- 'llama-anon/instruct-13b': ModelType.IFT,
466
- 'huggingtweets/jerma985': ModelType.FT,
467
- 'huggingtweets/gladosystem': ModelType.FT,
468
- 'huggingtweets/bladeecity-jerma985': ModelType.FT,
469
- 'huggyllama/llama-13b': ModelType.PT,
470
- 'huggyllama/llama-65b': ModelType.PT,
471
- 'FabbriSimo01/Facebook_opt_1.3b_Quantized': ModelType.PT,
472
- 'upstage/Llama-2-70b-instruct': ModelType.IFT,
473
- 'upstage/Llama-2-70b-instruct-1024': ModelType.IFT,
474
- 'upstage/llama-65b-instruct': ModelType.IFT,
475
- 'upstage/llama-30b-instruct-2048': ModelType.IFT,
476
- 'upstage/llama-30b-instruct': ModelType.IFT,
477
- 'WizardLM/WizardLM-13B-1.0': ModelType.IFT,
478
- 'WizardLM/WizardLM-13B-V1.1': ModelType.IFT,
479
- 'WizardLM/WizardLM-13B-V1.2': ModelType.IFT,
480
- 'WizardLM/WizardLM-30B-V1.0': ModelType.IFT,
481
- 'WizardLM/WizardCoder-15B-V1.0': ModelType.IFT,
482
- 'gpt2': ModelType.PT,
483
- 'keyfan/vicuna-chinese-replication-v1.1': ModelType.IFT,
484
- 'nthngdy/pythia-owt2-70m-100k': ModelType.FT,
485
- 'nthngdy/pythia-owt2-70m-50k': ModelType.FT,
486
- 'quantumaikr/KoreanLM-hf': ModelType.FT,
487
- 'quantumaikr/open_llama_7b_hf': ModelType.FT,
488
- 'quantumaikr/QuantumLM-70B-hf': ModelType.IFT,
489
- 'MayaPH/FinOPT-Lincoln': ModelType.FT,
490
- 'MayaPH/FinOPT-Franklin': ModelType.FT,
491
- 'MayaPH/GodziLLa-30B': ModelType.IFT,
492
- 'MayaPH/GodziLLa-30B-plus': ModelType.IFT,
493
- 'MayaPH/FinOPT-Washington': ModelType.FT,
494
- 'ogimgio/gpt-neo-125m-neurallinguisticpioneers': ModelType.FT,
495
- 'layoric/llama-2-13b-code-alpaca': ModelType.FT,
496
- 'CobraMamba/mamba-gpt-3b': ModelType.FT,
497
- 'CobraMamba/mamba-gpt-3b-v2': ModelType.FT,
498
- 'CobraMamba/mamba-gpt-3b-v3': ModelType.FT,
499
- 'timdettmers/guanaco-33b-merged': ModelType.FT,
500
- 'elinas/chronos-33b': ModelType.IFT,
501
- 'heegyu/RedTulu-Uncensored-3B-0719': ModelType.IFT,
502
- 'heegyu/WizardVicuna-Uncensored-3B-0719': ModelType.IFT,
503
- 'heegyu/WizardVicuna-3B-0719': ModelType.IFT,
504
- 'meta-llama/Llama-2-7b-chat-hf': ModelType.RL,
505
- 'meta-llama/Llama-2-7b-hf': ModelType.PT,
506
- 'meta-llama/Llama-2-13b-chat-hf': ModelType.RL,
507
- 'meta-llama/Llama-2-13b-hf': ModelType.PT,
508
- 'meta-llama/Llama-2-70b-chat-hf': ModelType.RL,
509
- 'meta-llama/Llama-2-70b-hf': ModelType.PT,
510
- 'xhyi/PT_GPTNEO350_ATG': ModelType.FT,
511
- 'h2oai/h2ogpt-gm-oasst1-en-1024-20b': ModelType.FT,
512
- 'h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt': ModelType.FT,
513
- 'h2oai/h2ogpt-oig-oasst1-512-6_9b': ModelType.IFT,
514
- 'h2oai/h2ogpt-oasst1-512-12b': ModelType.IFT,
515
- 'h2oai/h2ogpt-oig-oasst1-256-6_9b': ModelType.IFT,
516
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt': ModelType.FT,
517
- 'h2oai/h2ogpt-oasst1-512-20b': ModelType.IFT,
518
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2': ModelType.FT,
519
- 'h2oai/h2ogpt-gm-oasst1-en-1024-12b': ModelType.FT,
520
- 'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b': ModelType.FT,
521
- 'bofenghuang/vigogne-13b-instruct': ModelType.IFT,
522
- 'bofenghuang/vigogne-13b-chat': ModelType.FT,
523
- 'bofenghuang/vigogne-2-7b-instruct': ModelType.IFT,
524
- 'bofenghuang/vigogne-7b-instruct': ModelType.IFT,
525
- 'bofenghuang/vigogne-7b-chat': ModelType.FT,
526
- 'Vmware/open-llama-7b-v2-open-instruct': ModelType.IFT,
527
- 'VMware/open-llama-0.7T-7B-open-instruct-v1.1': ModelType.IFT,
528
- 'ewof/koishi-instruct-3b': ModelType.IFT,
529
- 'gywy/llama2-13b-chinese-v1': ModelType.FT,
530
- 'GOAT-AI/GOAT-7B-Community': ModelType.FT,
531
- 'psyche/kollama2-7b': ModelType.FT,
532
- 'TheTravellingEngineer/llama2-7b-hf-guanaco': ModelType.FT,
533
- 'beaugogh/pythia-1.4b-deduped-sharegpt': ModelType.FT,
534
- 'augtoma/qCammel-70-x': ModelType.IFT,
535
- 'Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload': ModelType.IFT,
536
- 'anhnv125/pygmalion-6b-roleplay': ModelType.FT,
537
- '64bits/LexPodLM-13B': ModelType.FT,
538
- }
539
-
540
-
541
- def model_type_from_str(type):
542
- if "fine-tuned" in type or "🔶" in type:
543
- return ModelType.FT
544
- if "pretrained" in type or "🟢" in type:
545
- return ModelType.PT
546
- if "RL-tuned" in type or "🟦" in type:
547
- return ModelType.RL
548
- if "instruction-tuned" in type or "⭕" in type:
549
- return ModelType.IFT
550
- return ModelType.Unknown
551
-
src/{auto_leaderboard → display_models}/get_model_metadata.py RENAMED
@@ -1,17 +1,17 @@
- import re
- import os
  import glob
  import json
  import os
+ import re
  from typing import List
+
+ import huggingface_hub
+ from huggingface_hub import HfApi
  from tqdm import tqdm

- from src.utils_display import AutoEvalColumn, model_hyperlink
- from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
- from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS, DO_NOT_SUBMIT_MODELS
-
- from huggingface_hub import HfApi
- import huggingface_hub
+ from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
+ from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink

  api = HfApi(token=os.environ.get("H4_TOKEN", None))


@@ -38,15 +38,18 @@ def get_model_license(model_info):
      except Exception:
          return None

+
  def get_model_likes(model_info):
      return model_info.likes

+
  size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

+
  def get_model_size(model_name, model_info):
      # In billions
      try:
-         return round(model_info.safetensors["total"] / 1e9, 3)
+         return round(model_info.safetensors["total"] / 1e9, 3)
      except AttributeError:
          try:
              size_match = re.search(size_pattern, model_name.lower())
@@ -58,7 +61,10 @@ def get_model_size(model_name, model_info):

  def get_model_type(leaderboard_data: List[dict]):
      for model_data in leaderboard_data:
-         request_files = os.path.join("eval-queue", model_data["model_name_for_query"] + "_eval_request_*" + ".json")
+         request_files = os.path.join(
+             "eval-queue",
+             model_data["model_name_for_query"] + "_eval_request_*" + ".json",
+         )
          request_files = glob.glob(request_files)

          # Select correct request file (precision)
@@ -70,9 +76,12 @@ def get_model_type(leaderboard_data: List[dict]):
          for tmp_request_file in request_files:
              with open(tmp_request_file, "r") as f:
                  req_content = json.load(f)
-                 if req_content["status"] == "FINISHED" and req_content["precision"] == model_data["Precision"].split(".")[-1]:
+                 if (
+                     req_content["status"] == "FINISHED"
+                     and req_content["precision"] == model_data["Precision"].split(".")[-1]
+                 ):
                      request_file = tmp_request_file
-
+
          if request_file == "":
              model_data[AutoEvalColumn.model_type.name] = ""
              model_data[AutoEvalColumn.model_type_symbol.name] = ""
@@ -81,30 +90,41 @@ def get_model_type(leaderboard_data: List[dict]):
          try:
              with open(request_file, "r") as f:
                  request = json.load(f)
-             is_delta = request["weight_type"] != "Original"
+                 request["weight_type"] != "Original"
          except Exception:
-             is_delta = False
+             pass

          try:
              with open(request_file, "r") as f:
                  request = json.load(f)
              model_type = model_type_from_str(request["model_type"])
              model_data[AutoEvalColumn.model_type.name] = model_type.value.name
-             model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol #+ ("🔺" if is_delta else "")
+             model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol  # + ("🔺" if is_delta else "")
          except KeyError:
              if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
-                 model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.name
-                 model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.symbol #+ ("🔺" if is_delta else "")
+                 model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
+                     model_data["model_name_for_query"]
+                 ].value.name
+                 model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
+                     model_data["model_name_for_query"]
+                 ].value.symbol  # + ("🔺" if is_delta else "")
              else:
                  model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
                  model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol

- def flag_models(leaderboard_data:List[dict]):
+
+ def flag_models(leaderboard_data: List[dict]):
      for model_data in leaderboard_data:
          if model_data["model_name_for_query"] in FLAGGED_MODELS:
              issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
-             issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
-             model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+             issue_link = model_hyperlink(
+                 FLAGGED_MODELS[model_data["model_name_for_query"]],
+                 f"See discussion #{issue_num}",
+             )
+             model_data[
+                 AutoEvalColumn.model.name
+             ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+

  def remove_forbidden_models(leaderboard_data: List[dict]):
      indices_to_remove = []
@@ -116,6 +136,7 @@ def remove_forbidden_models(leaderboard_data: List[dict]):
              leaderboard_data.pop(ix)
      return leaderboard_data

+
  def apply_metadata(leaderboard_data: List[dict]):
      leaderboard_data = remove_forbidden_models(leaderboard_data)
      get_model_type(leaderboard_data)
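
For reviewers, here is a minimal standalone sketch of the request-file lookup that the hunks above reformat: the model's glob pattern is expanded against the eval queue and only finished runs with a matching precision are kept. The function name, the `queue_dir` default, and the return convention are illustrative, not part of the repo's API.

```python
import glob
import json
import os


def find_request_file(model_name: str, precision: str, queue_dir: str = "eval-queue") -> str:
    # Hypothetical helper mirroring the logic in get_model_type().
    pattern = os.path.join(queue_dir, f"{model_name}_eval_request_*.json")
    for candidate in glob.glob(pattern):
        with open(candidate, "r") as f:
            req_content = json.load(f)
        # Keep only finished runs whose precision matches the leaderboard row.
        if req_content["status"] == "FINISHED" and req_content["precision"] == precision:
            return candidate
    return ""  # caller treats "" as "no usable request file"
```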
src/{auto_leaderboard → display_models}/model_metadata_flags.py RENAMED
@@ -8,5 +8,5 @@ FLAGGED_MODELS = {

  # Models which have been requested by orgs to not be submitted on the leaderboard
  DO_NOT_SUBMIT_MODELS = [
-     "Voicelab/trurl-2-13b", # trained on MMLU
-     ]
+     "Voicelab/trurl-2-13b",  # trained on MMLU
+ ]
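
A quick sketch of how such a do-not-submit list is consumed (compare `remove_forbidden_models` in the previous file); the list contents and function name below are stand-ins for illustration only.

```python
from typing import List

# Hypothetical stand-in for the real DO_NOT_SUBMIT_MODELS list.
DO_NOT_SUBMIT_MODELS = ["org/some-model"]


def drop_forbidden(rows: List[dict]) -> List[dict]:
    # Keep every leaderboard row whose query name is not on the list.
    return [row for row in rows if row["model_name_for_query"] not in DO_NOT_SUBMIT_MODELS]
```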
src/display_models/model_metadata_type.py ADDED
@@ -0,0 +1,550 @@
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Dict
+
+
+ @dataclass
+ class ModelInfo:
+     name: str
+     symbol: str  # emoji
+
+
+ class ModelType(Enum):
+     PT = ModelInfo(name="pretrained", symbol="🟢")
+     FT = ModelInfo(name="fine-tuned", symbol="🔶")
+     IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
+     RL = ModelInfo(name="RL-tuned", symbol="🟦")
+     Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
+
+     def to_str(self, separator=" "):
+         return f"{self.value.symbol}{separator}{self.value.name}"
+
+
+ MODEL_TYPE_METADATA: Dict[str, ModelType] = {
+     "notstoic/PygmalionCoT-7b": ModelType.IFT,
+     "aisquared/dlite-v1-355m": ModelType.IFT,
+     "aisquared/dlite-v1-1_5b": ModelType.IFT,
+     "aisquared/dlite-v1-774m": ModelType.IFT,
+     "aisquared/dlite-v1-124m": ModelType.IFT,
+     "aisquared/chopt-2_7b": ModelType.IFT,
+     "aisquared/dlite-v2-124m": ModelType.IFT,
+     "aisquared/dlite-v2-774m": ModelType.IFT,
+     "aisquared/dlite-v2-1_5b": ModelType.IFT,
+     "aisquared/chopt-1_3b": ModelType.IFT,
+     "aisquared/dlite-v2-355m": ModelType.IFT,
+     "augtoma/qCammel-13": ModelType.IFT,
+     "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
+     "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
+     "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
+     "TheBloke/tulu-7B-fp16": ModelType.IFT,
+     "TheBloke/guanaco-7B-HF": ModelType.FT,
+     "TheBloke/koala-7B-HF": ModelType.FT,
+     "TheBloke/wizardLM-7B-HF": ModelType.IFT,
+     "TheBloke/airoboros-13B-HF": ModelType.IFT,
+     "TheBloke/koala-13B-HF": ModelType.FT,
+     "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
+     "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
+     "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
+     "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
+     "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
+     "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
+     "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
+     "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
+     "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
+     "TheBloke/guanaco-13B-HF": ModelType.FT,
+     "TheBloke/guanaco-65B-HF": ModelType.FT,
+     "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
+     "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
+     "TheBloke/Llama-2-13B-fp16": ModelType.PT,
+     "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
+     "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
+     "TheBloke/Planner-7B-fp16": ModelType.IFT,
+     "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
+     "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
+     "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
+     "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
+     "TheBloke/tulu-13B-fp16": ModelType.IFT,
+     "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
+     "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
+     "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
+     "TheBloke/robin-13B-v2-fp16": ModelType.FT,
+     "TheBloke/robin-33B-v2-fp16": ModelType.FT,
+     "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
+     "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
+     "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
+     "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
+     "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
+     "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
+     "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
+     "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
+     "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
+     "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
+     "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
+     "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
+     "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
+     "concedo/OPT-19M-ChatSalad": ModelType.FT,
+     "concedo/Pythia-70M-ChatSalad": ModelType.FT,
+     "digitous/13B-HyperMantis": ModelType.IFT,
+     "digitous/Adventien-GPTJ": ModelType.FT,
+     "digitous/Alpacino13b": ModelType.IFT,
+     "digitous/GPT-R": ModelType.IFT,
+     "digitous/Javelin-R": ModelType.IFT,
+     "digitous/Javalion-GPTJ": ModelType.IFT,
+     "digitous/Javalion-R": ModelType.IFT,
+     "digitous/Skegma-GPTJ": ModelType.FT,
+     "digitous/Alpacino30b": ModelType.IFT,
+     "digitous/Janin-GPTJ": ModelType.FT,
+     "digitous/Janin-R": ModelType.FT,
+     "digitous/Javelin-GPTJ": ModelType.FT,
+     "SaylorTwift/gpt2_test": ModelType.PT,
+     "anton-l/gpt-j-tiny-random": ModelType.FT,
+     "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
+     "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
+     "Lazycuber/Janemalion-6B": ModelType.FT,
+     "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
+     "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
+     "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
+     "gpt2-medium": ModelType.PT,
+     "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
+     "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
+     "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
+     "PygmalionAI/pygmalion-6b": ModelType.FT,
+     "PygmalionAI/metharme-1.3b": ModelType.IFT,
+     "PygmalionAI/pygmalion-1.3b": ModelType.FT,
+     "PygmalionAI/pygmalion-350m": ModelType.FT,
+     "PygmalionAI/pygmalion-2.7b": ModelType.FT,
+     "medalpaca/medalpaca-7b": ModelType.FT,
+     "lilloukas/Platypus-30B": ModelType.IFT,
+     "lilloukas/GPlatty-30B": ModelType.FT,
+     "mncai/chatdoctor": ModelType.FT,
+     "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
+     "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
+     "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
+     "hakurei/instruct-12b": ModelType.IFT,
+     "hakurei/lotus-12B": ModelType.FT,
+     "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
+     "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
+     "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
+     "mosaicml/mpt-7b-instruct": ModelType.IFT,
+     "mosaicml/mpt-30b-chat": ModelType.IFT,
+     "mosaicml/mpt-7b-storywriter": ModelType.FT,
+     "mosaicml/mpt-30b-instruct": ModelType.IFT,
+     "mosaicml/mpt-7b-chat": ModelType.IFT,
+     "mosaicml/mpt-30b": ModelType.PT,
+     "Corianas/111m": ModelType.IFT,
+     "Corianas/Quokka_1.3b": ModelType.IFT,
+     "Corianas/256_5epoch": ModelType.FT,
+     "Corianas/Quokka_256m": ModelType.IFT,
+     "Corianas/Quokka_590m": ModelType.IFT,
+     "Corianas/gpt-j-6B-Dolly": ModelType.FT,
+     "Corianas/Quokka_2.7b": ModelType.IFT,
+     "cyberagent/open-calm-7b": ModelType.FT,
+     "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
+     "THUDM/chatglm2-6b": ModelType.IFT,
+     "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
+     "NYTK/PULI-GPTrio": ModelType.PT,
+     "EleutherAI/pythia-1.3b": ModelType.PT,
+     "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
+     "EleutherAI/gpt-neo-125m": ModelType.PT,
+     "EleutherAI/pythia-160m": ModelType.PT,
+     "EleutherAI/gpt-neo-2.7B": ModelType.PT,
+     "EleutherAI/pythia-1b-deduped": ModelType.PT,
+     "EleutherAI/pythia-6.7b": ModelType.PT,
+     "EleutherAI/pythia-70m-deduped": ModelType.PT,
+     "EleutherAI/gpt-neox-20b": ModelType.PT,
+     "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
+     "EleutherAI/pythia-2.7b": ModelType.PT,
+     "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
+     "EleutherAI/pythia-70m": ModelType.PT,
+     "EleutherAI/gpt-j-6b": ModelType.PT,
+     "EleutherAI/pythia-12b-deduped": ModelType.PT,
+     "EleutherAI/gpt-neo-1.3B": ModelType.PT,
+     "EleutherAI/pythia-410m-deduped": ModelType.PT,
+     "EleutherAI/pythia-160m-deduped": ModelType.PT,
+     "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
+     "EleutherAI/pythia-12b": ModelType.PT,
+     "roneneldan/TinyStories-33M": ModelType.PT,
+     "roneneldan/TinyStories-28M": ModelType.PT,
+     "roneneldan/TinyStories-1M": ModelType.PT,
+     "roneneldan/TinyStories-8M": ModelType.PT,
+     "roneneldan/TinyStories-3M": ModelType.PT,
+     "jerryjalapeno/nart-100k-7b": ModelType.FT,
+     "lmsys/vicuna-13b-v1.3": ModelType.IFT,
+     "lmsys/vicuna-7b-v1.3": ModelType.IFT,
+     "lmsys/vicuna-13b-v1.1": ModelType.IFT,
+     "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
+     "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
+     "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
+     "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
+     "Gryphe/MythoLogic-13b": ModelType.IFT,
+     "Gryphe/MythoBoros-13b": ModelType.IFT,
+     "pillowtalks-ai/delta13b": ModelType.FT,
+     "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
+     "bigscience/bloom-7b1": ModelType.PT,
+     "bigcode/tiny_starcoder_py": ModelType.PT,
+     "bigcode/starcoderplus": ModelType.FT,
+     "bigcode/gpt_bigcode-santacoder": ModelType.PT,
+     "bigcode/starcoder": ModelType.PT,
+     "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
+     "microsoft/DialoGPT-large": ModelType.FT,
+     "microsoft/DialoGPT-small": ModelType.FT,
+     "microsoft/DialoGPT-medium": ModelType.FT,
+     "microsoft/CodeGPT-small-py": ModelType.FT,
+     "Tincando/fiction_story_generator": ModelType.FT,
+     "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
+     "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
+     "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
+     "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
+     "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
+     "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
+     "illuin/test-custom-llama": ModelType.FT,
+     "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
+     "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
+     "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
+     "dvruette/llama-13b-pretrained": ModelType.PT,
+     "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
+     "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
+     "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
+     "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
+     "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
+     "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
+     "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
+     "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
+     "openlm-research/open_llama_7b": ModelType.PT,
+     "openlm-research/open_llama_7b_v2": ModelType.PT,
+     "openlm-research/open_llama_3b": ModelType.PT,
+     "openlm-research/open_llama_13b": ModelType.PT,
+     "openlm-research/open_llama_3b_v2": ModelType.PT,
+     "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
+     "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
+     "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
+     "databricks/dolly-v2-7b": ModelType.IFT,
+     "databricks/dolly-v2-3b": ModelType.IFT,
+     "databricks/dolly-v2-12b": ModelType.IFT,
+     "Rachneet/gpt2-xl-alpaca": ModelType.FT,
+     "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
+     "psyche/kogpt": ModelType.FT,
+     "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
+     "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
+     "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
+     "Fredithefish/CrimsonPajama": ModelType.IFT,
+     "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
+     "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
+     "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
+     "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
+     "eachadea/vicuna-13b-1.1": ModelType.FT,
+     "eachadea/vicuna-7b-1.1": ModelType.FT,
+     "eachadea/vicuna-13b": ModelType.FT,
+     "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
+     "openaccess-ai-collective/manticore-13b": ModelType.IFT,
+     "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
+     "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
+     "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
+     "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
+     "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
+     "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
+     "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
+     "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
+     "stabilityai/StableBeluga1-Delta": ModelType.IFT,
+     "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
+     "stabilityai/StableBeluga2": ModelType.IFT,
+     "stabilityai/StableBeluga-13B": ModelType.IFT,
+     "stabilityai/StableBeluga-7B": ModelType.IFT,
+     "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
+     "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
+     "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
+     "alibidaran/medical_transcription_generator": ModelType.FT,
+     "CalderaAI/30B-Lazarus": ModelType.IFT,
+     "CalderaAI/13B-BlueMethod": ModelType.IFT,
+     "CalderaAI/13B-Ouroboros": ModelType.IFT,
+     "KoboldAI/OPT-13B-Erebus": ModelType.FT,
+     "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
+     "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
+     "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
+     "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
+     "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
+     "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
+     "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
+     "KoboldAI/fairseq-dense-125M": ModelType.PT,
+     "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
+     "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
+     "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
+     "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
+     "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
+     "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
+     "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
+     "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
+     "KoboldAI/fairseq-dense-355M": ModelType.PT,
+     "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
+     "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
+     "KoboldAI/OPT-350M-Erebus": ModelType.FT,
+     "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
+     "KoboldAI/OPT-30B-Erebus": ModelType.FT,
+     "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
+     "klosax/open_llama_3b_350bt_preview": ModelType.PT,
+     "klosax/openllama-3b-350bt": ModelType.PT,
+     "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
+     "klosax/open_llama_13b_600bt_preview": ModelType.PT,
+     "klosax/open_llama_7b_400bt_preview": ModelType.PT,
+     "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
+     "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
+     "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
+     "TFLai/gpt2-turkish-uncased": ModelType.FT,
+     "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
+     "ehartford/dolphin-llama-13b": ModelType.IFT,
+     "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
+     "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
+     "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
+     "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
+     "ehartford/based-30b": ModelType.FT,
+     "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
+     "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
+     "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
+     "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
+     "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
+     "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
+     "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
+     "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
+     "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
+     "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
+     "junelee/wizard-vicuna-13b": ModelType.FT,
+     "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
+     "BreadAi/MuseCan": ModelType.PT,
+     "BreadAi/MusePy-1-2": ModelType.PT,
+     "BreadAi/DiscordPy": ModelType.PT,
+     "BreadAi/PM_modelV2": ModelType.PT,
+     "BreadAi/gpt-Youtube": ModelType.PT,
+     "BreadAi/StoryPy": ModelType.FT,
+     "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
+     "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
+     "AGI-inc/lora_moe_7b": ModelType.FT,
+     "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
+     "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
+     "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
+     "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
+     "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
+     "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
+     "Writer/camel-5b-hf": ModelType.IFT,
+     "Writer/palmyra-base": ModelType.PT,
+     "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
+     "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
+     "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
+     "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
+     "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
+     "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
+     "MBZUAI/lamini-neo-125m": ModelType.IFT,
+     "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
+     "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
+     "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
+     "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
+     "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
+     "TehVenom/Dolly_Malion-6b": ModelType.FT,
+     "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
+     "TehVenom/ChanMalion": ModelType.FT,
+     "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
+     "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
+     "TehVenom/Metharme-13b-Merged": ModelType.IFT,
+     "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
+     "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
+     "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
+     "vicgalle/gpt2-alpaca": ModelType.IFT,
+     "vicgalle/alpaca-7b": ModelType.FT,
+     "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
+     "facebook/opt-350m": ModelType.PT,
+     "facebook/opt-125m": ModelType.PT,
+     "facebook/xglm-4.5B": ModelType.PT,
+     "facebook/opt-2.7b": ModelType.PT,
+     "facebook/opt-6.7b": ModelType.PT,
+     "facebook/galactica-30b": ModelType.PT,
+     "facebook/opt-13b": ModelType.PT,
+     "facebook/opt-66b": ModelType.PT,
+     "facebook/xglm-7.5B": ModelType.PT,
+     "facebook/xglm-564M": ModelType.PT,
+     "facebook/opt-30b": ModelType.PT,
+     "golaxy/gogpt-7b": ModelType.FT,
+     "golaxy/gogpt2-7b": ModelType.FT,
+     "golaxy/gogpt-7b-bloom": ModelType.FT,
+     "golaxy/gogpt-3b-bloom": ModelType.FT,
+     "psmathur/orca_mini_v2_7b": ModelType.IFT,
+     "psmathur/orca_mini_7b": ModelType.IFT,
+     "psmathur/orca_mini_3b": ModelType.IFT,
+     "psmathur/orca_mini_v2_13b": ModelType.IFT,
+     "gpt2-xl": ModelType.PT,
+     "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
+     "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
+     "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
+     "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
+     "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
+     "jzjiao/opt-1.3b-rlhf": ModelType.FT,
+     "HuggingFaceH4/starchat-beta": ModelType.IFT,
+     "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
+     "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
+     "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
+     "openchat/openchat_8192": ModelType.IFT,
+     "openchat/openchat_v2": ModelType.IFT,
+     "openchat/openchat_v2_w": ModelType.IFT,
+     "ausboss/llama-13b-supercot": ModelType.IFT,
+     "ausboss/llama-30b-supercot": ModelType.IFT,
+     "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
+     "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
+     "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
+     "victor123/WizardLM-13B-1.0": ModelType.IFT,
+     "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
+     "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
+     "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
+     "baichuan-inc/Baichuan-7B": ModelType.PT,
+     "tiiuae/falcon-40b-instruct": ModelType.IFT,
+     "tiiuae/falcon-40b": ModelType.PT,
+     "tiiuae/falcon-7b": ModelType.PT,
+     "YeungNLP/firefly-llama-13b": ModelType.FT,
+     "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
+     "YeungNLP/firefly-llama2-13b": ModelType.FT,
+     "YeungNLP/firefly-ziya-13b": ModelType.FT,
+     "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
+     "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
+     "xzuyn/MedicWizard-7B": ModelType.FT,
+     "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
+     "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
+     "beomi/llama-2-ko-7b": ModelType.IFT,
+     "Salesforce/codegen-6B-multi": ModelType.PT,
+     "Salesforce/codegen-16B-nl": ModelType.PT,
+     "Salesforce/codegen-6B-nl": ModelType.PT,
+     "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
+     "gpt2-large": ModelType.PT,
+     "frank098/orca_mini_3b_juniper": ModelType.FT,
+     "frank098/WizardLM_13B_juniper": ModelType.FT,
+     "FPHam/Free_Sydney_13b_HF": ModelType.FT,
+     "huggingface/llama-13b": ModelType.PT,
+     "huggingface/llama-7b": ModelType.PT,
+     "huggingface/llama-65b": ModelType.PT,
+     "huggingface/llama-30b": ModelType.PT,
+     "Henk717/chronoboros-33B": ModelType.IFT,
+     "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
+     "jondurbin/airoboros-7b": ModelType.IFT,
+     "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
+     "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
+     "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
+     "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
+     "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
+     "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
+     "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
+     "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
+     "jondurbin/airoboros-13b": ModelType.IFT,
+     "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
+     "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
+     "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
+     "ariellee/SuperPlatty-30B": ModelType.IFT,
+     "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
+     "cerebras/Cerebras-GPT-256M": ModelType.PT,
+     "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
+     "cerebras/Cerebras-GPT-13B": ModelType.PT,
+     "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
+     "cerebras/Cerebras-GPT-111M": ModelType.PT,
+     "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
+     "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
+     "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
+     "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
+     "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
+     "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
+     "NousResearch/Nous-Hermes-13b": ModelType.IFT,
+     "project-baize/baize-v2-7b": ModelType.IFT,
+     "project-baize/baize-v2-13b": ModelType.IFT,
+     "LLMs/WizardLM-13B-V1.0": ModelType.FT,
+     "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
+     "wenge-research/yayi-7b": ModelType.FT,
+     "wenge-research/yayi-7b-llama2": ModelType.FT,
+     "wenge-research/yayi-13b-llama2": ModelType.FT,
+     "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
+     "llama-anon/instruct-13b": ModelType.IFT,
+     "huggingtweets/jerma985": ModelType.FT,
+     "huggingtweets/gladosystem": ModelType.FT,
+     "huggingtweets/bladeecity-jerma985": ModelType.FT,
+     "huggyllama/llama-13b": ModelType.PT,
+     "huggyllama/llama-65b": ModelType.PT,
+     "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
+     "upstage/Llama-2-70b-instruct": ModelType.IFT,
+     "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
+     "upstage/llama-65b-instruct": ModelType.IFT,
+     "upstage/llama-30b-instruct-2048": ModelType.IFT,
+     "upstage/llama-30b-instruct": ModelType.IFT,
+     "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
+     "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
+     "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
+     "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
+     "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
+     "gpt2": ModelType.PT,
+     "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
+     "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
+     "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
+     "quantumaikr/KoreanLM-hf": ModelType.FT,
+     "quantumaikr/open_llama_7b_hf": ModelType.FT,
+     "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
+     "MayaPH/FinOPT-Lincoln": ModelType.FT,
+     "MayaPH/FinOPT-Franklin": ModelType.FT,
+     "MayaPH/GodziLLa-30B": ModelType.IFT,
+     "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
+     "MayaPH/FinOPT-Washington": ModelType.FT,
+     "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
+     "layoric/llama-2-13b-code-alpaca": ModelType.FT,
+     "CobraMamba/mamba-gpt-3b": ModelType.FT,
+     "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
+     "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
+     "timdettmers/guanaco-33b-merged": ModelType.FT,
+     "elinas/chronos-33b": ModelType.IFT,
+     "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
+     "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
+     "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
+     "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
+     "meta-llama/Llama-2-7b-hf": ModelType.PT,
+     "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
+     "meta-llama/Llama-2-13b-hf": ModelType.PT,
+     "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
+     "meta-llama/Llama-2-70b-hf": ModelType.PT,
+     "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
+     "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
+     "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
+     "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
+     "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
+     "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
+     "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
+     "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
+     "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
+     "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
+     "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
+     "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
+     "bofenghuang/vigogne-13b-chat": ModelType.FT,
+     "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
+     "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
+     "bofenghuang/vigogne-7b-chat": ModelType.FT,
+     "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
+     "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
+     "ewof/koishi-instruct-3b": ModelType.IFT,
+     "gywy/llama2-13b-chinese-v1": ModelType.FT,
+     "GOAT-AI/GOAT-7B-Community": ModelType.FT,
+     "psyche/kollama2-7b": ModelType.FT,
+     "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
+     "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
+     "augtoma/qCammel-70-x": ModelType.IFT,
+     "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
+     "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
+     "64bits/LexPodLM-13B": ModelType.FT,
+ }
+
+
+ def model_type_from_str(type):
+     if "fine-tuned" in type or "🔶" in type:
+         return ModelType.FT
+     if "pretrained" in type or "🟢" in type:
+         return ModelType.PT
+     if "RL-tuned" in type or "🟦" in type:
+         return ModelType.RL
+     if "instruction-tuned" in type or "⭕" in type:
+         return ModelType.IFT
+     return ModelType.Unknown
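
A quick usage check for the new module, assuming the import path shown in the diff above; `to_str()` renders the emoji plus the human-readable name used in the table, and `model_type_from_str()` is the inverse applied to request files.

```python
from src.display_models.model_metadata_type import ModelType, model_type_from_str

# to_str() joins the symbol and name with the given separator (default " ").
assert ModelType.PT.to_str() == "🟢 pretrained"

# model_type_from_str() accepts either the name or the emoji,
# and falls back to Unknown for anything it does not recognize.
assert model_type_from_str("🔶") is ModelType.FT
assert model_type_from_str("something else") is ModelType.Unknown
```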
src/{auto_leaderboard/load_results.py → display_models/read_results.py} RENAMED
@@ -1,14 +1,13 @@
-from dataclasses import dataclass
-
-import glob
 import json
 import os
+from dataclasses import dataclass
 from typing import Dict, List, Tuple
-import dateutil
 
-from src.utils_display import AutoEvalColumn, make_clickable_model
+import dateutil
 import numpy as np
 
+from src.display_models.utils import AutoEvalColumn, make_clickable_model
+
 METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
 BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 BENCH_TO_NAME = {
@@ -31,13 +30,15 @@ class EvalResult:
     weight_type: str = ""
 
     def to_dict(self):
+        from src.load_from_hub import is_model_on_hub
+
         if self.org is not None:
             base_model = f"{self.org}/{self.model}"
         else:
             base_model = f"{self.model}"
         data_dict = {}
 
-        data_dict["eval_name"] = self.eval_name # not a column, just a save name
+        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
         data_dict["weight_type"] = self.weight_type  # not a column, just a save name
         data_dict[AutoEvalColumn.precision.name] = self.precision
         data_dict[AutoEvalColumn.model_type.name] = self.model_type
@@ -45,6 +46,9 @@ class EvalResult:
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
         data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
+        data_dict[AutoEvalColumn.still_on_hub.name] = (
+            is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
+        )
 
         for benchmark in BENCHMARKS:
             if benchmark not in self.results.keys():
@@ -60,10 +64,9 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     with open(json_filepath) as fp:
         data = json.load(fp)
 
-
     for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
         if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-            return None, [] # we skip models with the wrong version
+            return None, []  # we skip models with the wrong version
 
     try:
         config = data["config"]
@@ -87,17 +90,24 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     else:
         org = model_split[0]
         model = model_split[1]
-    result_key = f"{org}_{model}_{model_sha}_{precision}"
+    result_key = f"{org}_{model}_{model_sha}_{precision}"
 
     eval_results = []
     for benchmark, metric in zip(BENCHMARKS, METRICS):
-        accs = np.array([v.get(metric, 0) for k, v in data["results"].items() if benchmark in k])
-        if accs.size == 0:
+        accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+        if accs.size == 0 or any([acc is None for acc in accs]):
            continue
        mean_acc = np.mean(accs) * 100.0
-        eval_results.append(EvalResult(
-            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, precision=precision, #todo model_type=, weight_type=
-        ))
+        eval_results.append(
+            EvalResult(
+                eval_name=result_key,
+                org=org,
+                model=model,
+                revision=model_sha,
+                results={benchmark: mean_acc},
+                precision=precision,  # todo model_type=, weight_type=
+            )
+        )
 
     return result_key, eval_results
@@ -113,11 +123,11 @@ def get_eval_results() -> List[EvalResult]:
         # Sort the files by date
         # store results by precision maybe?
         try:
-            files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
+            files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
-        #up_to_date = files[-1]
+        # up_to_date = files[-1]
         for file in files:
             json_filepaths.append(os.path.join(root, file))
 
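
Note on the `accs` change above: switching the `v.get` default from `0` to `None` means a benchmark whose metric is absent from the results file is now skipped instead of being averaged in as zero. A standalone sketch with made-up result values:

    import numpy as np

    results = {"harness|arc:challenge|25": {}}  # metric missing for this benchmark
    accs = np.array([v.get("acc_norm", None) for k, v in results.items() if "arc" in k])
    if accs.size == 0 or any(acc is None for acc in accs):
        print("benchmark skipped")  # the old default of 0 would have reported a mean of 0.0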
src/{utils_display.py → display_models/utils.py} RENAMED
@@ -1,24 +1,27 @@
 import os
 from dataclasses import dataclass
+
 from huggingface_hub import HfApi
 
 API = HfApi()
 
 
-# These classes are for user facing column names, to avoid having to change them
-# all around the code when a modif is needed
+# These classes are for user facing column names, to avoid having to change them
+# all around the code when a modif is needed
 @dataclass
 class ColumnContent:
     name: str
-    type: str
-    displayed_by_default: bool
+    type: str
+    displayed_by_default: bool
     hidden: bool = False
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
+
 @dataclass(frozen=True)
-class AutoEvalColumn: # Auto evals column
+class AutoEvalColumn:  # Auto evals column
     model_type_symbol = ColumnContent("T", "str", True)
     model = ColumnContent("Model", "markdown", True)
     average = ColumnContent("Average ⬆️", "number", True)
@@ -27,15 +30,19 @@ class AutoEvalColumn: # Auto evals column
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
     model_type = ColumnContent("Type", "str", False)
-    precision = ColumnContent("Precision", "str", False) #, True)
+    precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
+    still_on_hub = ColumnContent("Available on the hub", "bool", False)
     revision = ColumnContent("Model sha", "str", False, False)
-    dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
+    dummy = ColumnContent(
+        "model_name_for_query", "str", True
+    )  # dummy col to implement search bar (hidden by custom CSS)
+
 
 @dataclass(frozen=True)
-class EloEvalColumn: # Elo evals column
+class EloEvalColumn:  # Elo evals column
     model = ColumnContent("Model", "markdown", True)
     gpt4 = ColumnContent("GPT-4 (all)", "number", True)
     human_all = ColumnContent("Human (all)", "number", True)
@@ -44,7 +51,7 @@ class EloEvalColumn: # Elo evals column
 
 
 @dataclass(frozen=True)
-class EvalQueueColumn: # Queue column
+class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
@@ -52,7 +59,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
-LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+
+LLAMAS = [
+    "huggingface/llama-7b",
+    "huggingface/llama-13b",
+    "huggingface/llama-30b",
+    "huggingface/llama-65b",
+]
 
 
 KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
@@ -90,29 +103,44 @@ def make_clickable_model(model_name):
     elif model_name == "oasst-12b":
         link = OASST_LINK
 
-    details_model_name = model_name.replace('/', '__')
+    details_model_name = model_name.replace("/", "__")
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
 
     if not bool(os.getenv("DEBUG", "False")):
         # We only add these checks when not debugging, as they are extremely slow
         print(f"details_link: {details_link}")
         try:
-            check_path = list(API.list_files_info(repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                paths="README.md",
-                repo_type="dataset"))
+            check_path = list(
+                API.list_files_info(
+                    repo_id=f"open-llm-leaderboard/details_{details_model_name}",
+                    paths="README.md",
+                    repo_type="dataset",
+                )
+            )
             print(f"check_path: {check_path}")
         except Exception as err:
             # No details repo for this model
             print(f"No details repo for this model: {err}")
             return model_hyperlink(link, model_name)
 
-    return model_hyperlink(link, model_name) + ' ' + model_hyperlink(details_link, "📑")
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
+
 
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
+
 def styled_warning(warn):
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
 
+
 def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
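
The two NaN helpers added at the bottom return boolean masks over the given columns and are used to split complete rows from incomplete ones. A quick sketch on a toy DataFrame (column names illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"ARC": [25.0, np.nan], "HellaSwag": [81.0, 79.0]})
    complete = df[has_no_nan_values(df, ["ARC", "HellaSwag"])]  # keeps only the first row
    incomplete = df[has_nan_values(df, ["ARC", "HellaSwag"])]   # keeps only the second row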
src/init.py DELETED
@@ -1,51 +0,0 @@
-import os
-from huggingface_hub import Repository
-
-
-def get_all_requested_models(requested_models_dir):
-    depth = 1
-    file_names = []
-
-    for root, dirs, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            file_names.extend([os.path.join(root, file) for file in files])
-
-    return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
-
-def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
-    eval_queue_repo = None
-    eval_results_repo = None
-    requested_models = None
-
-    print("Pulling evaluation requests and results.")
-
-    eval_queue_repo = Repository(
-        local_dir=QUEUE_PATH,
-        clone_from=QUEUE_REPO,
-        repo_type="dataset",
-    )
-    eval_queue_repo.git_pull()
-
-    eval_results_repo = Repository(
-        local_dir=RESULTS_PATH,
-        clone_from=RESULTS_REPO,
-        repo_type="dataset",
-    )
-    eval_results_repo.git_pull()
-
-    requested_models = get_all_requested_models("eval-queue")
-
-    return eval_queue_repo, requested_models, eval_results_repo
-
-
-#def load_results(model, benchmark, metric):
-#    file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
-#    if not os.path.exists(file_path):
-#        return 0.0, None
-
-#    with open(file_path) as fp:
-#        data = json.load(fp)
-#    accs = np.array([v[metric] for k, v in data["results"].items()])
-#    mean_acc = np.mean(accs)
-#    return mean_acc, data["config"]["model_args"]
src/load_from_hub.py ADDED
@@ -0,0 +1,139 @@
+import json
+import os
+
+import pandas as pd
+from huggingface_hub import Repository
+from transformers import AutoConfig
+
+from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
+from src.display_models.get_model_metadata import apply_metadata
+from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
+from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
+
+IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+
+
+def get_all_requested_models(requested_models_dir: str) -> set[str]:
+    depth = 1
+    file_names = []
+
+    for root, _, files in os.walk(requested_models_dir):
+        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
+        if current_depth == depth:
+            file_names.extend([os.path.join(root, file) for file in files])
+
+    return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
+
+
+def load_all_info_from_hub(QUEUE_REPO: str, RESULTS_REPO: str, QUEUE_PATH: str, RESULTS_PATH: str) -> list[Repository]:
+    eval_queue_repo = None
+    eval_results_repo = None
+    requested_models = None
+
+    print("Pulling evaluation requests and results.")
+
+    eval_queue_repo = Repository(
+        local_dir=QUEUE_PATH,
+        clone_from=QUEUE_REPO,
+        repo_type="dataset",
+    )
+    # eval_queue_repo.git_pull()
+
+    eval_results_repo = Repository(
+        local_dir=RESULTS_PATH,
+        clone_from=RESULTS_REPO,
+        repo_type="dataset",
+    )
+    # eval_results_repo.git_pull()
+
+    requested_models = get_all_requested_models("eval-queue")
+
+    return eval_queue_repo, requested_models, eval_results_repo
+
+
+def get_leaderboard_df(
+    eval_results: Repository, eval_results_private: Repository, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
+    if eval_results:
+        print("Pulling evaluation results for the leaderboard.")
+        # eval_results.git_pull()
+    if eval_results_private:
+        print("Pulling evaluation results for the leaderboard.")
+        # eval_results_private.git_pull()
+
+    all_data = get_eval_results_dicts()
+
+    # if not IS_PUBLIC:
+    all_data.append(gpt4_values)
+    all_data.append(gpt35_values)
+
+    all_data.append(baseline)
+    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
+
+    df = pd.DataFrame.from_records(all_data)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+def get_evaluation_queue_df(
+    eval_queue: Repository, eval_queue_private: Repository, save_path: str, cols: list
+) -> list[pd.DataFrame]:
+    if eval_queue:
+        print("Pulling changes for the evaluation queue.")
+        # eval_queue.git_pull()
+    if eval_queue_private:
+        print("Pulling changes for the evaluation queue.")
+        # eval_queue_private.git_pull()
+
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+    return df_finished[cols], df_running[cols], df_pending[cols]
+
+
+def is_model_on_hub(model_name: str, revision: str) -> bool:
+    try:
+        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
+        return True, None
+
+    except ValueError:
+        return (
+            False,
+            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+        )
+
+    except Exception as e:
+        print(f"Could not get the model config from the hub.: {e}")
+        return False, "was not found on hub!"