Clémentine committed on
Commit
056eba8
·
1 Parent(s): 92d36c1

merge refactor

Browse files
.gitignore CHANGED
@@ -1,9 +1,10 @@
1
- evals/
2
  venv/
3
  __pycache__/
4
  .env
5
  .ipynb_checkpoints
6
  *ipynb
 
7
 
8
  gpt_4_evals/
9
  human_evals/
 
1
+ auto_evals/
2
  venv/
3
  __pycache__/
4
  .env
5
  .ipynb_checkpoints
6
  *ipynb
7
+ .vscode/
8
 
9
  gpt_4_evals/
10
  human_evals/
app.py CHANGED
@@ -7,19 +7,25 @@ import gradio as gr
7
  import numpy as np
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
- from huggingface_hub import HfApi, Repository
11
  from transformers import AutoConfig
12
 
13
- from content import *
14
- from elo_utils import get_elo_plots, get_elo_results_dicts
15
- from utils import get_eval_results_dicts, make_clickable_model, get_window_url_params
 
 
 
 
 
16
 
17
  # clone / pull the lmeh eval data
18
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
19
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
20
  HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
21
  GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
22
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
 
23
 
24
  api = HfApi()
25
 
@@ -29,113 +35,25 @@ def restart_space():
29
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
30
  )
31
 
 
32
 
33
- def get_all_requested_models(requested_models_dir):
34
- depth = 1
35
- file_names = []
 
36
 
37
- for root, dirs, files in os.walk(requested_models_dir):
38
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
39
- if current_depth == depth:
40
- file_names.extend([os.path.join(root, file) for file in files])
41
-
42
- return set([file_name.lower().split("./evals/")[1] for file_name in file_names])
43
-
44
-
45
- repo = None
46
- requested_models = None
47
- if H4_TOKEN:
48
- print("Pulling evaluation requests and results.")
49
- # try:
50
- # shutil.rmtree("./evals/")
51
- # except:
52
- # pass
53
-
54
- repo = Repository(
55
- local_dir="./evals/",
56
- clone_from=LMEH_REPO,
57
- use_auth_token=H4_TOKEN,
58
- repo_type="dataset",
59
- )
60
- repo.git_pull()
61
-
62
- requested_models_dir = "./evals/eval_requests"
63
- requested_models = get_all_requested_models(requested_models_dir)
64
-
65
- human_eval_repo = None
66
- if H4_TOKEN and not os.path.isdir("./human_evals"):
67
- print("Pulling human evaluation repo")
68
- human_eval_repo = Repository(
69
- local_dir="./human_evals/",
70
- clone_from=HUMAN_EVAL_REPO,
71
- use_auth_token=H4_TOKEN,
72
- repo_type="dataset",
73
- )
74
- human_eval_repo.git_pull()
75
-
76
- gpt_4_eval_repo = None
77
- if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
78
- print("Pulling GPT-4 evaluation repo")
79
- gpt_4_eval_repo = Repository(
80
- local_dir="./gpt_4_evals/",
81
- clone_from=GPT_4_EVAL_REPO,
82
- use_auth_token=H4_TOKEN,
83
- repo_type="dataset",
84
- )
85
- gpt_4_eval_repo.git_pull()
86
-
87
- # parse the results
88
- BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
89
- METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
90
-
91
-
92
- def load_results(model, benchmark, metric):
93
- file_path = os.path.join("evals", model, f"{model}-eval_{benchmark}.json")
94
- if not os.path.exists(file_path):
95
- return 0.0, None
96
 
97
- with open(file_path) as fp:
98
- data = json.load(fp)
99
- accs = np.array([v[metric] for k, v in data["results"].items()])
100
- mean_acc = np.mean(accs)
101
- return mean_acc, data["config"]["model_args"]
102
 
 
103
 
104
- COLS = [
105
- "Model",
106
- "Revision",
107
- "Average ⬆️",
108
- "ARC (25-shot) ⬆️",
109
- "HellaSwag (10-shot) ⬆️",
110
- "MMLU (5-shot) ⬆️",
111
- "TruthfulQA (0-shot) ⬆️",
112
- "model_name_for_query", # dummy column to implement search bar (hidden by custom CSS)
113
- ]
114
- TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str"]
115
-
116
- if not IS_PUBLIC:
117
- COLS.insert(2, "8bit")
118
- TYPES.insert(2, "bool")
119
-
120
- EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
121
- EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
122
-
123
- BENCHMARK_COLS = [
124
- "ARC (25-shot) ⬆️",
125
- "HellaSwag (10-shot) ⬆️",
126
- "MMLU (5-shot) ⬆️",
127
- "TruthfulQA (0-shot) ⬆️",
128
- ]
129
-
130
- ELO_COLS = [
131
- "Model",
132
- "GPT-4 (all)",
133
- "Human (all)",
134
- "Human (instruct)",
135
- "Human (code-instruct)",
136
- ]
137
- ELO_TYPES = ["markdown", "number", "number", "number", "number"]
138
- ELO_SORT_COL = "GPT-4 (all)"
139
 
140
 
141
  def has_no_nan_values(df, columns):
@@ -147,54 +65,21 @@ def has_nan_values(df, columns):
147
 
148
 
149
  def get_leaderboard_df():
150
- if repo:
151
  print("Pulling evaluation results for the leaderboard.")
152
- repo.git_pull()
153
 
154
  all_data = get_eval_results_dicts(IS_PUBLIC)
155
 
156
  if not IS_PUBLIC:
157
- gpt4_values = {
158
- "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
159
- "Revision": "tech report",
160
- "8bit": None,
161
- "Average ⬆️": 84.3,
162
- "ARC (25-shot) ⬆️": 96.3,
163
- "HellaSwag (10-shot) ⬆️": 95.3,
164
- "MMLU (5-shot) ⬆️": 86.4,
165
- "TruthfulQA (0-shot) ⬆️": 59.0,
166
- "model_name_for_query": "GPT-4",
167
- }
168
  all_data.append(gpt4_values)
169
- gpt35_values = {
170
- "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
171
- "Revision": "tech report",
172
- "8bit": None,
173
- "Average ⬆️": 71.9,
174
- "ARC (25-shot) ⬆️": 85.2,
175
- "HellaSwag (10-shot) ⬆️": 85.5,
176
- "MMLU (5-shot) ⬆️": 70.0,
177
- "TruthfulQA (0-shot) ⬆️": 47.0,
178
- "model_name_for_query": "GPT-3.5",
179
- }
180
  all_data.append(gpt35_values)
181
 
182
- base_line = {
183
- "Model": "<p>Baseline</p>",
184
- "Revision": "N/A",
185
- "8bit": None,
186
- "Average ⬆️": 25.0,
187
- "ARC (25-shot) ⬆️": 25.0,
188
- "HellaSwag (10-shot) ⬆️": 25.0,
189
- "MMLU (5-shot) ⬆️": 25.0,
190
- "TruthfulQA (0-shot) ⬆️": 25.0,
191
- "model_name_for_query": "baseline",
192
- }
193
-
194
- all_data.append(base_line)
195
 
196
  df = pd.DataFrame.from_records(all_data)
197
- df = df.sort_values(by=["Average ⬆️"], ascending=False)
198
  df = df[COLS]
199
 
200
  # filter out if any of the benchmarks have not been produced
@@ -203,20 +88,21 @@ def get_leaderboard_df():
203
 
204
 
205
  def get_evaluation_queue_df():
206
- if repo:
 
207
  print("Pulling changes for the evaluation queue.")
208
- repo.git_pull()
209
 
210
  entries = [
211
  entry
212
- for entry in os.listdir("evals/eval_requests")
213
  if not entry.startswith(".")
214
  ]
215
  all_evals = []
216
 
217
  for entry in entries:
218
  if ".json" in entry:
219
- file_path = os.path.join("evals/eval_requests", entry)
220
  with open(file_path) as fp:
221
  data = json.load(fp)
222
 
@@ -229,11 +115,11 @@ def get_evaluation_queue_df():
229
  # this is a folder
230
  sub_entries = [
231
  e
232
- for e in os.listdir(f"evals/eval_requests/{entry}")
233
  if not e.startswith(".")
234
  ]
235
  for sub_entry in sub_entries:
236
- file_path = os.path.join("evals/eval_requests", entry, sub_entry)
237
  with open(file_path) as fp:
238
  data = json.load(fp)
239
 
@@ -305,13 +191,15 @@ leaderboard_df = original_df.copy()
305
 
306
  def is_model_on_hub(model_name, revision) -> bool:
307
  try:
308
- config = AutoConfig.from_pretrained(model_name, revision=revision)
309
- return True
 
 
 
310
 
311
  except Exception as e:
312
- print("Could not get the model config from the hub.")
313
- print(e)
314
- return False
315
 
316
 
317
  def add_new_eval(
@@ -327,14 +215,15 @@ def add_new_eval(
327
  # check the model actually exists before adding the eval
328
  if revision == "":
329
  revision = "main"
330
- if is_delta_weight and not is_model_on_hub(base_model, revision):
331
- error_message = f'Base model "{base_model}" was not found on hub!'
332
- print(error_message)
333
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error_message}</p>"
334
 
335
- if not is_model_on_hub(model, revision):
336
- error_message = f'Model "{model}"was not found on hub!'
337
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error_message}</p>"
 
 
 
 
 
338
 
339
  print("adding new eval")
340
 
@@ -355,14 +244,13 @@ def add_new_eval(
355
  user_name = model.split("/")[0]
356
  model_path = model.split("/")[1]
357
 
358
- OUT_DIR = f"eval_requests/{user_name}"
359
  os.makedirs(OUT_DIR, exist_ok=True)
360
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
361
 
362
  # Check for duplicate submission
363
- if out_path.lower() in requested_models:
364
- duplicate_request_message = "This model has been already submitted."
365
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
366
 
367
  with open(out_path, "w") as f:
368
  f.write(json.dumps(eval_entry))
@@ -375,8 +263,7 @@ def add_new_eval(
375
  repo_type="dataset",
376
  )
377
 
378
- success_message = "Your request has been submitted to the evaluation queue!"
379
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{success_message}</p>"
380
 
381
 
382
  def refresh():
@@ -395,7 +282,7 @@ def refresh():
395
 
396
 
397
  def search_table(df, query):
398
- filtered_df = df[df["model_name_for_query"].str.contains(query, case=False)]
399
  return filtered_df
400
 
401
 
@@ -413,83 +300,6 @@ def change_tab(query_param):
413
  return gr.Tabs.update(selected=0)
414
 
415
 
416
- custom_css = """
417
- #changelog-text {
418
- font-size: 16px !important;
419
- }
420
-
421
- #changelog-text h2 {
422
- font-size: 18px !important;
423
- }
424
-
425
- .markdown-text {
426
- font-size: 16px !important;
427
- }
428
-
429
- #models-to-add-text {
430
- font-size: 18px !important;
431
- }
432
-
433
- #citation-button span {
434
- font-size: 16px !important;
435
- }
436
-
437
- #citation-button textarea {
438
- font-size: 16px !important;
439
- }
440
-
441
- #citation-button > label > button {
442
- margin: 6px;
443
- transform: scale(1.3);
444
- }
445
-
446
- #leaderboard-table {
447
- margin-top: 15px
448
- }
449
-
450
- #search-bar-table-box > div:first-child {
451
- background: none;
452
- border: none;
453
- }
454
-
455
- #search-bar {
456
- padding: 0px;
457
- width: 30%;
458
- }
459
-
460
- /* Hides the final column */
461
- #llm-benchmark-tab-table table td:last-child,
462
- #llm-benchmark-tab-table table th:last-child {
463
- display: none;
464
- }
465
-
466
- /* Limit the width of the first column so that names don't expand too much */
467
- table td:first-child,
468
- table th:first-child {
469
- max-width: 400px;
470
- overflow: auto;
471
- white-space: nowrap;
472
- }
473
-
474
- .tab-buttons button {
475
- font-size: 20px;
476
- }
477
-
478
- #scale-logo {
479
- border-style: none !important;
480
- box-shadow: none;
481
- display: block;
482
- margin-left: auto;
483
- margin-right: auto;
484
- max-width: 600px;
485
- }
486
-
487
- #scale-logo .download {
488
- display: none;
489
- }
490
- """
491
-
492
-
493
  demo = gr.Blocks(css=custom_css)
494
  with demo:
495
  gr.HTML(TITLE)
@@ -518,30 +328,52 @@ with demo:
518
  show_label=False,
519
  elem_id="search-bar",
520
  )
521
-
522
- leaderboard_table = gr.components.Dataframe(
523
- value=leaderboard_df,
524
- headers=COLS,
525
- datatype=TYPES,
526
- max_rows=5,
527
- elem_id="leaderboard-table",
528
- )
 
 
 
 
 
 
 
 
 
529
 
530
  # Dummy leaderboard for handling the case when the user uses backspace key
531
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
532
  value=original_df,
533
  headers=COLS,
534
  datatype=TYPES,
535
- max_rows=5,
536
  visible=False,
537
  )
538
-
539
  search_bar.submit(
540
  search_table,
541
  [hidden_leaderboard_table_for_search, search_bar],
542
  leaderboard_table,
543
  )
544
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  with gr.Row():
546
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
547
 
@@ -625,7 +457,7 @@ with demo:
625
  gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
626
  with gr.Column(scale=1):
627
  gr.Image(
628
- "scale-hf-logo.png", elem_id="scale-logo", show_label=False
629
  )
630
  gr.Markdown("## No tie allowed")
631
  elo_leaderboard_table = gr.components.Dataframe(
@@ -660,22 +492,23 @@ with demo:
660
  tabs,
661
  _js=get_window_url_params,
662
  )
663
- # with gr.Box():
664
- # visualization_title = gr.HTML(VISUALIZATION_TITLE)
665
- # with gr.Row():
666
- # with gr.Column():
667
- # gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
668
- # plot_1 = gr.Plot(plot_1, show_label=False)
669
- # with gr.Column():
670
- # gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
671
- # plot_2 = gr.Plot(plot_2, show_label=False)
672
- # with gr.Row():
673
- # with gr.Column():
674
- # gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
675
- # plot_3 = gr.Plot(plot_3, show_label=False)
676
- # with gr.Column():
677
- # gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
678
- # plot_4 = gr.Plot(plot_4, show_label=False)
 
679
 
680
  scheduler = BackgroundScheduler()
681
  scheduler.add_job(restart_space, "interval", seconds=3600)
 
7
  import numpy as np
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
+ from huggingface_hub import HfApi
11
  from transformers import AutoConfig
12
 
13
+ from src.auto_leaderboard.get_model_metadata import apply_metadata
14
+ from src.assets.text_content import *
15
+ from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
16
+ from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
17
+ from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
18
+ from src.assets.css_html_js import custom_css, get_window_url_params
19
+ from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
20
+ from src.init import load_all_info_from_hub
21
 
22
  # clone / pull the lmeh eval data
23
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
24
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
25
  HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
26
  GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
27
def env_flag(name, default):
    """Read a boolean flag from the environment.

    Args:
        name: Name of the environment variable.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` when the variable is unset; otherwise ``False`` if the
        value (trimmed, case-insensitive) is empty or one of "0", "false",
        "no", "off", and ``True`` for anything else.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    # bool("False") is True, so the text must be interpreted explicitly;
    # otherwise IS_PUBLIC=False in the environment would still mean "public".
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


# Public mode is the default when the variable is unset.
IS_PUBLIC = env_flag("IS_PUBLIC", True)
28
+ ADD_PLOTS = False
29
 
30
  api = HfApi()
31
 
 
35
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
36
  )
37
 
38
+ auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
39
 
40
+ COLS = [c.name for c in fields(AutoEvalColumn)]
41
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
42
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
43
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default]
44
 
45
+ if not IS_PUBLIC:
46
+ COLS.insert(2, AutoEvalColumn.is_8bit.name)
47
+ TYPES.insert(2, AutoEvalColumn.is_8bit.type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
50
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 
 
51
 
52
+ BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
53
 
54
+ ELO_COLS = [c.name for c in fields(EloEvalColumn)]
55
+ ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
56
+ ELO_SORT_COL = EloEvalColumn.gpt4.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  def has_no_nan_values(df, columns):
 
65
 
66
 
67
  def get_leaderboard_df():
68
+ if auto_eval_repo:
69
  print("Pulling evaluation results for the leaderboard.")
70
+ auto_eval_repo.git_pull()
71
 
72
  all_data = get_eval_results_dicts(IS_PUBLIC)
73
 
74
  if not IS_PUBLIC:
 
 
 
 
 
 
 
 
 
 
 
75
  all_data.append(gpt4_values)
 
 
 
 
 
 
 
 
 
 
 
76
  all_data.append(gpt35_values)
77
 
78
+ all_data.append(baseline)
79
+ apply_metadata(all_data) # Add license, likes and parameter count fetched from the Hub
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  df = pd.DataFrame.from_records(all_data)
82
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
83
  df = df[COLS]
84
 
85
  # filter out if any of the benchmarks have not been produced
 
88
 
89
 
90
  def get_evaluation_queue_df():
91
+ # todo @saylortwift: replace the repo by the one you created for the eval queue
92
+ if auto_eval_repo:
93
  print("Pulling changes for the evaluation queue.")
94
+ auto_eval_repo.git_pull()
95
 
96
  entries = [
97
  entry
98
+ for entry in os.listdir("auto_evals/eval_requests")
99
  if not entry.startswith(".")
100
  ]
101
  all_evals = []
102
 
103
  for entry in entries:
104
  if ".json" in entry:
105
+ file_path = os.path.join("auto_evals/eval_requests", entry)
106
  with open(file_path) as fp:
107
  data = json.load(fp)
108
 
 
115
  # this is a folder
116
  sub_entries = [
117
  e
118
+ for e in os.listdir(f"auto_evals/eval_requests/{entry}")
119
  if not e.startswith(".")
120
  ]
121
  for sub_entry in sub_entries:
122
+ file_path = os.path.join("auto_evals/eval_requests", entry, sub_entry)
123
  with open(file_path) as fp:
124
  data = json.load(fp)
125
 
 
191
 
192
  def is_model_on_hub(model_name, revision) -> bool:
193
  try:
194
+ AutoConfig.from_pretrained(model_name, revision=revision)
195
+ return True, None
196
+
197
+ except ValueError as e:
198
+ return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
199
 
200
  except Exception as e:
201
+ print("Could not get the model config from the hub.: \n", e)
202
+ return False, "was not found on hub!"
 
203
 
204
 
205
  def add_new_eval(
 
215
  # check the model actually exists before adding the eval
216
  if revision == "":
217
  revision = "main"
 
 
 
 
218
 
219
+ if is_delta_weight:
220
+ base_model_on_hub, error = is_model_on_hub(base_model, revision)
221
+ if not base_model_on_hub:
222
+ return styled_error(f'Base model "{base_model}" {error}')
223
+
224
+ model_on_hub, error = is_model_on_hub(model, revision)
225
+ if not model_on_hub:
226
+ return styled_error(f'Model "{model}" {error}')
227
 
228
  print("adding new eval")
229
 
 
244
  user_name = model.split("/")[0]
245
  model_path = model.split("/")[1]
246
 
247
+ OUT_DIR = f"auto_evals/eval_requests/{user_name}"
248
  os.makedirs(OUT_DIR, exist_ok=True)
249
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
250
 
251
  # Check for duplicate submission
252
+ if out_path.split("eval_requests/")[1].lower() in requested_models:
253
+ return styled_warning("This model has been already submitted.")
 
254
 
255
  with open(out_path, "w") as f:
256
  f.write(json.dumps(eval_entry))
 
263
  repo_type="dataset",
264
  )
265
 
266
+ return styled_message("Your request has been submitted to the evaluation queue!")
 
267
 
268
 
269
  def refresh():
 
282
 
283
 
284
  def search_table(df, query):
285
+ filtered_df = df[df[AutoEvalColumn.dummy.name].str.contains(query, case=False)]
286
  return filtered_df
287
 
288
 
 
300
  return gr.Tabs.update(selected=0)
301
 
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  demo = gr.Blocks(css=custom_css)
304
  with demo:
305
  gr.HTML(TITLE)
 
328
  show_label=False,
329
  elem_id="search-bar",
330
  )
331
+ with gr.Tabs(elem_classes="tab-buttons"):
332
+ with gr.TabItem("Light View"):
333
+ leaderboard_table_lite = gr.components.Dataframe(
334
+ value=leaderboard_df[COLS_LITE],
335
+ headers=COLS_LITE,
336
+ datatype=TYPES_LITE,
337
+ max_rows=None,
338
+ elem_id="leaderboard-table-lite",
339
+ )
340
+ with gr.TabItem("Extended Model View"):
341
+ leaderboard_table = gr.components.Dataframe(
342
+ value=leaderboard_df,
343
+ headers=COLS,
344
+ datatype=TYPES,
345
+ max_rows=None,
346
+ elem_id="leaderboard-table",
347
+ )
348
 
349
  # Dummy leaderboard for handling the case when the user uses backspace key
350
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
351
  value=original_df,
352
  headers=COLS,
353
  datatype=TYPES,
354
+ max_rows=None,
355
  visible=False,
356
  )
 
357
  search_bar.submit(
358
  search_table,
359
  [hidden_leaderboard_table_for_search, search_bar],
360
  leaderboard_table,
361
  )
362
 
363
+ # Dummy leaderboard for handling the case when the user uses backspace key
364
+ hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
365
+ value=original_df[COLS_LITE],
366
+ headers=COLS_LITE,
367
+ datatype=TYPES_LITE,
368
+ max_rows=None,
369
+ visible=False,
370
+ )
371
+ search_bar.submit(
372
+ search_table,
373
+ [hidden_leaderboard_table_for_search_lite, search_bar],
374
+ leaderboard_table_lite,
375
+ )
376
+
377
  with gr.Row():
378
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
379
 
 
457
  gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
458
  with gr.Column(scale=1):
459
  gr.Image(
460
+ "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
461
  )
462
  gr.Markdown("## No tie allowed")
463
  elo_leaderboard_table = gr.components.Dataframe(
 
492
  tabs,
493
  _js=get_window_url_params,
494
  )
495
+ if ADD_PLOTS:
496
+ with gr.Box():
497
+ visualization_title = gr.HTML(VISUALIZATION_TITLE)
498
+ with gr.Row():
499
+ with gr.Column():
500
+ gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
501
+ plot_1 = gr.Plot(plot_1, show_label=False)
502
+ with gr.Column():
503
+ gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
504
+ plot_2 = gr.Plot(plot_2, show_label=False)
505
+ with gr.Row():
506
+ with gr.Column():
507
+ gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
508
+ plot_3 = gr.Plot(plot_3, show_label=False)
509
+ with gr.Column():
510
+ gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
511
+ plot_4 = gr.Plot(plot_4, show_label=False)
512
 
513
  scheduler = BackgroundScheduler()
514
  scheduler.add_job(restart_space, "interval", seconds=3600)
src/assets/css_html_js.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+ #changelog-text {
3
+ font-size: 16px !important;
4
+ }
5
+
6
+ #changelog-text h2 {
7
+ font-size: 18px !important;
8
+ }
9
+
10
+ .markdown-text {
11
+ font-size: 16px !important;
12
+ }
13
+
14
+ #models-to-add-text {
15
+ font-size: 18px !important;
16
+ }
17
+
18
+ #citation-button span {
19
+ font-size: 16px !important;
20
+ }
21
+
22
+ #citation-button textarea {
23
+ font-size: 16px !important;
24
+ }
25
+
26
+ #citation-button > label > button {
27
+ margin: 6px;
28
+ transform: scale(1.3);
29
+ }
30
+
31
+ #leaderboard-table {
32
+ margin-top: 15px
33
+ }
34
+
35
+ #leaderboard-table-lite {
36
+ margin-top: 15px
37
+ }
38
+
39
+ #search-bar-table-box > div:first-child {
40
+ background: none;
41
+ border: none;
42
+ }
43
+
44
+ #search-bar {
45
+ padding: 0px;
46
+ width: 30%;
47
+ }
48
+
49
+ /* Hides the final column */
50
+ #llm-benchmark-tab-table table td:last-child,
51
+ #llm-benchmark-tab-table table th:last-child {
52
+ display: none;
53
+ }
54
+
55
+ /* Limit the width of the first column so that names don't expand too much */
56
+ table td:first-child,
57
+ table th:first-child {
58
+ max-width: 400px;
59
+ overflow: auto;
60
+ white-space: nowrap;
61
+ }
62
+
63
+ .tab-buttons button {
64
+ font-size: 20px;
65
+ }
66
+
67
+ #scale-logo {
68
+ border-style: none !important;
69
+ box-shadow: none;
70
+ display: block;
71
+ margin-left: auto;
72
+ margin-right: auto;
73
+ max-width: 600px;
74
+ }
75
+
76
+ #scale-logo .download {
77
+ display: none;
78
+ }
79
+ """
80
+
81
+ get_window_url_params = """
82
+ function(url_params) {
83
+ const params = new URLSearchParams(window.location.search);
84
+ url_params = Object.fromEntries(params);
85
+ return url_params;
86
+ }
87
+ """
src/assets/hardcoded_evals.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.utils_display import AutoEvalColumn, model_hyperlink
2
+
3
+ gpt4_values = {
4
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
+ AutoEvalColumn.revision.name: "tech report",
6
+ AutoEvalColumn.is_8bit.name: None,
7
+ AutoEvalColumn.average.name: 84.3,
8
+ AutoEvalColumn.arc.name: 96.3,
9
+ AutoEvalColumn.hellaswag.name: 95.3,
10
+ AutoEvalColumn.mmlu.name: 86.4,
11
+ AutoEvalColumn.truthfulqa.name: 59.0,
12
+ AutoEvalColumn.dummy.name: "GPT-4",
13
+ }
14
+
15
+ gpt35_values = {
16
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
17
+ AutoEvalColumn.revision.name: "tech report",
18
+ AutoEvalColumn.is_8bit.name: None,
19
+ AutoEvalColumn.average.name: 71.9,
20
+ AutoEvalColumn.arc.name: 85.2,
21
+ AutoEvalColumn.hellaswag.name: 85.5,
22
+ AutoEvalColumn.mmlu.name: 70.0,
23
+ AutoEvalColumn.truthfulqa.name: 47.0,
24
+ AutoEvalColumn.dummy.name: "GPT-3.5",
25
+ }
26
+
27
+ baseline = {
28
+ AutoEvalColumn.model.name: "<p>Baseline</p>",
29
+ AutoEvalColumn.revision.name: "N/A",
30
+ AutoEvalColumn.is_8bit.name: None,
31
+ AutoEvalColumn.average.name: 25.0,
32
+ AutoEvalColumn.arc.name: 25.0,
33
+ AutoEvalColumn.hellaswag.name: 25.0,
34
+ AutoEvalColumn.mmlu.name: 25.0,
35
+ AutoEvalColumn.truthfulqa.name: 25.0,
36
+ AutoEvalColumn.dummy.name: "baseline",
37
+ }
38
+
content.py → src/assets/text_content.py RENAMED
@@ -1,4 +1,8 @@
1
  CHANGELOG_TEXT = f"""
 
 
 
 
2
  ## [2023-06-13]
3
  - Adjust description for TruthfulQA
4
 
@@ -13,7 +17,7 @@ CHANGELOG_TEXT = f"""
13
  - Add a typeahead search bar
14
  - Use webhooks to automatically spawn a new Space when someone opens a PR
15
  - Start recording `submitted_time` for eval requests
16
- - Limit column max-width
17
 
18
  ## [2023-05-30]
19
  - Add a citation button
 
1
  CHANGELOG_TEXT = f"""
2
+ ## [2023-06-16]
3
+ - Refactored code base
4
+ - Added new columns: number of parameters, hub likes, license
5
+
6
  ## [2023-06-13]
7
  - Adjust description for TruthfulQA
8
 
 
17
  - Add a typeahead search bar
18
  - Use webhooks to automatically spawn a new Space when someone opens a PR
19
  - Start recording `submitted_time` for eval requests
20
+ - Limit column max-width
21
 
22
  ## [2023-05-30]
23
  - Add a citation button
src/auto_leaderboard/get_model_metadata.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
+ from src.utils_display import AutoEvalColumn
5
+
6
+ from huggingface_hub import HfApi
7
+ import huggingface_hub
8
+ api = HfApi()
9
+
10
+
11
def get_model_infos_from_hub(leaderboard_data: List[dict]):
    """Add license, likes and size columns to every leaderboard row, in place.

    Args:
        leaderboard_data: Row dicts. Each must contain a
            "model_name_for_query" entry, which is used as the repo id for
            the Hub lookup.

    Returns:
        None. Each dict in ``leaderboard_data`` is mutated directly. Rows
        whose repo is not found on the Hub get ``None`` in all three columns.
    """
    metadata_columns = (
        AutoEvalColumn.license.name,
        AutoEvalColumn.likes.name,
        AutoEvalColumn.params.name,
    )

    for model_data in leaderboard_data:
        model_name = model_data["model_name_for_query"]
        try:
            model_info = api.model_info(model_name)
        # Catch the exception through its public export rather than the
        # underscore-prefixed `utils._errors` module, which is not part of
        # the library's public API.
        except huggingface_hub.utils.RepositoryNotFoundError:
            model_data.update(dict.fromkeys(metadata_columns, None))
            continue

        model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
        model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
        model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
25
+
26
+
27
def get_model_license(model_info):
    """Return the "license" entry of ``model_info.cardData``, or ``None`` if it cannot be read."""
    try:
        card_data = model_info.cardData
        return card_data["license"]
    except Exception:
        # Best effort: missing attribute, missing key or non-subscriptable
        # card data all mean "license unknown".
        return None
32
+
33
def get_model_likes(model_info):
    """Return the ``likes`` attribute of ``model_info``."""
    likes = model_info.likes
    return likes
35
+
36
# Matches a size token such as "7b", "1.3b" or "350m" in a lower-cased model
# name. The optional decimal part keeps "1.3b" from being read as "3b".
size_pattern = re.compile(r"\d+(?:\.\d+)?[bm]")


def get_model_size(model_name, model_info):
    """Return the model size in billions of parameters, or ``None`` if unknown.

    The value ``model_info.safetensors["total"]`` is preferred (divided by
    1e9; presumably the total parameter count). When that metadata is
    missing or unusable, the size is guessed from the first "<number>b" or
    "<number>m" token found in the lower-cased model name.

    Args:
        model_name: Repo id / model name used for the name-based fallback.
        model_info: Object returned by the Hub lookup for this model.

    Returns:
        The size in billions (an "m" token is divided by 1e3), or ``None``
        when neither source yields a size.
    """
    try:
        return model_info.safetensors["total"] / 1e9
    except (AttributeError, TypeError, KeyError):
        # No `safetensors` attribute, `safetensors` is None / not
        # subscriptable, or it has no "total" entry: use the name instead.
        pass

    try:
        size_match = size_pattern.search(model_name.lower())
        token = size_match.group(0)
    except AttributeError:
        # Either nothing in the name looks like a size (search returned
        # None) or model_name has no .lower().
        return None

    digits, unit = token[:-1], token[-1]
    number = float(digits) if "." in digits else int(digits)
    return number if unit == "b" else number / 1e3
51
+
52
+
53
def apply_metadata(leaderboard_data: List[dict]):
    """Enrich the leaderboard rows in place by delegating to ``get_model_infos_from_hub``."""
    get_model_infos_from_hub(leaderboard_data)
utils.py → src/auto_leaderboard/load_results.py RENAMED
@@ -1,47 +1,23 @@
 
 
1
  import glob
2
  import json
3
- from dataclasses import dataclass
4
  from typing import Dict, List, Tuple
5
 
 
6
  import numpy as np
7
 
8
  # clone / pull the lmeh eval data
9
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
10
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
11
  BENCH_TO_NAME = {
12
- "arc_challenge": "ARC (25-shot) ⬆️",
13
- "hellaswag": "HellaSwag (10-shot) ⬆️",
14
- "hendrycks": "MMLU (5-shot) ⬆️",
15
- "truthfulqa_mc": "TruthfulQA (0-shot) ⬆️",
16
  }
17
 
18
 
19
- def make_clickable_model(model_name):
20
- LLAMAS = [
21
- "huggingface/llama-7b",
22
- "huggingface/llama-13b",
23
- "huggingface/llama-30b",
24
- "huggingface/llama-65b",
25
- ]
26
- if model_name in LLAMAS:
27
- model = model_name.split("/")[1]
28
- return f'<a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model}</a>'
29
-
30
- if model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
31
- link = "https://huggingface.co/" + "CarperAI/stable-vicuna-13b-delta"
32
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stable-vicuna-13b</a>'
33
-
34
- if model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
35
- link = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
36
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">alpaca-13b</a>'
37
-
38
- # remove user from model name
39
- # model_name_show = ' '.join(model_name.split('/')[1:])
40
-
41
- link = "https://huggingface.co/" + model_name
42
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
43
-
44
-
45
  @dataclass
46
  class EvalResult:
47
  eval_name: str
@@ -58,12 +34,12 @@ class EvalResult:
58
  base_model = f"{self.model}"
59
  data_dict = {}
60
 
61
- data_dict["eval_name"] = self.eval_name
62
- data_dict["8bit"] = self.is_8bit
63
- data_dict["Model"] = make_clickable_model(base_model)
64
- data_dict["model_name_for_query"] = base_model
65
- data_dict["Revision"] = self.revision
66
- data_dict["Average ⬆️"] = round(
67
  sum([v for k, v in self.results.items()]) / 4.0, 1
68
  )
69
 
@@ -88,17 +64,15 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
88
  revision = path_split[-3]
89
  if len(path_split) == 7:
90
  # handles gpt2 type models that don't have an org
91
- result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
92
  else:
93
- result_key = (
94
- f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
95
- )
96
  org = path_split[-5]
 
97
 
98
  eval_result = None
99
  for benchmark, metric in zip(BENCHMARKS, METRICS):
100
  if benchmark in json_filepath:
101
- accs = np.array([v[metric] for k, v in data["results"].items()])
102
  mean_acc = round(np.mean(accs) * 100.0, 1)
103
  eval_result = EvalResult(
104
  result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
@@ -109,18 +83,19 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
109
 
110
  def get_eval_results(is_public) -> List[EvalResult]:
111
  json_filepaths = glob.glob(
112
- "evals/eval_results/public/**/16bit/*.json", recursive=True
113
  )
114
  if not is_public:
115
  json_filepaths += glob.glob(
116
- "evals/eval_results/private/**/*.json", recursive=True
117
  )
118
  json_filepaths += glob.glob(
119
- "evals/eval_results/private/**/*.json", recursive=True
120
  )
 
121
  json_filepaths += glob.glob(
122
- "evals/eval_results/public/**/8bit/*.json", recursive=True
123
- ) # include the 8bit evals of public models
124
  eval_results = {}
125
 
126
  for json_filepath in json_filepaths:
@@ -130,7 +105,7 @@ def get_eval_results(is_public) -> List[EvalResult]:
130
  else:
131
  eval_results[result_key] = eval_result
132
 
133
- eval_results = [v for k, v in eval_results.items()]
134
 
135
  return eval_results
136
 
@@ -139,12 +114,3 @@ def get_eval_results_dicts(is_public=True) -> List[Dict]:
139
  eval_results = get_eval_results(is_public)
140
 
141
  return [e.to_dict() for e in eval_results]
142
-
143
-
144
- get_window_url_params = """
145
- function(url_params) {
146
- const params = new URLSearchParams(window.location.search);
147
- url_params = Object.fromEntries(params);
148
- return url_params;
149
- }
150
- """
 
1
+ from dataclasses import dataclass
2
+
3
  import glob
4
  import json
 
5
  from typing import Dict, List, Tuple
6
 
7
+ from src.utils_display import AutoEvalColumn, make_clickable_model
8
  import numpy as np
9
 
10
  # clone / pull the lmeh eval data
11
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
12
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
13
  BENCH_TO_NAME = {
14
+ "arc_challenge": AutoEvalColumn.arc.name,
15
+ "hellaswag": AutoEvalColumn.hellaswag.name,
16
+ "hendrycks": AutoEvalColumn.mmlu.name,
17
+ "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
18
  }
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  @dataclass
22
  class EvalResult:
23
  eval_name: str
 
34
  base_model = f"{self.model}"
35
  data_dict = {}
36
 
37
+ data_dict["eval_name"] = self.eval_name # not a column, just a save name
38
+ data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
39
+ data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
40
+ data_dict[AutoEvalColumn.dummy.name] = base_model
41
+ data_dict[AutoEvalColumn.revision.name] = self.revision
42
+ data_dict[AutoEvalColumn.average.name] = round(
43
  sum([v for k, v in self.results.items()]) / 4.0, 1
44
  )
45
 
 
64
  revision = path_split[-3]
65
  if len(path_split) == 7:
66
  # handles gpt2 type models that don't have an org
67
+ result_key = f"{model}_{revision}_{is_8bit}"
68
  else:
 
 
 
69
  org = path_split[-5]
70
+ result_key = f"{org}_{model}_{revision}_{is_8bit}"
71
 
72
  eval_result = None
73
  for benchmark, metric in zip(BENCHMARKS, METRICS):
74
  if benchmark in json_filepath:
75
+ accs = np.array([v[metric] for v in data["results"].values()])
76
  mean_acc = round(np.mean(accs) * 100.0, 1)
77
  eval_result = EvalResult(
78
  result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
 
83
 
84
  def get_eval_results(is_public) -> List[EvalResult]:
85
  json_filepaths = glob.glob(
86
+ "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
87
  )
88
  if not is_public:
89
  json_filepaths += glob.glob(
90
+ "auto_evals/eval_results/private/**/*.json", recursive=True
91
  )
92
  json_filepaths += glob.glob(
93
+ "auto_evals/eval_results/private/**/*.json", recursive=True
94
  )
95
+ # include the 8bit evals of public models
96
  json_filepaths += glob.glob(
97
+ "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
98
+ )
99
  eval_results = {}
100
 
101
  for json_filepath in json_filepaths:
 
105
  else:
106
  eval_results[result_key] = eval_result
107
 
108
+ eval_results = [v for v in eval_results.values()]
109
 
110
  return eval_results
111
 
 
114
  eval_results = get_eval_results(is_public)
115
 
116
  return [e.to_dict() for e in eval_results]
 
 
 
 
 
 
 
 
 
elo_utils.py → src/elo_leaderboard/load_results.py RENAMED
@@ -6,9 +6,9 @@ import numpy as np
6
  import pandas as pd
7
  from datasets import load_dataset
8
 
9
- from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
10
- from utils import make_clickable_model
11
- from visualizations import (
12
  get_bootstrap_result,
13
  switch_model_a_b,
14
  visualize_battle_count,
@@ -18,29 +18,6 @@ from visualizations import (
18
  )
19
 
20
 
21
- KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
22
- VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
23
- OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
24
- DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
25
- MODEL_PAGE = "https://huggingface.co/models"
26
-
27
-
28
- def make_clickable_model_elo(model_name):
29
- link = ""
30
- if model_name == "dolly-12b":
31
- link = DOLLY_LINK
32
- elif model_name == "vicuna-13b":
33
- link = VICUNA_LINK
34
- elif model_name == "koala-13b":
35
- link = KOALA_LINK
36
- elif model_name == "oasst-12b":
37
- link = OASST_LINK
38
- else:
39
- link = MODEL_PAGE
40
-
41
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
42
-
43
-
44
  @dataclass
45
  class EloEvalResult:
46
  model: str
@@ -53,11 +30,11 @@ class EloEvalResult:
53
  def to_dict(self):
54
  base_model = f"{self.model}"
55
  data_dict = {}
56
- data_dict["Model"] = make_clickable_model_elo(base_model)
57
- data_dict["GPT-4 (all)"] = self.gpt_4_all
58
- data_dict["Human (all)"] = self.human_all
59
- data_dict["Human (instruct)"] = self.human_instruct
60
- data_dict["Human (code-instruct)"] = self.human_code_instruct
61
 
62
  return data_dict
63
 
 
6
  import pandas as pd
7
  from datasets import load_dataset
8
 
9
+ from src.assets.text_content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
10
+ from src.utils_display import make_clickable_model, EloEvalColumn
11
+ from .visualizations import (
12
  get_bootstrap_result,
13
  switch_model_a_b,
14
  visualize_battle_count,
 
18
  )
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  @dataclass
22
  class EloEvalResult:
23
  model: str
 
30
  def to_dict(self):
31
  base_model = f"{self.model}"
32
  data_dict = {}
33
+ data_dict[EloEvalColumn.model.name] = make_clickable_model(base_model)
34
+ data_dict[EloEvalColumn.gpt4.name] = self.gpt_4_all
35
+ data_dict[EloEvalColumn.human_all.name] = self.human_all
36
+ data_dict[EloEvalColumn.human_instruct.name] = self.human_instruct
37
+ data_dict[EloEvalColumn.human_code_instruct.name] = self.human_code_instruct
38
 
39
  return data_dict
40
 
visualizations.py → src/elo_leaderboard/visualizations.py RENAMED
@@ -133,5 +133,5 @@ def visualize_rating_count(df, title):
133
  fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
134
  fig.update_yaxes(range=[y_begin, y_end])
135
  # save the plot for the blog:
136
- fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
137
  return fig
 
133
  fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
134
  fig.update_yaxes(range=[y_begin, y_end])
135
  # save the plot for the blog:
136
+ fig.write_html("src/assets/model_counts.html", full_html=False, include_plotlyjs="cdn")
137
  return fig
src/init.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import Repository
3
+
4
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
5
+
6
+
7
def get_all_requested_models(requested_models_dir):
    """Collect the request files stored one directory level below a folder.

    Only files located exactly one sub-directory deep are returned (for
    example ``<requested_models_dir>/<org>/<file>``); files sitting directly
    in ``requested_models_dir`` or nested deeper are ignored.

    Args:
        requested_models_dir: Path of the folder holding the request files.

    Returns:
        A set of lower-cased paths relative to ``requested_models_dir``,
        always written with ``/`` as separator (e.g. ``"org/model.json"``).
        The set is empty when the folder does not exist.
    """
    depth = 1
    # Normalising removes a trailing separator, which would otherwise skew
    # the separator count used to measure the depth of each walked folder.
    base_dir = os.path.normpath(requested_models_dir)
    base_depth = base_dir.count(os.sep)

    requested = set()
    for root, _dirs, files in os.walk(base_dir):
        if root.count(os.sep) - base_depth != depth:
            continue
        for file in files:
            # Work relative to the folder that was actually passed in rather
            # than splitting on a hard-coded "eval_requests/" marker.
            rel_path = os.path.relpath(os.path.join(root, file), base_dir)
            requested.add(rel_path.replace(os.sep, "/").lower())

    return requested
17
+
18
def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
    """Clone and pull the evaluation dataset repositories from the Hub.

    Nothing is cloned when ``H4_TOKEN`` is not set. The automatic evaluation
    repository is cloned into ``./auto_evals/`` and pulled on every call; the
    human and GPT-4 evaluation repositories are only cloned (and pulled) when
    their local folders ``./human_evals`` / ``./gpt_4_evals`` do not exist.

    Args:
        LMEH_REPO: Repo id of the automatic evaluation dataset.
        HUMAN_EVAL_REPO: Repo id of the human evaluation dataset.
        GPT_4_EVAL_REPO: Repo id of the GPT-4 evaluation dataset.

    Returns:
        A tuple ``(auto_eval_repo, human_eval_repo, gpt_4_eval_repo,
        requested_models)``; each entry is ``None`` when the corresponding
        step was skipped.
    """

    def _clone_and_pull(local_dir, repo_id):
        # Every repository is a dataset repo accessed with the same token.
        repo = Repository(
            local_dir=local_dir,
            clone_from=repo_id,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        repo.git_pull()
        return repo

    auto_eval_repo, requested_models = None, None
    human_eval_repo, gpt_4_eval_repo = None, None

    if H4_TOKEN:
        print("Pulling evaluation requests and results.")
        auto_eval_repo = _clone_and_pull("./auto_evals/", LMEH_REPO)
        requested_models = get_all_requested_models("./auto_evals/eval_requests")

    if H4_TOKEN and not os.path.isdir("./human_evals"):
        print("Pulling human evaluation repo")
        human_eval_repo = _clone_and_pull("./human_evals/", HUMAN_EVAL_REPO)

    if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
        print("Pulling GPT-4 evaluation repo")
        gpt_4_eval_repo = _clone_and_pull("./gpt_4_evals/", GPT_4_EVAL_REPO)

    return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models
62
+
63
+
64
+ #def load_results(model, benchmark, metric):
65
+ # file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
66
+ # if not os.path.exists(file_path):
67
+ # return 0.0, None
68
+
69
+ # with open(file_path) as fp:
70
+ # data = json.load(fp)
71
+ # accs = np.array([v[metric] for k, v in data["results"].items()])
72
+ # mean_acc = np.mean(accs)
73
+ # return mean_acc, data["config"]["model_args"]
src/utils_display.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ # These classes hold the user-facing column names, so that a rename only has to
4
+ # be made here instead of all around the code
5
@dataclass
class ColumnContent:
    """Description of a single leaderboard column."""

    # Column header text; also used as the key of the per-row result dicts.
    name: str
    # Datatype label such as "markdown", "str", "bool" or "number"
    # (presumably the table datatype used by the UI - confirm against callers).
    type: str
    # Presumably whether the column is visible before the user changes the
    # column selection - confirm against callers.
    displayed_by_default: bool
10
+
11
def fields(raw_class):
    """Return the values of the attributes defined directly on ``raw_class``.

    Attributes whose name starts or ends with a double underscore are
    skipped, which filters out ``__module__``, ``__doc__`` and similar
    entries. Values are returned in class-definition order.
    """
    values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        values.append(attr_value)
    return values
13
+
14
@dataclass(frozen=True)
class AutoEvalColumn:
    """Columns of the automatic evaluation leaderboard.

    The attributes are unannotated, so they are plain class attributes rather
    than dataclass fields; read them as ``AutoEvalColumn.<attr>.name``.
    """

    model = ColumnContent(name="Model", type="markdown", displayed_by_default=True)
    revision = ColumnContent(name="Revision", type="str", displayed_by_default=True)
    is_8bit = ColumnContent(name="8bit", type="bool", displayed_by_default=False)
    license = ColumnContent(name="Hub License", type="str", displayed_by_default=False)
    params = ColumnContent(name="#Params (B)", type="number", displayed_by_default=False)
    likes = ColumnContent(name="Hub ❤️", type="number", displayed_by_default=False)
    average = ColumnContent(name="Average ⬆️", type="number", displayed_by_default=True)
    arc = ColumnContent(name="ARC (25-s) ⬆️", type="number", displayed_by_default=True)
    hellaswag = ColumnContent(name="HellaSwag (10-s) ⬆️", type="number", displayed_by_default=True)
    mmlu = ColumnContent(name="MMLU (5-s) ⬆️", type="number", displayed_by_default=True)
    truthfulqa = ColumnContent(name="TruthfulQA (MC) (0-s) ⬆️", type="number", displayed_by_default=True)
    # Dummy column used to implement the search bar (hidden by custom CSS).
    dummy = ColumnContent(name="model_name_for_query", type="str", displayed_by_default=True)
28
+
29
@dataclass(frozen=True)
class EloEvalColumn:
    """Columns of the Elo leaderboard.

    The attributes are unannotated, so they are plain class attributes rather
    than dataclass fields; read them as ``EloEvalColumn.<attr>.name``.
    """

    model = ColumnContent(name="Model", type="markdown", displayed_by_default=True)
    gpt4 = ColumnContent(name="GPT-4 (all)", type="number", displayed_by_default=True)
    human_all = ColumnContent(name="Human (all)", type="number", displayed_by_default=True)
    human_instruct = ColumnContent(name="Human (instruct)", type="number", displayed_by_default=True)
    human_code_instruct = ColumnContent(name="Human (code-instruct)", type="number", displayed_by_default=True)
36
+
37
+
38
@dataclass(frozen=True)
class EvalQueueColumn:
    """Columns of the evaluation queue table.

    The attributes are unannotated, so they are plain class attributes rather
    than dataclass fields; read them as ``EvalQueueColumn.<attr>.name``.
    """

    model = ColumnContent(name="model", type="markdown", displayed_by_default=True)
    revision = ColumnContent(name="revision", type="str", displayed_by_default=True)
    private = ColumnContent(name="private", type="bool", displayed_by_default=True)
    is_8bit = ColumnContent(name="8bit_eval", type="bool", displayed_by_default=True)
    has_delta_weight = ColumnContent(name="is_delta_weight", type="bool", displayed_by_default=True)
    status = ColumnContent(name="status", type="str", displayed_by_default=True)
46
+
47
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
# Generic models page; the fallback that used it is currently disabled.
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
# Kept under its own name: it used to be a second assignment to VICUNA_LINK,
# which silently replaced the lmsys link used for the "vicuna-13b" Elo entry.
STABLE_VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"

# Short model names used by the Elo leaderboard, mapped to their model page.
_ELO_MODEL_LINKS = {
    "dolly-12b": DOLLY_LINK,
    "vicuna-13b": VICUNA_LINK,
    "koala-13b": KOALA_LINK,
    "oasst-12b": OASST_LINK,
}


def model_hyperlink(link, model_name):
    """Return an HTML anchor opening ``link`` in a new tab, labelled ``model_name``."""
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    """Build the HTML link shown for a model in the leaderboard tables.

    By default the link points to ``https://huggingface.co/<model_name>`` and
    the label is ``model_name``. A few models are special-cased: the LLaMA
    checkpoints, stable-vicuna and alpaca get an external link and a shorter
    label, and the short Elo names (``dolly-12b``, ``vicuna-13b``,
    ``koala-13b``, ``oasst-12b``) get their dedicated model page.

    Args:
        model_name: Hub repo id or short Elo model name.

    Returns:
        The anchor produced by ``model_hyperlink``.
    """
    link = f"https://huggingface.co/{model_name}"

    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = STABLE_VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"

    # Elo short names override the default link; any other name keeps the
    # link chosen above (no fallback to MODEL_PAGE).
    link = _ELO_MODEL_LINKS.get(model_name, link)

    return model_hyperlink(link, model_name)
88
+
89
def styled_error(error):
    """Wrap ``error`` in a centered, red, 20px HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
91
+
92
def styled_warning(warn):
    """Wrap ``warn`` in a centered, orange, 20px HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
94
+
95
def styled_message(message):
    """Wrap ``message`` in a centered, green, 20px HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)