BenchmarkBot committed on
Commit
a894537
·
1 Parent(s): 5608bf7

experiments by model type and weight class

Browse files
Files changed (4) hide show
  1. app.py +46 -46
  2. src/assets/css_html_js.py +2 -2
  3. src/assets/text_content.py +6 -8
  4. src/utils.py +27 -42
app.py CHANGED
@@ -4,11 +4,12 @@ import pandas as pd
4
  import plotly.express as px
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
 
7
  from src.assets.text_content import (
8
  TITLE,
9
  INTRODUCTION_TEXT,
10
  A100_TEXT,
11
- About_TEXT,
12
  CITATION_BUTTON_LABEL,
13
  CITATION_BUTTON_TEXT,
14
  )
@@ -16,45 +17,49 @@ from src.utils import (
16
  change_tab,
17
  restart_space,
18
  load_dataset_repo,
19
- make_clickable_model,
20
- # make_clickable_score,
21
- # num_to_str,
22
  )
23
- from src.assets.css_html_js import custom_css, custom_js
24
 
25
 
26
  LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
27
  LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
28
  OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
29
 
30
- COLUMNS_MAPPING = {
31
- "model": "Model 🤗",
 
 
 
 
32
  "backend.name": "Backend 🏭",
33
- "backend.torch_dtype": "Load Dtype 📥",
34
  "optimizations": "Optimizations 🛠️",
35
  #
36
- "tradeoff": "Open LLM Tradeoff ⬇️",
37
  #
38
- "score": "Open LLM Score ⬆️",
39
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
40
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
41
- "num_params": "#️⃣ Parameters (M) 📏",
 
42
  }
43
- COLUMNS_DATATYPES = [
 
 
44
  "markdown",
 
45
  "str",
46
  "str",
47
  "str",
48
  #
49
- "number",
50
  #
51
  "number",
52
  "number",
53
  "number",
54
- "number",
55
  ]
56
- SORTING_COLUMN = ["Open LLM Tradeoff ⬇️"]
57
-
58
 
59
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
60
 
@@ -65,17 +70,10 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
65
 
66
  # load and merge
67
  bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
68
- scores_df = pd.read_csv(f"./llm-perf-dataset/reports/open-llm-leaderboard.csv")
69
- bench_df = bench_df.merge(scores_df, on="model", how="left")
70
-
71
- # filter out models with no score
72
- bench_df = bench_df[bench_df["score"].notna()]
73
-
74
- # create composite score
75
- score_distance = 100 - bench_df["score"]
76
- latency_distance = bench_df["generate.latency(s)"]
77
- bench_df["tradeoff"] = (score_distance**2 + latency_distance**2) ** 0.5
78
- bench_df["tradeoff"] = bench_df["tradeoff"].round(2)
79
 
80
  # add optimizations
81
  bench_df["optimizations"] = bench_df[
@@ -101,17 +99,19 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
101
 
102
  def get_benchmark_table(bench_df):
103
  # filter
104
- bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
105
  # rename
106
- bench_df.rename(columns=COLUMNS_MAPPING, inplace=True)
107
  # sort
108
  bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
109
  # transform
110
- bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(make_clickable_model)
111
- bench_df["#️⃣ Parameters (M) 📏"] = bench_df["#️⃣ Parameters (M) 📏"].apply(
112
- lambda x: int(x / (1024 * 1024))
 
 
 
113
  )
114
-
115
  return bench_df
116
 
117
 
@@ -122,12 +122,12 @@ def get_benchmark_plot(bench_df):
122
  fig = px.scatter(
123
  bench_df,
124
  x="generate.latency(s)",
125
- y="score",
126
  color="model_type",
127
  symbol="backend.name",
128
  size="forward.peak_memory(MB)",
129
  custom_data=[
130
- "model",
131
  "backend.name",
132
  "backend.torch_dtype",
133
  "optimizations",
@@ -158,12 +158,12 @@ def get_benchmark_plot(bench_df):
158
  [
159
  "Model: %{customdata[0]}",
160
  "Backend: %{customdata[1]}",
161
- "Datatype: %{customdata[2]}",
162
  "Optimizations: %{customdata[3]}",
163
  "Peak Memory (MB): %{customdata[4]}",
164
  "Throughput (tokens/s): %{customdata[5]}",
165
  "Per 1000 Tokens Latency (s): %{x}",
166
- "Open LLM Score: %{y}",
167
  ]
168
  )
169
  )
@@ -183,7 +183,7 @@ def filter_query(
183
  raw_df = get_benchmark_df(benchmark=benchmark)
184
 
185
  filtered_df = raw_df[
186
- raw_df["model"].str.lower().str.contains(text.lower())
187
  & raw_df["backend.name"].isin(backends)
188
  & raw_df["backend.torch_dtype"].isin(datatypes)
189
  & (
@@ -197,7 +197,7 @@ def filter_query(
197
  if len(optimizations) > 0
198
  else True
199
  )
200
- & (raw_df["score"] >= score)
201
  & (raw_df["forward.peak_memory(MB)"] <= memory)
202
  ]
203
 
@@ -223,18 +223,18 @@ with demo:
223
 
224
  # leaderboard tabs
225
  with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
226
- with gr.TabItem("🖥️ A100-80GB Leaderboard Table 🏆", id=0):
227
  gr.HTML(A100_TEXT)
228
 
229
  # Original leaderboard table
230
  A100_leaderboard = gr.components.Dataframe(
231
  value=A100_table,
232
- datatype=COLUMNS_DATATYPES,
233
- headers=list(COLUMNS_MAPPING.values()),
234
  elem_id="1xA100-table",
235
  )
236
 
237
- with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=1):
238
  gr.HTML(A100_TEXT)
239
 
240
  # Original leaderboard plot
@@ -244,7 +244,7 @@ with demo:
244
  show_label=False,
245
  )
246
 
247
- with gr.TabItem("🎮 Control Panel 🎛️", id=2):
248
  # control panel interface
249
  with gr.Row():
250
  with gr.Column(scale=1):
@@ -304,8 +304,8 @@ with demo:
304
  elem_id="filter-button",
305
  )
306
 
307
- with gr.TabItem("❔ About 📖", id=3):
308
- gr.Markdown(About_TEXT)
309
 
310
  demo.load(
311
  change_tab,
 
4
  import plotly.express as px
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
+ from src.assets.css_html_js import custom_css, custom_js
8
  from src.assets.text_content import (
9
  TITLE,
10
  INTRODUCTION_TEXT,
11
  A100_TEXT,
12
+ ABOUT_TEXT,
13
  CITATION_BUTTON_LABEL,
14
  CITATION_BUTTON_TEXT,
15
  )
 
17
  change_tab,
18
  restart_space,
19
  load_dataset_repo,
20
+ process_model_name,
21
+ process_model_type,
22
+ process_weight_class,
23
  )
 
24
 
25
 
26
  LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
27
  LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
28
  OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
29
 
30
+
31
+ ALL_COLUMNS_MAPPING = {
32
+ "model_type": "Model Type 🤗",
33
+ "weight_class": "Weight Class 🏋️",
34
+ "best_scored_model": "Best Scored Model 🏆",
35
+ #
36
  "backend.name": "Backend 🏭",
37
+ "backend.torch_dtype": "Dtype 📥",
38
  "optimizations": "Optimizations 🛠️",
39
  #
40
+ # "tradeoff": "Tradeoff* ⬇️",
41
  #
 
42
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
43
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
44
+ "best_score": "Score (%) ⬆️",
45
+ #
46
  }
47
+ ALL_COLUMNS_DATATYPES = [
48
+ "str",
49
+ "str",
50
  "markdown",
51
+ #
52
  "str",
53
  "str",
54
  "str",
55
  #
56
+ # "number",
57
  #
58
  "number",
59
  "number",
60
  "number",
 
61
  ]
62
+ SORTING_COLUMN = ["Score (%) ⬆️"]
 
63
 
64
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
65
 
 
70
 
71
  # load and merge
72
  bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
73
+ scores_df = pd.read_csv(
74
+ f"./llm-perf-dataset/reports/Grouped-Open-LLM-Leaderboard.csv"
75
+ )
76
+ bench_df = bench_df.merge(scores_df, left_on="model", right_on="best_scored_model")
 
 
 
 
 
 
 
77
 
78
  # add optimizations
79
  bench_df["optimizations"] = bench_df[
 
99
 
100
  def get_benchmark_table(bench_df):
101
  # filter
102
+ bench_df = bench_df[list(ALL_COLUMNS_MAPPING.keys())]
103
  # rename
104
+ bench_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
105
  # sort
106
  bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
107
  # transform
108
+ bench_df["Model Type 🤗"] = bench_df["Model Type 🤗"].apply(process_model_type)
109
+ bench_df["Weight Class 🏋️"] = bench_df["Weight Class 🏋️"].apply(
110
+ process_weight_class
111
+ )
112
+ bench_df["Best Scored Model 🏆"] = bench_df["Best Scored Model 🏆"].apply(
113
+ process_model_name
114
  )
 
115
  return bench_df
116
 
117
 
 
122
  fig = px.scatter(
123
  bench_df,
124
  x="generate.latency(s)",
125
+ y="best_score",
126
  color="model_type",
127
  symbol="backend.name",
128
  size="forward.peak_memory(MB)",
129
  custom_data=[
130
+ "best_scored_model",
131
  "backend.name",
132
  "backend.torch_dtype",
133
  "optimizations",
 
158
  [
159
  "Model: %{customdata[0]}",
160
  "Backend: %{customdata[1]}",
161
+ "Load Datatype: %{customdata[2]}",
162
  "Optimizations: %{customdata[3]}",
163
  "Peak Memory (MB): %{customdata[4]}",
164
  "Throughput (tokens/s): %{customdata[5]}",
165
  "Per 1000 Tokens Latency (s): %{x}",
166
+ "Open LLM Score (%): %{y}",
167
  ]
168
  )
169
  )
 
183
  raw_df = get_benchmark_df(benchmark=benchmark)
184
 
185
  filtered_df = raw_df[
186
+ raw_df["best_scored_model"].str.lower().str.contains(text.lower())
187
  & raw_df["backend.name"].isin(backends)
188
  & raw_df["backend.torch_dtype"].isin(datatypes)
189
  & (
 
197
  if len(optimizations) > 0
198
  else True
199
  )
200
+ & (raw_df["best_score"] >= score)
201
  & (raw_df["forward.peak_memory(MB)"] <= memory)
202
  ]
203
 
 
223
 
224
  # leaderboard tabs
225
  with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
226
+ with gr.TabItem("🖥️ A100-80GB Leaderboard Table 🏅", id=0):
227
  gr.HTML(A100_TEXT)
228
 
229
  # Original leaderboard table
230
  A100_leaderboard = gr.components.Dataframe(
231
  value=A100_table,
232
+ datatype=ALL_COLUMNS_DATATYPES,
233
+ headers=list(ALL_COLUMNS_MAPPING.values()),
234
  elem_id="1xA100-table",
235
  )
236
 
237
+ with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=2):
238
  gr.HTML(A100_TEXT)
239
 
240
  # Original leaderboard plot
 
244
  show_label=False,
245
  )
246
 
247
+ with gr.TabItem("🎮 Control Panel 🎛️", id=3):
248
  # control panel interface
249
  with gr.Row():
250
  with gr.Column(scale=1):
 
304
  elem_id="filter-button",
305
  )
306
 
307
+ with gr.TabItem("❔ About 📖", id=4):
308
+ gr.Markdown(ABOUT_TEXT)
309
 
310
  demo.load(
311
  change_tab,
src/assets/css_html_js.py CHANGED
@@ -25,8 +25,8 @@ custom_css = """
25
  border: none;
26
  }
27
 
28
- table td:first-child,
29
- table th:first-child {
30
  max-width: 300px;
31
  overflow: auto;
32
  white-space: nowrap;
 
25
  border: none;
26
  }
27
 
28
+ table td:nth-child(3),
29
+ table th:nth-child(3) {
30
  max-width: 300px;
31
  overflow: auto;
32
  white-space: nowrap;
src/assets/text_content.py CHANGED
@@ -15,14 +15,12 @@ A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
15
  </ul>
16
  """
17
 
18
- About_TEXT = """<h3>About the benchmarks</h3>
19
- <ul>
20
- <li>The performances benchmarks were obtained using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).</li>
21
- <li>Throughput is measured in tokens per second when generating 1000 tokens with a batch size of 1.</li>
22
- <li>Peak memory is measured in MB during the first forward pass of the model (no warmup).</li>
23
- <li>Open LLM Score is an average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).</li>
24
- <li>Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.</li>
25
- </ul>
26
  """
27
 
28
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
 
15
  </ul>
16
  """
17
 
18
+ ABOUT_TEXT = """<h3>About the benchmarks:</h3>
19
+ - The performances benchmarks were obtained using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
20
+ - Throughput is measured in tokens per second when generating 1000 tokens with a batch size of 1.
21
+ - Peak memory is measured in MB during the first forward pass of the model (no warmup).
22
+ - Open LLM Score is an average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
23
+ - Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.
 
 
24
  """
25
 
26
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
src/utils.py CHANGED
@@ -22,70 +22,55 @@ def restart_space(LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN):
22
 
23
 
24
  def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
25
- llm_perf_repo = None
26
  if OPTIMUM_TOKEN:
27
  print("Loading LLM-Perf-Dataset from Hub...")
28
- llm_perf_repo = Repository(
29
  local_dir="./llm-perf-dataset",
30
  clone_from=LLM_PERF_DATASET_REPO,
31
  token=OPTIMUM_TOKEN,
32
  repo_type="dataset",
33
  )
34
- llm_perf_repo.git_pull()
35
 
36
- return llm_perf_repo
37
 
38
 
39
- LLAMAS = [
40
- "huggingface/llama-7b",
41
- "huggingface/llama-13b",
42
- "huggingface/llama-30b",
43
- "huggingface/llama-65b",
44
- ]
45
- KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
46
- VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
47
- OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
48
- DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
49
- MODEL_PAGE = "https://huggingface.co/models"
50
- LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
51
- VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
52
- ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
 
 
53
 
54
 
55
  def model_hyperlink(link, model_name):
56
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
57
 
58
 
59
- def make_clickable_model(model_name):
60
  link = f"https://huggingface.co/{model_name}"
61
-
62
- if model_name in LLAMAS:
63
- link = LLAMA_LINK
64
- model_name = model_name.split("/")[1]
65
- elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
66
- link = VICUNA_LINK
67
- model_name = "stable-vicuna-13b"
68
- elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
69
- link = ALPACA_LINK
70
- model_name = "alpaca-13b"
71
- if model_name == "dolly-12b":
72
- link = DOLLY_LINK
73
- elif model_name == "vicuna-13b":
74
- link = VICUNA_LINK
75
- elif model_name == "koala-13b":
76
- link = KOALA_LINK
77
- elif model_name == "oasst-12b":
78
- link = OASST_LINK
79
-
80
  return model_hyperlink(link, model_name)
81
 
82
 
83
- def make_clickable_score(score):
84
- link = f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
85
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{score}</a>'
 
 
86
 
87
 
88
- def num_to_str(num):
89
  if num < 1000:
90
  return str(int(num))
91
  elif num < 1000000:
 
22
 
23
 
24
  def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
25
+ llm_perf_dataset_repo = None
26
  if OPTIMUM_TOKEN:
27
  print("Loading LLM-Perf-Dataset from Hub...")
28
+ llm_perf_dataset_repo = Repository(
29
  local_dir="./llm-perf-dataset",
30
  clone_from=LLM_PERF_DATASET_REPO,
31
  token=OPTIMUM_TOKEN,
32
  repo_type="dataset",
33
  )
34
+ llm_perf_dataset_repo.git_pull()
35
 
36
+ return llm_perf_dataset_repo
37
 
38
 
39
+ LLM_MODEL_TYPES = {
40
+ "gpt_bigcode": "GPT-BigCode 🌸",
41
+ "RefinedWebModel": "Falcon 🦅",
42
+ "RefinedWeb": "Falcon 🦅",
43
+ "baichuan": "Baichuan 🌊",
44
+ "llama": "LLaMA 🦙",
45
+ "gpt_neox": "GPT-NeoX",
46
+ "gpt_neo": "GPT-Neo",
47
+ "codegen": "CodeGen",
48
+ "chatglm": "ChatGLM",
49
+ "gpt2": "GPT-2",
50
+ "gptj": "GPT-J",
51
+ "xglm": "XGLM",
52
+ "opt": "OPT",
53
+ "mpt": "MPT",
54
+ }
55
 
56
 
57
  def model_hyperlink(link, model_name):
58
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
59
 
60
 
61
+ def process_model_name(model_name):
62
  link = f"https://huggingface.co/{model_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  return model_hyperlink(link, model_name)
64
 
65
 
66
+ def process_model_type(model_type):
67
+ if model_type in LLM_MODEL_TYPES:
68
+ return LLM_MODEL_TYPES[model_type]
69
+ else:
70
+ return model_type
71
 
72
 
73
+ def process_weight_class(num):
74
  if num < 1000:
75
  return str(int(num))
76
  elif num < 1000000: