Spaces:
Running
Running
BenchmarkBot
committed on
Commit
·
a894537
1
Parent(s):
5608bf7
experiments by model type and weight class
Browse files- app.py +46 -46
- src/assets/css_html_js.py +2 -2
- src/assets/text_content.py +6 -8
- src/utils.py +27 -42
app.py
CHANGED
@@ -4,11 +4,12 @@ import pandas as pd
|
|
4 |
import plotly.express as px
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
|
|
|
7 |
from src.assets.text_content import (
|
8 |
TITLE,
|
9 |
INTRODUCTION_TEXT,
|
10 |
A100_TEXT,
|
11 |
-
|
12 |
CITATION_BUTTON_LABEL,
|
13 |
CITATION_BUTTON_TEXT,
|
14 |
)
|
@@ -16,45 +17,49 @@ from src.utils import (
|
|
16 |
change_tab,
|
17 |
restart_space,
|
18 |
load_dataset_repo,
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
)
|
23 |
-
from src.assets.css_html_js import custom_css, custom_js
|
24 |
|
25 |
|
26 |
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
|
27 |
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
28 |
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
|
29 |
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
32 |
"backend.name": "Backend 🏭",
|
33 |
-
"backend.torch_dtype": "
|
34 |
"optimizations": "Optimizations 🛠️",
|
35 |
#
|
36 |
-
"tradeoff": "
|
37 |
#
|
38 |
-
"score": "Open LLM Score ⬆️",
|
39 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
|
40 |
"forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
|
41 |
-
"
|
|
|
42 |
}
|
43 |
-
|
|
|
|
|
44 |
"markdown",
|
|
|
45 |
"str",
|
46 |
"str",
|
47 |
"str",
|
48 |
#
|
49 |
-
"number",
|
50 |
#
|
51 |
"number",
|
52 |
"number",
|
53 |
"number",
|
54 |
-
"number",
|
55 |
]
|
56 |
-
SORTING_COLUMN = ["
|
57 |
-
|
58 |
|
59 |
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
|
60 |
|
@@ -65,17 +70,10 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
|
|
65 |
|
66 |
# load and merge
|
67 |
bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
|
68 |
-
scores_df = pd.read_csv(
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
bench_df = bench_df[bench_df["score"].notna()]
|
73 |
-
|
74 |
-
# create composite score
|
75 |
-
score_distance = 100 - bench_df["score"]
|
76 |
-
latency_distance = bench_df["generate.latency(s)"]
|
77 |
-
bench_df["tradeoff"] = (score_distance**2 + latency_distance**2) ** 0.5
|
78 |
-
bench_df["tradeoff"] = bench_df["tradeoff"].round(2)
|
79 |
|
80 |
# add optimizations
|
81 |
bench_df["optimizations"] = bench_df[
|
@@ -101,17 +99,19 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
|
|
101 |
|
102 |
def get_benchmark_table(bench_df):
|
103 |
# filter
|
104 |
-
bench_df = bench_df[list(
|
105 |
# rename
|
106 |
-
bench_df.rename(columns=
|
107 |
# sort
|
108 |
bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
|
109 |
# transform
|
110 |
-
bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(
|
111 |
-
bench_df["
|
112 |
-
|
|
|
|
|
|
|
113 |
)
|
114 |
-
|
115 |
return bench_df
|
116 |
|
117 |
|
@@ -122,12 +122,12 @@ def get_benchmark_plot(bench_df):
|
|
122 |
fig = px.scatter(
|
123 |
bench_df,
|
124 |
x="generate.latency(s)",
|
125 |
-
y="
|
126 |
color="model_type",
|
127 |
symbol="backend.name",
|
128 |
size="forward.peak_memory(MB)",
|
129 |
custom_data=[
|
130 |
-
"
|
131 |
"backend.name",
|
132 |
"backend.torch_dtype",
|
133 |
"optimizations",
|
@@ -158,12 +158,12 @@ def get_benchmark_plot(bench_df):
|
|
158 |
[
|
159 |
"Model: %{customdata[0]}",
|
160 |
"Backend: %{customdata[1]}",
|
161 |
-
"Datatype: %{customdata[2]}",
|
162 |
"Optimizations: %{customdata[3]}",
|
163 |
"Peak Memory (MB): %{customdata[4]}",
|
164 |
"Throughput (tokens/s): %{customdata[5]}",
|
165 |
"Per 1000 Tokens Latency (s): %{x}",
|
166 |
-
"Open LLM Score: %{y}",
|
167 |
]
|
168 |
)
|
169 |
)
|
@@ -183,7 +183,7 @@ def filter_query(
|
|
183 |
raw_df = get_benchmark_df(benchmark=benchmark)
|
184 |
|
185 |
filtered_df = raw_df[
|
186 |
-
raw_df["
|
187 |
& raw_df["backend.name"].isin(backends)
|
188 |
& raw_df["backend.torch_dtype"].isin(datatypes)
|
189 |
& (
|
@@ -197,7 +197,7 @@ def filter_query(
|
|
197 |
if len(optimizations) > 0
|
198 |
else True
|
199 |
)
|
200 |
-
& (raw_df["
|
201 |
& (raw_df["forward.peak_memory(MB)"] <= memory)
|
202 |
]
|
203 |
|
@@ -223,18 +223,18 @@ with demo:
|
|
223 |
|
224 |
# leaderboard tabs
|
225 |
with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
|
226 |
-
with gr.TabItem("🖥️ A100-80GB Leaderboard Table
|
227 |
gr.HTML(A100_TEXT)
|
228 |
|
229 |
# Original leaderboard table
|
230 |
A100_leaderboard = gr.components.Dataframe(
|
231 |
value=A100_table,
|
232 |
-
datatype=
|
233 |
-
headers=list(
|
234 |
elem_id="1xA100-table",
|
235 |
)
|
236 |
|
237 |
-
with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=
|
238 |
gr.HTML(A100_TEXT)
|
239 |
|
240 |
# Original leaderboard plot
|
@@ -244,7 +244,7 @@ with demo:
|
|
244 |
show_label=False,
|
245 |
)
|
246 |
|
247 |
-
with gr.TabItem("🎮 Control Panel 🎛️", id=
|
248 |
# control panel interface
|
249 |
with gr.Row():
|
250 |
with gr.Column(scale=1):
|
@@ -304,8 +304,8 @@ with demo:
|
|
304 |
elem_id="filter-button",
|
305 |
)
|
306 |
|
307 |
-
with gr.TabItem("❔ About 📖", id=
|
308 |
-
gr.Markdown(
|
309 |
|
310 |
demo.load(
|
311 |
change_tab,
|
|
|
4 |
import plotly.express as px
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
|
7 |
+
from src.assets.css_html_js import custom_css, custom_js
|
8 |
from src.assets.text_content import (
|
9 |
TITLE,
|
10 |
INTRODUCTION_TEXT,
|
11 |
A100_TEXT,
|
12 |
+
ABOUT_TEXT,
|
13 |
CITATION_BUTTON_LABEL,
|
14 |
CITATION_BUTTON_TEXT,
|
15 |
)
|
|
|
17 |
change_tab,
|
18 |
restart_space,
|
19 |
load_dataset_repo,
|
20 |
+
process_model_name,
|
21 |
+
process_model_type,
|
22 |
+
process_weight_class,
|
23 |
)
|
|
|
24 |
|
25 |
|
26 |
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
|
27 |
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
28 |
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
|
29 |
|
30 |
+
|
31 |
+
ALL_COLUMNS_MAPPING = {
|
32 |
+
"model_type": "Model Type 🤗",
|
33 |
+
"weight_class": "Weight Class 🏋️",
|
34 |
+
"best_scored_model": "Best Scored Model 🏆",
|
35 |
+
#
|
36 |
"backend.name": "Backend 🏭",
|
37 |
+
"backend.torch_dtype": "Dtype 📥",
|
38 |
"optimizations": "Optimizations 🛠️",
|
39 |
#
|
40 |
+
# "tradeoff": "Tradeoff* ⬇️",
|
41 |
#
|
|
|
42 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
|
43 |
"forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
|
44 |
+
"best_score": "Score (%) ⬆️",
|
45 |
+
#
|
46 |
}
|
47 |
+
ALL_COLUMNS_DATATYPES = [
|
48 |
+
"str",
|
49 |
+
"str",
|
50 |
"markdown",
|
51 |
+
#
|
52 |
"str",
|
53 |
"str",
|
54 |
"str",
|
55 |
#
|
56 |
+
# "number",
|
57 |
#
|
58 |
"number",
|
59 |
"number",
|
60 |
"number",
|
|
|
61 |
]
|
62 |
+
SORTING_COLUMN = ["Score (%) ⬆️"]
|
|
|
63 |
|
64 |
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
|
65 |
|
|
|
70 |
|
71 |
# load and merge
|
72 |
bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
|
73 |
+
scores_df = pd.read_csv(
|
74 |
+
f"./llm-perf-dataset/reports/Grouped-Open-LLM-Leaderboard.csv"
|
75 |
+
)
|
76 |
+
bench_df = bench_df.merge(scores_df, left_on="model", right_on="best_scored_model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# add optimizations
|
79 |
bench_df["optimizations"] = bench_df[
|
|
|
99 |
|
100 |
def get_benchmark_table(bench_df):
|
101 |
# filter
|
102 |
+
bench_df = bench_df[list(ALL_COLUMNS_MAPPING.keys())]
|
103 |
# rename
|
104 |
+
bench_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
|
105 |
# sort
|
106 |
bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
|
107 |
# transform
|
108 |
+
bench_df["Model Type 🤗"] = bench_df["Model Type 🤗"].apply(process_model_type)
|
109 |
+
bench_df["Weight Class 🏋️"] = bench_df["Weight Class 🏋️"].apply(
|
110 |
+
process_weight_class
|
111 |
+
)
|
112 |
+
bench_df["Best Scored Model 🏆"] = bench_df["Best Scored Model 🏆"].apply(
|
113 |
+
process_model_name
|
114 |
)
|
|
|
115 |
return bench_df
|
116 |
|
117 |
|
|
|
122 |
fig = px.scatter(
|
123 |
bench_df,
|
124 |
x="generate.latency(s)",
|
125 |
+
y="best_score",
|
126 |
color="model_type",
|
127 |
symbol="backend.name",
|
128 |
size="forward.peak_memory(MB)",
|
129 |
custom_data=[
|
130 |
+
"best_scored_model",
|
131 |
"backend.name",
|
132 |
"backend.torch_dtype",
|
133 |
"optimizations",
|
|
|
158 |
[
|
159 |
"Model: %{customdata[0]}",
|
160 |
"Backend: %{customdata[1]}",
|
161 |
+
"Load Datatype: %{customdata[2]}",
|
162 |
"Optimizations: %{customdata[3]}",
|
163 |
"Peak Memory (MB): %{customdata[4]}",
|
164 |
"Throughput (tokens/s): %{customdata[5]}",
|
165 |
"Per 1000 Tokens Latency (s): %{x}",
|
166 |
+
"Open LLM Score (%): %{y}",
|
167 |
]
|
168 |
)
|
169 |
)
|
|
|
183 |
raw_df = get_benchmark_df(benchmark=benchmark)
|
184 |
|
185 |
filtered_df = raw_df[
|
186 |
+
raw_df["best_scored_model"].str.lower().str.contains(text.lower())
|
187 |
& raw_df["backend.name"].isin(backends)
|
188 |
& raw_df["backend.torch_dtype"].isin(datatypes)
|
189 |
& (
|
|
|
197 |
if len(optimizations) > 0
|
198 |
else True
|
199 |
)
|
200 |
+
& (raw_df["best_score"] >= score)
|
201 |
& (raw_df["forward.peak_memory(MB)"] <= memory)
|
202 |
]
|
203 |
|
|
|
223 |
|
224 |
# leaderboard tabs
|
225 |
with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
|
226 |
+
with gr.TabItem("🖥️ A100-80GB Leaderboard Table 🏅", id=0):
|
227 |
gr.HTML(A100_TEXT)
|
228 |
|
229 |
# Original leaderboard table
|
230 |
A100_leaderboard = gr.components.Dataframe(
|
231 |
value=A100_table,
|
232 |
+
datatype=ALL_COLUMNS_DATATYPES,
|
233 |
+
headers=list(ALL_COLUMNS_MAPPING.values()),
|
234 |
elem_id="1xA100-table",
|
235 |
)
|
236 |
|
237 |
+
with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=2):
|
238 |
gr.HTML(A100_TEXT)
|
239 |
|
240 |
# Original leaderboard plot
|
|
|
244 |
show_label=False,
|
245 |
)
|
246 |
|
247 |
+
with gr.TabItem("🎮 Control Panel 🎛️", id=3):
|
248 |
# control panel interface
|
249 |
with gr.Row():
|
250 |
with gr.Column(scale=1):
|
|
|
304 |
elem_id="filter-button",
|
305 |
)
|
306 |
|
307 |
+
with gr.TabItem("❔ About 📖", id=4):
|
308 |
+
gr.Markdown(ABOUT_TEXT)
|
309 |
|
310 |
demo.load(
|
311 |
change_tab,
|
src/assets/css_html_js.py
CHANGED
@@ -25,8 +25,8 @@ custom_css = """
|
|
25 |
border: none;
|
26 |
}
|
27 |
|
28 |
-
table td:
|
29 |
-
table th:
|
30 |
max-width: 300px;
|
31 |
overflow: auto;
|
32 |
white-space: nowrap;
|
|
|
25 |
border: none;
|
26 |
}
|
27 |
|
28 |
+
table td:nth-child(3),
|
29 |
+
table th:nth-child(3) {
|
30 |
max-width: 300px;
|
31 |
overflow: auto;
|
32 |
white-space: nowrap;
|
src/assets/text_content.py
CHANGED
@@ -15,14 +15,12 @@ A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
|
|
15 |
</ul>
|
16 |
"""
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
<li>Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.</li>
|
25 |
-
</ul>
|
26 |
"""
|
27 |
|
28 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
|
|
15 |
</ul>
|
16 |
"""
|
17 |
|
18 |
+
ABOUT_TEXT = """<h3>About the benchmarks:</h3>
|
19 |
+
- The performances benchmarks were obtained using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
|
20 |
+
- Throughput is measured in tokens per second when generating 1000 tokens with a batch size of 1.
|
21 |
+
- Peak memory is measured in MB during the first forward pass of the model (no warmup).
|
22 |
+
- Open LLM Score is an average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
|
23 |
+
- Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.
|
|
|
|
|
24 |
"""
|
25 |
|
26 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
src/utils.py
CHANGED
@@ -22,70 +22,55 @@ def restart_space(LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN):
|
|
22 |
|
23 |
|
24 |
def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
|
25 |
-
|
26 |
if OPTIMUM_TOKEN:
|
27 |
print("Loading LLM-Perf-Dataset from Hub...")
|
28 |
-
|
29 |
local_dir="./llm-perf-dataset",
|
30 |
clone_from=LLM_PERF_DATASET_REPO,
|
31 |
token=OPTIMUM_TOKEN,
|
32 |
repo_type="dataset",
|
33 |
)
|
34 |
-
|
35 |
|
36 |
-
return
|
37 |
|
38 |
|
39 |
-
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
|
54 |
|
55 |
def model_hyperlink(link, model_name):
|
56 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
57 |
|
58 |
|
59 |
-
def
|
60 |
link = f"https://huggingface.co/{model_name}"
|
61 |
-
|
62 |
-
if model_name in LLAMAS:
|
63 |
-
link = LLAMA_LINK
|
64 |
-
model_name = model_name.split("/")[1]
|
65 |
-
elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
|
66 |
-
link = VICUNA_LINK
|
67 |
-
model_name = "stable-vicuna-13b"
|
68 |
-
elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
|
69 |
-
link = ALPACA_LINK
|
70 |
-
model_name = "alpaca-13b"
|
71 |
-
if model_name == "dolly-12b":
|
72 |
-
link = DOLLY_LINK
|
73 |
-
elif model_name == "vicuna-13b":
|
74 |
-
link = VICUNA_LINK
|
75 |
-
elif model_name == "koala-13b":
|
76 |
-
link = KOALA_LINK
|
77 |
-
elif model_name == "oasst-12b":
|
78 |
-
link = OASST_LINK
|
79 |
-
|
80 |
return model_hyperlink(link, model_name)
|
81 |
|
82 |
|
83 |
-
def
|
84 |
-
|
85 |
-
|
|
|
|
|
86 |
|
87 |
|
88 |
-
def
|
89 |
if num < 1000:
|
90 |
return str(int(num))
|
91 |
elif num < 1000000:
|
|
|
22 |
|
23 |
|
24 |
def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
|
25 |
+
llm_perf_dataset_repo = None
|
26 |
if OPTIMUM_TOKEN:
|
27 |
print("Loading LLM-Perf-Dataset from Hub...")
|
28 |
+
llm_perf_dataset_repo = Repository(
|
29 |
local_dir="./llm-perf-dataset",
|
30 |
clone_from=LLM_PERF_DATASET_REPO,
|
31 |
token=OPTIMUM_TOKEN,
|
32 |
repo_type="dataset",
|
33 |
)
|
34 |
+
llm_perf_dataset_repo.git_pull()
|
35 |
|
36 |
+
return llm_perf_dataset_repo
|
37 |
|
38 |
|
39 |
+
LLM_MODEL_TYPES = {
|
40 |
+
"gpt_bigcode": "GPT-BigCode 🌸",
|
41 |
+
"RefinedWebModel": "Falcon 🦅",
|
42 |
+
"RefinedWeb": "Falcon 🦅",
|
43 |
+
"baichuan": "Baichuan 🌊",
|
44 |
+
"llama": "LLaMA 🦙",
|
45 |
+
"gpt_neox": "GPT-NeoX",
|
46 |
+
"gpt_neo": "GPT-Neo",
|
47 |
+
"codegen": "CodeGen",
|
48 |
+
"chatglm": "ChatGLM",
|
49 |
+
"gpt2": "GPT-2",
|
50 |
+
"gptj": "GPT-J",
|
51 |
+
"xglm": "XGLM",
|
52 |
+
"opt": "OPT",
|
53 |
+
"mpt": "MPT",
|
54 |
+
}
|
55 |
|
56 |
|
57 |
def model_hyperlink(link, model_name):
|
58 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
59 |
|
60 |
|
61 |
+
def process_model_name(model_name):
|
62 |
link = f"https://huggingface.co/{model_name}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
return model_hyperlink(link, model_name)
|
64 |
|
65 |
|
66 |
+
def process_model_type(model_type):
|
67 |
+
if model_type in LLM_MODEL_TYPES:
|
68 |
+
return LLM_MODEL_TYPES[model_type]
|
69 |
+
else:
|
70 |
+
return model_type
|
71 |
|
72 |
|
73 |
+
def process_weight_class(num):
|
74 |
if num < 1000:
|
75 |
return str(int(num))
|
76 |
elif num < 1000000:
|