Commit 3d7033f
Parent(s): 16a8bbd

update

Files changed:
- app.py (+38 -61)
- script.py (+14 -0)
- src/utils.py (+10 -5)
app.py
CHANGED
@@ -21,33 +21,27 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
 ALL_COLUMNS_MAPPING = {
-    # model
     "Model": "Model 🤗",
     "Arch": "Arch 🏛️",
-    "Size": "…
+    "Size": "Params (B) 📏",
     # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
-    "…
+    "optimization": "Optimization 🛠️",
     "quantization": "Quantization 🗜️",
-    # …
-    "Score": "…
-    # throughput measurements
+    # measurements
+    "Score": "Open LLM Score (%) ⬆️",
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
-    # latency measurements
     "forward.latency(s)": "Prefill Latency (s) ⬇️",
     "generate.latency(s)": "E2E Latency (s) ⬇️",
-    # memory measurements
     "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
     "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
     "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
-    # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
 }
 SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
 SORTING_ASCENDING = [False, True]
-
 ALL_COLUMNS_DATATYPES = [
     # open llm
     "markdown",
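Note on the hunk above: ALL_COLUMNS_MAPPING pairs raw benchmark column names with the display names shown in the leaderboard table, and SORTING_COLUMN / SORTING_ASCENDING order it by Score and throughput. The diff does not show where the mapping is applied; a minimal pandas sketch of the usual select-and-rename pattern (toy data, and the rename call is an assumption, not code from app.py):

# Illustrative sketch only; the toy DataFrame and the rename call are assumptions.
import pandas as pd

raw = pd.DataFrame({"Model": ["org/model-7b"], "generate.latency(s)": [1.2]})
mapping = {"Model": "Model 🤗", "generate.latency(s)": "E2E Latency (s) ⬇️"}
display = raw[list(mapping.keys())].rename(columns=mapping)
print(display.columns.tolist())  # ['Model 🤗', 'E2E Latency (s) ⬇️']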
@@ -70,17 +64,18 @@ ALL_COLUMNS_DATATYPES = [
     "number",
     "number",
 ]
+# download data
+hf_hub_download(
+    repo_id="optimum/llm-perf-dataset",
+    filename="open-llm.csv",
+    local_dir="dataset",
+    repo_type="dataset",
+    token=HF_TOKEN,
+)
+OPEN_LLM = pd.read_csv("dataset/open-llm.csv")
 
-
-
-    # download data
-    hf_hub_download(
-        repo_id="optimum/llm-perf-dataset",
-        filename="open-llm.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
+MACHINE_TO_DATAFRAME = {}
+for machine in MACHINE_TO_HARDWARE:
     hf_hub_download(
         repo_id="optimum/llm-perf-dataset",
         filename=f"{machine}/full-report.csv",
@@ -88,11 +83,13 @@ def get_benchmark_df(machine="hf-dgx-01"):
         repo_type="dataset",
         token=HF_TOKEN,
     )
-
-
+    MACHINE_TO_DATAFRAME[machine] = pd.read_csv(f"dataset/{machine}/full-report.csv")
+
 
+def get_benchmark_df(machine="hf-dgx-01"):
     # merge on model
-    …
+    llm_perf = MACHINE_TO_DATAFRAME[machine].copy()
+    merged_df = OPEN_LLM.merge(llm_perf, left_on="Model", right_on="model")
     # transpose energy consumption
     merged_df["generate.energy_consumption(tokens/kWh)"] = (
         1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
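The two hunks above move data loading from request time to import time: open-llm.csv is downloaded and parsed once into OPEN_LLM, each machine's full-report.csv is cached in MACHINE_TO_DATAFRAME, and get_benchmark_df now only copies the cached frame and merges it with the Open LLM scores. A self-contained sketch of that caching pattern, using invented in-memory frames in place of the downloaded CSVs (column names beyond those visible in the diff are assumptions):

# Minimal sketch of the preloading pattern with dummy data.
import pandas as pd

OPEN_LLM = pd.DataFrame({"Model": ["org/model-7b"], "Score": [55.0]})
MACHINE_TO_DATAFRAME = {
    "hf-dgx-01": pd.DataFrame({"model": ["org/model-7b"], "generate.latency(s)": [1.2]}),
}


def get_benchmark_df(machine="hf-dgx-01"):
    # reuse the preloaded per-machine report instead of re-downloading it per call
    llm_perf = MACHINE_TO_DATAFRAME[machine].copy()
    return OPEN_LLM.merge(llm_perf, left_on="Model", right_on="model")


print(get_benchmark_df().shape)  # (1, 4)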
@@ -102,8 +99,8 @@ def get_benchmark_df(machine="hf-dgx-01"):
         merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
         "generate.energy_consumption(tokens/kWh)",
     ] = pd.NA
-    # add …
-    merged_df["…
+    # add optimization column
+    merged_df["optimization"] = merged_df[
         ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
     ].apply(
         lambda x: "BetterTransformer"
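The new optimization column is derived row-wise from the two backend flags; the diff truncates the lambda after its first branch ("BetterTransformer"). A hedged sketch of how such a mapping could be completed — the FlashAttentionV2 and None branches below are assumptions, guided by the checkbox choices added later in this commit:

# Sketch only: the branches other than "BetterTransformer" are assumed, not shown in the diff.
import pandas as pd

df = pd.DataFrame(
    {
        "backend.to_bettertransformer": [True, False, False],
        "backend.use_flash_attention_2": [False, True, False],
    }
)
df["optimization"] = df[
    ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
].apply(
    lambda x: "BetterTransformer"
    if x["backend.to_bettertransformer"]
    else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
    axis=1,
)
print(df["optimization"].tolist())  # ['BetterTransformer', 'FlashAttentionV2', 'None']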
@@ -135,10 +132,10 @@ def get_benchmark_table(bench_df):
     copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
     copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
     # process quantization
-    copy_df["…
-        lambda x: f"{x['…
+    copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
+        lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
         if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
-        else x["…
+        else x["Open LLM Score (%) ⬆️"],
         axis=1,
     )
     return copy_df
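The hunk above appends ** to the Open LLM score of rows whose quantization is BnB.4bit or GPTQ.4bit, so 4-bit entries are visibly flagged in the table. A runnable toy equivalent (DataFrame contents invented for illustration):

# Toy reproduction of the marking logic with invented rows.
import pandas as pd

copy_df = pd.DataFrame(
    {"Open LLM Score (%) ⬆️": [55.0, 48.2], "Quantization 🗜️": ["None", "GPTQ.4bit"]}
)
copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
    lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
    if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
    else x["Open LLM Score (%) ⬆️"],
    axis=1,
)
print(copy_df["Open LLM Score (%) ⬆️"].tolist())  # [55.0, '48.2**']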
@@ -151,7 +148,7 @@ def get_benchmark_chart(bench_df):
     # plot
     fig = px.scatter(
         copy_df,
-        y="…
+        y="Open LLM Score (%) ⬆️",
         x="E2E Latency (s) ⬇️",
         size="Allocated Memory (MB) ⬇️",
         color="Arch 🏛️",
@@ -167,7 +164,7 @@ def get_benchmark_chart(bench_df):
             "yanchor": "top",
         },
         xaxis_title="Per 1000 Tokens Latency (s)",
-        yaxis_title="…
+        yaxis_title="Open LLM Score (%)",
         legend_title="LLM Architecture",
         width=1200,
         height=600,
@@ -188,7 +185,7 @@ def filter_query(
     backends,
     datatypes,
     optimizations,
-    …
+    quantizations,
     score,
     memory,
     machine,
@@ -198,29 +195,9 @@ def filter_query(
         raw_df["Model 🤗"].str.contains(text, case=False)
         & raw_df["Backend 🏭"].isin(backends)
         & raw_df["Dtype 📥"].isin(datatypes)
-        & (
-            pd.concat(
-                [
-                    raw_df["Optimizations 🛠️"].str.contains(optimization, case=False)
-                    for optimization in optimizations
-                ],
-                axis=1,
-            ).any(axis="columns")
-            if len(optimizations) > 0
-            else True
-        )
-        & (
-            pd.concat(
-                [
-                    raw_df["Quantization 🗜️"].str.contains(quantization, case=False)
-                    for quantization in quantization_scheme
-                ],
-                axis=1,
-            ).any(axis="columns")
-            if len(quantization_scheme) > 0
-            else True
-        )
-        & (raw_df["Avg Score (%) ⬆️"] >= score)
+        & raw_df["Optimization 🛠️"].isin(optimizations)
+        & raw_df["Quantization 🗜️"].isin(quantizations)
+        & (raw_df["Open LLM Score (%) ⬆️"] >= score)
         & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
     ]
     filtered_table = get_benchmark_table(filtered_df)
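This hunk replaces the per-choice substring matching (a pd.concat of str.contains masks reduced with any) with exact membership tests, which works now that each row carries a single Optimization and Quantization value. A toy comparison of the two styles on invented data:

# Invented data; illustrates why .isin suffices once the column holds one value per row.
import pandas as pd

raw_df = pd.DataFrame({"Optimization 🛠️": ["None", "BetterTransformer", "FlashAttentionV2"]})
selected = ["None", "BetterTransformer"]

# old style: any of the selected strings appears as a substring
old_mask = pd.concat(
    [raw_df["Optimization 🛠️"].str.contains(choice, case=False) for choice in selected],
    axis=1,
).any(axis="columns")

# new style: exact membership
new_mask = raw_df["Optimization 🛠️"].isin(selected)

print(old_mask.tolist(), new_mask.tolist())  # [True, True, False] [True, True, False]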
@@ -289,7 +266,7 @@ with demo:
     with gr.Row():
         with gr.Column(scale=1):
             score_slider = gr.Slider(
-                label="Open LLM Score 📈",
+                label="Open LLM Score (%) 📈",
                 info="🎚️ Slide to minimum Open LLM score",
                 value=0,
                 elem_id="threshold-slider",
@@ -321,12 +298,12 @@ with demo:
                 elem_id="dtype-checkboxes",
             )
         with gr.Column(scale=1):
-            …
+            optimization_checkboxes = gr.CheckboxGroup(
                 label="Optimizations 🛠️",
-                choices=["None", "BetterTransformer"],
-                value=["None", "BetterTransformer"],
-                info="☑️ Select the …
-                elem_id="…
+                choices=["None", "BetterTransformer", "FlashAttentionV2"],
+                value=["None", "BetterTransformer", "FlashAttentionV2"],
+                info="☑️ Select the optimization",
+                elem_id="optimization-checkboxes",
             )
         with gr.Column(scale=1):
             quantization_checkboxes = gr.CheckboxGroup(
@@ -348,7 +325,7 @@ with demo:
             search_bar,
             backend_checkboxes,
             datatype_checkboxes,
-            …
+            optimization_checkboxes,
             quantization_checkboxes,
             score_slider,
             memory_slider,
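The list above feeds the filter callback, so its order must mirror filter_query's parameter order; optimization_checkboxes is inserted right before quantization_checkboxes to match the new quantizations parameter added earlier. A minimal, self-contained Gradio sketch of that positional wiring (widget set reduced and names invented, not taken from app.py):

# Minimal Blocks demo: inputs[i] is passed as the i-th positional argument of the callback.
import gradio as gr


def filter_query(text, backends, optimizations):
    return f"text={text!r}, backends={backends}, optimizations={optimizations}"


with gr.Blocks() as demo:
    search_bar = gr.Textbox(label="Search 🔍")
    backend_checkboxes = gr.CheckboxGroup(choices=["pytorch"], value=["pytorch"], label="Backends 🏭")
    optimization_checkboxes = gr.CheckboxGroup(
        choices=["None", "BetterTransformer", "FlashAttentionV2"],
        value=["None", "BetterTransformer", "FlashAttentionV2"],
        label="Optimizations 🛠️",
    )
    output_box = gr.Textbox(label="filter_query arguments")
    for widget in (search_bar, backend_checkboxes, optimization_checkboxes):
        widget.change(
            filter_query,
            inputs=[search_bar, backend_checkboxes, optimization_checkboxes],
            outputs=output_box,
        )

if __name__ == "__main__":
    demo.launch()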
script.py
ADDED
@@ -0,0 +1,14 @@
+from huggingface_hub import hf_hub_download
+import pandas as pd
+
+
+hf_hub_download(
+    repo_id="optimum/llm-perf-dataset",
+    filename="open-llm.csv",
+    local_dir="dataset",
+    repo_type="dataset",
+)
+
+open_llm = pd.read_csv("dataset/open-llm.csv")
+print(open_llm["Arch"].unique())
+print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())
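The new script.py is a small exploration helper: it pulls open-llm.csv from the optimum/llm-perf-dataset repo and prints the distinct Arch values plus the models reported under the rwkv architecture, presumably to check which architecture identifiers still need display labels in src/utils.py. Run it from the repo root with python script.py. A hypothetical variation (not part of the commit) that tallies models per architecture instead:

# Hypothetical variation on script.py: count models per architecture.
from huggingface_hub import hf_hub_download
import pandas as pd

hf_hub_download(
    repo_id="optimum/llm-perf-dataset",
    filename="open-llm.csv",
    local_dir="dataset",
    repo_type="dataset",
)
open_llm = pd.read_csv("dataset/open-llm.csv")
print(open_llm["Arch"].value_counts())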
src/utils.py
CHANGED
@@ -18,26 +18,31 @@ def change_tab(query_param):
 
 
 LLM_MODEL_ARCHS = {
-    …
-    "gpt_bigcode": "GPT-BigCode 🌸",
+    "mixformer-sequential": "Phi φ",
     "RefinedWebModel": "Falcon 🦅",
+    "gpt_bigcode": "StarCoder ⭐",
     "RefinedWeb": "Falcon 🦅",
     "baichuan": "Baichuan 🌊",
+    "mistral": "Mistral Ⓜ️",
+    "codegen": "CodeGen ♾️",
+    "falcon": "Falcon 🦅",
     "bloom": "Bloom 🌸",
     "llama": "LLaMA 🦙",
-    …
+    "mpt": "MPT 🧱",
+    "Yi": "Yi 人",
+    # suggest something
+    "stablelm_epoch": "StableLM-Epoch",
     "stablelm_alpha": "StableLM-Alpha",
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
-    "codegen": "CodeGen",
     "chatglm": "ChatGLM",
+    "internlm": "InternLM",
     "gpt2": "GPT-2",
     "gptj": "GPT-J",
     "xglm": "XGLM",
     "rwkv": "RWKV",
     "bart": "BART",
     "opt": "OPT",
-    "mpt": "MPT",
 }
 
 
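LLM_MODEL_ARCHS maps the raw architecture identifiers reported in the dataset to the display labels used by the leaderboard (the # suggest something comment marks labels still to be decided). process_model_arch itself is not shown in this diff; a plausible shape for it is a plain dictionary lookup with a passthrough fallback, sketched below as an assumption rather than the actual implementation:

# Hedged sketch: process_model_arch is not part of this diff; this is one plausible implementation.
LLM_MODEL_ARCHS = {"llama": "LLaMA 🦙", "rwkv": "RWKV"}


def process_model_arch(arch: str) -> str:
    # fall back to the raw identifier when no curated label exists yet
    return LLM_MODEL_ARCHS.get(arch, arch)


print(process_model_arch("llama"), process_model_arch("some_new_arch"))  # LLaMA 🦙 some_new_arch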