Commit a1135a9
Parent(s): 8b4dd08

update the leaderboard

Files changed:
- src/bettertransformer.py +3 -8
- src/content.py +16 -10
- src/control_panel.py +7 -7
- src/flashattentionv2.py +3 -8
- src/llm_perf.py +8 -16
- src/utils.py +22 -1
src/bettertransformer.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 BETTERTRANSFORMER_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ BETTERTRANSFORMER_DATA = [
 
 def get_bt_df(llm_perf_df):
     bt_df = llm_perf_df.copy()
-    # process
-    bt_df["Arch 🏛️"] = bt_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from BetterTransformer experiments
-    original_df = bt_df[bt_df["Optimization 🛠️"] == "None"]
+    original_df = bt_df[(bt_df["Optimization 🛠️"] == "None") & (bt_df["DType 📥"] == "float16")]
     bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
     # merge the two dataframes
     bt_df = pd.merge(
@@ -48,10 +43,10 @@ def get_bt_df(llm_perf_df):
     # compute speedups
     bt_df["Prefill Latency Speedup (%)"] = (
         (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
-    ).round(2)
+    ).round(2) - 100
     bt_df["Decode Throughput Speedup (%)"] = (
         (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
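The functional change in this file is the speedup formula: the raw ratio (where an unchanged latency reads as 100%) becomes a relative gain (where unchanged reads as 0%). A minimal sketch with made-up latency numbers, just to illustrate the difference:

```python
import pandas as pd

# Hypothetical measurements: one model benchmarked with and without BetterTransformer.
df = pd.DataFrame(
    {
        "Prefill Latency (s)": [0.80],                    # baseline run (Optimization == "None", float16)
        "Prefill Latency (s) BetterTransformer": [0.50],  # optimized run, merged in with a column suffix
    }
)

# Old formula: a pure ratio, so "no change" shows up as 100%.
old_speedup = ((df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"]) * 100).round(2)

# New formula (this commit): subtract 100 so "no change" shows up as 0% and the value reads as a gain.
new_speedup = ((df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"]) * 100).round(2) - 100

print(old_speedup.iloc[0])  # 160.0 -> "1.6x as fast"
print(new_speedup.iloc[0])  # 60.0  -> "60% faster"
```

The decode-throughput column gets the same subtraction, so the table now reports percentage gains over the float16 baseline selected above.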
src/content.py CHANGED
@@ -14,7 +14,7 @@ ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a
+<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.</li>
 <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
 <li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.</li>
 </ul>
@@ -28,19 +28,26 @@ defaults:
   - _base_ # inheriting from base config
   - _self_ # for hydra 1.1 compatibility
 
-experiment_name: pytorch+cuda+float16+
+experiment_name: pytorch+cuda+float16+gptq-4bit+exllama-v1
 device: cuda
 
 backend:
   no_weights: true
   torch_dtype: float16
-
+  quantization_scheme: gptq
+  quantization_config:
+    bits: 4
+    use_cuda_fp16: false
+    use_exllama: true
+    exllama_config:
+      version: 1
 ```
 
 Where the base config is:
 ```yaml
 defaults:
   - benchmark: inference # default benchmark
+  - launcher: process # isolated process launcher
   - experiment # inheriting from experiment config
   - _self_ # for hydra 1.1 compatibility
   - override hydra/job_logging: colorlog # colorful logging
@@ -48,30 +55,29 @@ defaults:
 
 hydra:
   run:
-    dir:
+    dir: dataset/${oc.env:HOSTNAME}/${experiment_name}/${model}
   job:
     chdir: true
     env_set:
+      COUNTRY_ISO_CODE: FRA
+      OVERRIDE_BENCHMARKS: 0
       CUDA_VISIBLE_DEVICES: 0
       CUDA_DEVICE_ORDER: PCI_BUS_ID
 
-model: ???
-experiment_name: ???
-
 backend:
-
-  continous_isolation_check: true
+  continuous_isolation: true
 
 benchmark:
   duration: 10
   memory: true
   energy: true
 
-  new_tokens: 1000
   input_shapes:
     batch_size: 1
     sequence_length: 256
 
+  new_tokens: 256
+
 hub_kwargs:
   trust_remote_code: true
 ```
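The new quantization_config block is the source of the flattened backend.quantization_config.* columns that src/llm_perf.py reads further down. A small sketch of that dotted naming convention using pandas.json_normalize; the use of json_normalize here is purely illustrative of the naming, not a claim about how the dataset is actually produced:

```python
import pandas as pd
import yaml  # PyYAML

# Trimmed-down copy of the backend section shown above (values taken from the diff).
config_yaml = """
backend:
  no_weights: true
  torch_dtype: float16
  quantization_scheme: gptq
  quantization_config:
    bits: 4
    use_cuda_fp16: false
    use_exllama: true
    exllama_config:
      version: 1
"""

config = yaml.safe_load(config_yaml)

# json_normalize flattens nested mappings into dot-separated columns, which matches
# the backend.* column names consumed in src/llm_perf.py.
flat = pd.json_normalize(config, sep=".")
print(flat["backend.quantization_scheme"].iloc[0])                         # gptq
print(flat["backend.quantization_config.exllama_config.version"].iloc[0])  # 1
```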
src/control_panel.py CHANGED
@@ -39,17 +39,17 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             backend_checkboxes = gr.CheckboxGroup(
                 label="Backends 🏭",
-                choices=["pytorch"
-                value=["pytorch"
+                choices=["pytorch"],
+                value=["pytorch"],
                 info="☑️ Select the backends",
                 elem_id="backend-checkboxes",
             )
     with gr.Row():
         with gr.Column(scale=1):
             datatype_checkboxes = gr.CheckboxGroup(
-                label="DTypes 📥",
-                choices=["float32", "float16"],
-                value=["float32", "float16"],
+                label="Load DTypes 📥",
+                choices=["float32", "float16", "bfloat16"],
+                value=["float32", "float16", "bfloat16"],
                 info="☑️ Select the load data types",
                 elem_id="dtype-checkboxes",
             )
@@ -64,8 +64,8 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             quantization_checkboxes = gr.CheckboxGroup(
                 label="Quantizations 🗜️",
-                choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
-                value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                choices=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                value=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
                 info="☑️ Select the quantization schemes",
                 elem_id="quantization-checkboxes",
             )
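For reference, the expanded quantization filter in isolation. This is a stripped-down sketch: the echo Textbox and the change handler are invented for the demo, while the real control panel wires the checkboxes into the leaderboard's dataframe filtering.

```python
import gradio as gr

# Choices mirror the expanded list introduced in this commit.
QUANTS = ["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"]

with gr.Blocks() as demo:
    quantization_checkboxes = gr.CheckboxGroup(
        label="Quantizations 🗜️",
        choices=QUANTS,
        value=QUANTS,  # everything selected by default, as in the control panel
        info="☑️ Select the quantization schemes",
        elem_id="quantization-checkboxes",
    )
    selection = gr.Textbox(label="Current selection")
    # Echo the selection so the standalone demo does something observable.
    quantization_checkboxes.change(lambda x: ", ".join(x), quantization_checkboxes, selection)

if __name__ == "__main__":
    demo.launch()
```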
src/flashattentionv2.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ FLASHATTENTIONV2_DATA = [
 
 def get_fa2_df(llm_perf_df):
    fa2_df = llm_perf_df.copy()
-    # process
-    fa2_df["Arch 🏛️"] = fa2_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from FlashAttentionV2 experiments
-    original_df = fa2_df[fa2_df["Optimization 🛠️"] == "None"]
+    original_df = fa2_df[(fa2_df["Optimization 🛠️"] == "None") & (fa2_df["DType 📥"] == "float16")]
     fa2_df = fa2_df[fa2_df["Optimization 🛠️"] == "FlashAttentionV2"]
     # merge the two dataframes
     fa2_df = pd.merge(
@@ -48,10 +43,10 @@ def get_fa2_df(llm_perf_df):
     # compute speedups
     fa2_df["Prefill Latency Speedup (%)"] = (
         (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
-    ).round(2)
+    ).round(2) - 100
     fa2_df["Decode Throughput Speedup (%)"] = (
         (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
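As in get_bt_df above, the original and optimized rows are merged before the speedups are computed. The merge keys and suffixes are not visible in this hunk, so the ones below are assumptions chosen only to reproduce the suffixed column names that the speedup code references:

```python
import pandas as pd

# Hypothetical rows: the same model benchmarked without and with FlashAttentionV2.
original_df = pd.DataFrame({"Model 🤗": ["m"], "Decode Throughput (tokens/s)": [40.0]})
fa2_df = pd.DataFrame({"Model 🤗": ["m"], "Decode Throughput (tokens/s)": [60.0]})

# The suffix is what produces columns such as "Decode Throughput (tokens/s) FlashAttentionV2".
merged = pd.merge(original_df, fa2_df, on="Model 🤗", suffixes=["", " FlashAttentionV2"])

gain = (
    (merged["Decode Throughput (tokens/s) FlashAttentionV2"] / merged["Decode Throughput (tokens/s)"]) * 100
).round(2) - 100
print(gain.iloc[0])  # 50.0 -> 50% higher decode throughput
```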
src/llm_perf.py CHANGED
@@ -3,6 +3,8 @@ import os
 import pandas as pd
 from huggingface_hub import hf_hub_download
 
+from .utils import process_quantization_scheme, process_arch
+
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
@@ -91,24 +93,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
     llm_perf_df["quantization"] = llm_perf_df[
         [
             "backend.quantization_scheme",
+            "backend.quantization_config.bits",
+            "backend.quantization_config.load_in_4bit",
+            "backend.quantization_config.load_in_8bit",
             "backend.quantization_config.exllama_config.version",
         ]
-    ].apply(
-
-
-        else (
-            "GPTQ.4bit+ExllamaV1"
-            if (x["backend.quantization_scheme"] == "gptq")
-            and (x["backend.quantization_config.exllama_config.version"] == 1)
-            else (
-                "GPTQ.4bit+ExllamaV2"
-                if (x["backend.quantization_scheme"] == "gptq")
-                and (x["backend.quantization_config.exllama_config.version"] == 2)
-                else "None"
-            )
-        ),
-        axis=1,
-    )
+    ].apply(lambda x: process_quantization_scheme(x), axis=1)
+    # add arch
+    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # add decode throughput
     llm_perf_df["decode.throughput(tokens/s)"] = (
         1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
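The inline nested conditional expression is replaced by a row-wise apply over the flattened quantization columns, delegating to the process_quantization_scheme helper added to src/utils.py (shown below). A toy example of that pattern, with hypothetical rows and an import path that assumes the repository root is on sys.path:

```python
import pandas as pd

from src.utils import process_quantization_scheme  # assumes running from the repository root

# Hypothetical rows using the flattened column names consumed above.
rows = pd.DataFrame(
    {
        "backend.quantization_scheme": ["bnb", "gptq", None],
        "backend.quantization_config.bits": [4, 4, None],
        "backend.quantization_config.load_in_4bit": [True, None, None],
        "backend.quantization_config.load_in_8bit": [None, None, None],
        "backend.quantization_config.exllama_config.version": [None, 2, None],
    }
)

# axis=1 passes each row to the helper as a Series, indexable by column name like a dict.
labels = rows.apply(lambda x: process_quantization_scheme(x), axis=1)
print(labels.tolist())  # ['BnB.4bit', 'GPTQ.4bit+ExllamaV2', 'None']
```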
src/utils.py CHANGED
@@ -8,14 +8,16 @@ LLM_MODEL_ARCHS = {
     "baichuan": "🌊 Baichuan 百川", # river
     "internlm": "🧑‍🎓 InternLM 书生", # scholar
     "mistral": "Ⓜ️ Mistral",
+    "mixtral": "Ⓜ️ Mixtral",
     "codegen": "♾️ CodeGen",
     "chatglm": "💬 ChatGLM",
     "falcon": "🦅 Falcon",
     "bloom": "🌸 Bloom",
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
+    "deci": "🔵 deci",
+    "Yi": "🫂 Yi 人", # people
     "mpt": "🧱 MPT",
-    "Yi": "🫂 Yi 人" , # people
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
@@ -45,6 +47,25 @@ def process_score(score, quantization):
     return f"{score:.2f} "
 
 
+def process_quantization_scheme(x):
+    if x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_4bit"] == True:
+        return "BnB.4bit"
+    elif x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_8bit"] == True:
+        return "BnB.8bit"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 1
+    ):
+        return "GPTQ.4bit+ExllamaV1"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 2
+    ):
+        return "GPTQ.4bit+ExllamaV2"
+    elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
+        return "GPTQ.4bit"
+    else:
+        return "None"
+
+
 # def change_tab(query_param):
 #     query_param = query_param.replace("'", '"')
 #     query_param = json.loads(query_param)
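process_arch itself is not touched by this commit, only the LLM_MODEL_ARCHS entries it looks up. A plausible minimal version, shown here purely as an illustrative assumption of how the new entries are consumed, not as the repository's actual implementation:

```python
# Excerpt of the dict above, just enough to make the sketch self-contained.
LLM_MODEL_ARCHS = {"mixtral": "Ⓜ️ Mixtral", "deci": "🔵 deci", "llama": "🦙 LLaMA"}


def process_arch(arch: str) -> str:
    # Hypothetical implementation: map known architectures to their display name,
    # fall back to the raw architecture string otherwise.
    return LLM_MODEL_ARCHS.get(arch, arch)


print(process_arch("mixtral"))       # Ⓜ️ Mixtral (entry added in this commit)
print(process_arch("unknown_arch"))  # unknown_arch (unchanged fallback)
```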