IlyasMoutawwakil committed
Commit a1135a9 • 1 Parent(s): 8b4dd08

update the leaderboard

src/bettertransformer.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 BETTERTRANSFORMER_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ BETTERTRANSFORMER_DATA = [
 
 def get_bt_df(llm_perf_df):
     bt_df = llm_perf_df.copy()
-    # process
-    bt_df["Arch 🏛️"] = bt_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from BetterTransformer experiments
-    original_df = bt_df[bt_df["Optimization 🛠️"] == "None"]
+    original_df = bt_df[(bt_df["Optimization 🛠️"] == "None") & (bt_df["DType 📥"] == "float16")]
     bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
     # merge the two dataframes
     bt_df = pd.merge(
@@ -48,10 +43,10 @@ def get_bt_df(llm_perf_df):
     # compute speedups
     bt_df["Prefill Latency Speedup (%)"] = (
         (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
-    ).round(2)
+    ).round(2) - 100
     bt_df["Decode Throughput Speedup (%)"] = (
         (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
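Note on the speedup change in this file (and the matching change in src/flashattentionv2.py): subtracting 100 makes the column report the relative gain over the baseline rather than the raw percentage ratio. A minimal sketch with illustrative, not measured, latencies:

```python
import pandas as pd

# Illustrative values only: baseline vs. BetterTransformer prefill latency.
df = pd.DataFrame(
    {
        "Prefill Latency (s)": [0.50],
        "Prefill Latency (s) BetterTransformer": [0.40],
    }
)

# Old formula: plain ratio in percent -> 125.0 means "1.25x as fast".
old_speedup = (df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"] * 100).round(2)

# New formula: percent improvement over the baseline -> 25.0 means "25% faster".
new_speedup = (df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"] * 100).round(2) - 100

print(old_speedup.iloc[0], new_speedup.iloc[0])  # 125.0 25.0
```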
src/content.py CHANGED
@@ -14,7 +14,7 @@ ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
+<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.</li>
 <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
 <li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.</li>
 </ul>
@@ -28,19 +28,26 @@ defaults:
   - _base_ # inheriting from base config
   - _self_ # for hydra 1.1 compatibility
 
-experiment_name: pytorch+cuda+float16+bettertransformer
+experiment_name: pytorch+cuda+float16+gptq-4bit+exllama-v1
 device: cuda
 
 backend:
   no_weights: true
   torch_dtype: float16
-  to_bettertransformer: true
+  quantization_scheme: gptq
+  quantization_config:
+    bits: 4
+    use_cuda_fp16: false
+    use_exllama: true
+    exllama_config:
+      version: 1
 ```
 
 Where the base config is:
 ```yaml
 defaults:
   - benchmark: inference # default benchmark
+  - launcher: process # isolated process launcher
   - experiment # inheriting from experiment config
   - _self_ # for hydra 1.1 compatibility
   - override hydra/job_logging: colorlog # colorful logging
@@ -48,30 +55,29 @@ defaults:
 
 hydra:
   run:
-    dir: ???
+    dir: dataset/${oc.env:HOSTNAME}/${experiment_name}/${model}
   job:
     chdir: true
    env_set:
+      COUNTRY_ISO_CODE: FRA
+      OVERRIDE_BENCHMARKS: 0
       CUDA_VISIBLE_DEVICES: 0
       CUDA_DEVICE_ORDER: PCI_BUS_ID
 
-model: ???
-experiment_name: ???
-
 backend:
-  initial_isolation_check: true
-  continous_isolation_check: true
+  continuous_isolation: true
 
 benchmark:
   duration: 10
   memory: true
   energy: true
 
-  new_tokens: 1000
   input_shapes:
     batch_size: 1
     sequence_length: 256
 
+  new_tokens: 256
+
 hub_kwargs:
   trust_remote_code: true
 ```
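For readers unfamiliar with the backend options above: the new example experiment loads a GPTQ 4-bit model with ExLlama v1 kernels. Below is a hedged sketch of roughly how that quantization_config block maps onto transformers' GPTQConfig; the mapping and the checkpoint name are illustrative assumptions, not part of this commit.

```python
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

# Rough equivalent of the YAML's backend.quantization_config block (assumed mapping).
quantization_config = GPTQConfig(
    bits=4,
    use_cuda_fp16=False,
    use_exllama=True,
    exllama_config={"version": 1},  # ExLlama v1 kernels
)

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-GPTQ",  # hypothetical pre-quantized checkpoint, for illustration only
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    device_map="cuda",
)
```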
src/control_panel.py CHANGED
@@ -39,17 +39,17 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             backend_checkboxes = gr.CheckboxGroup(
                 label="Backends 🏭",
-                choices=["pytorch", "onnxruntime"],
-                value=["pytorch", "onnxruntime"],
+                choices=["pytorch"],
+                value=["pytorch"],
                 info="☑️ Select the backends",
                 elem_id="backend-checkboxes",
             )
     with gr.Row():
         with gr.Column(scale=1):
             datatype_checkboxes = gr.CheckboxGroup(
-                label="DTypes 📥",
-                choices=["float32", "float16"],
-                value=["float32", "float16"],
+                label="Load DTypes 📥",
+                choices=["float32", "float16", "bfloat16"],
+                value=["float32", "float16", "bfloat16"],
                 info="☑️ Select the load data types",
                 elem_id="dtype-checkboxes",
             )
@@ -64,8 +64,8 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             quantization_checkboxes = gr.CheckboxGroup(
                 label="Quantizations 🗜️",
-                choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
-                value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                choices=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                value=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
                 info="☑️ Select the quantization schemes",
                 elem_id="quantization-checkboxes",
            )
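For context on how such checkbox groups are usually wired up: the selected values get passed to a callback that filters the leaderboard dataframe. A minimal hedged sketch follows; the filter_df helper, the sample data, and the event wiring are assumptions for illustration, not this repo's code.

```python
import gradio as gr
import pandas as pd

# Illustrative data; the real leaderboard dataframe comes from src/llm_perf.py.
DATA = pd.DataFrame(
    {"Model 🤗": ["llama-7b", "mistral-7b"], "DType 📥": ["float16", "bfloat16"]}
)

def filter_df(dtypes):
    # Hypothetical filter: keep only rows whose load dtype was selected.
    return DATA[DATA["DType 📥"].isin(dtypes)]

with gr.Blocks() as demo:
    datatype_checkboxes = gr.CheckboxGroup(
        label="Load DTypes 📥",
        choices=["float32", "float16", "bfloat16"],
        value=["float32", "float16", "bfloat16"],
    )
    table = gr.Dataframe(value=DATA)
    datatype_checkboxes.change(filter_df, inputs=datatype_checkboxes, outputs=table)

# demo.launch()
```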
src/flashattentionv2.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ FLASHATTENTIONV2_DATA = [
 
 def get_fa2_df(llm_perf_df):
     fa2_df = llm_perf_df.copy()
-    # process
-    fa2_df["Arch 🏛️"] = fa2_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from FlashAttentionV2 experiments
-    original_df = fa2_df[fa2_df["Optimization 🛠️"] == "None"]
+    original_df = fa2_df[(fa2_df["Optimization 🛠️"] == "None") & (fa2_df["DType 📥"] == "float16")]
     fa2_df = fa2_df[fa2_df["Optimization 🛠️"] == "FlashAttentionV2"]
     # merge the two dataframes
     fa2_df = pd.merge(
@@ -48,10 +43,10 @@ def get_fa2_df(llm_perf_df):
     # compute speedups
     fa2_df["Prefill Latency Speedup (%)"] = (
         (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
-    ).round(2)
+    ).round(2) - 100
     fa2_df["Decode Throughput Speedup (%)"] = (
         (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
src/llm_perf.py CHANGED
@@ -3,6 +3,8 @@ import os
 import pandas as pd
 from huggingface_hub import hf_hub_download
 
+from .utils import process_quantization_scheme, process_arch
+
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
@@ -91,24 +93,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
     llm_perf_df["quantization"] = llm_perf_df[
         [
             "backend.quantization_scheme",
+            "backend.quantization_config.bits",
+            "backend.quantization_config.load_in_4bit",
+            "backend.quantization_config.load_in_8bit",
             "backend.quantization_config.exllama_config.version",
         ]
-    ].apply(
-        lambda x: "BnB.4bit"
-        if x["backend.quantization_scheme"] == "bnb"
-        else (
-            "GPTQ.4bit+ExllamaV1"
-            if (x["backend.quantization_scheme"] == "gptq")
-            and (x["backend.quantization_config.exllama_config.version"] == 1)
-            else (
-                "GPTQ.4bit+ExllamaV2"
-                if (x["backend.quantization_scheme"] == "gptq")
-                and (x["backend.quantization_config.exllama_config.version"] == 2)
-                else "None"
-            )
-        ),
-        axis=1,
-    )
+    ].apply(lambda x: process_quantization_scheme(x), axis=1)
+    # add arch
+    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # add decode throughput
     llm_perf_df["decode.throughput(tokens/s)"] = (
         1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
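A quick worked example of the decode-throughput line above, with made-up latencies, assuming the hard-coded 1000 corresponds to the number of generated tokens (as in the earlier benchmark config):

```python
# Illustrative numbers only.
generate_latency_s = 12.0  # full generate() call: prefill + decoding
forward_latency_s = 2.0    # prefill (first forward pass) alone

# Decoding produced ~1000 tokens in (12.0 - 2.0) = 10.0 seconds.
decode_throughput = 1000 / (generate_latency_s - forward_latency_s)
print(decode_throughput)  # 100.0 tokens/s
```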
src/utils.py CHANGED
@@ -8,14 +8,16 @@ LLM_MODEL_ARCHS = {
     "baichuan": "🌊 Baichuan 百川", # river
     "internlm": "🧑‍🎓 InternLM 书生", # scholar
     "mistral": "Ⓜ️ Mistral",
+    "mixtral": "Ⓜ️ Mixtral",
     "codegen": "♾️ CodeGen",
     "chatglm": "💬 ChatGLM",
     "falcon": "🦅 Falcon",
     "bloom": "🌸 Bloom",
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
+    "deci": "🔵 deci",
+    "Yi": "🫂 Yi 人", # people
     "mpt": "🧱 MPT",
-    "Yi": "🫂 Yi 人" , # people
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
@@ -45,6 +47,25 @@ def process_score(score, quantization):
     return f"{score:.2f} "
 
 
+def process_quantization_scheme(x):
+    if x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_4bit"] == True:
+        return "BnB.4bit"
+    elif x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_8bit"] == True:
+        return "BnB.8bit"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 1
+    ):
+        return "GPTQ.4bit+ExllamaV1"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 2
+    ):
+        return "GPTQ.4bit+ExllamaV2"
+    elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
+        return "GPTQ.4bit"
+    else:
+        return "None"
+
+
 # def change_tab(query_param):
 #     query_param = query_param.replace("'", '"')
 #     query_param = json.loads(query_param)
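A small usage sketch for the new helper, with a hand-built row standing in for one record of the benchmark dataframe (the values are illustrative):

```python
import pandas as pd

from src.utils import process_quantization_scheme  # assuming the repo root is on sys.path

# One illustrative row with the columns the helper inspects.
row = pd.Series(
    {
        "backend.quantization_scheme": "gptq",
        "backend.quantization_config.bits": 4,
        "backend.quantization_config.load_in_4bit": None,
        "backend.quantization_config.load_in_8bit": None,
        "backend.quantization_config.exllama_config.version": 2,
    }
)

print(process_quantization_scheme(row))  # -> "GPTQ.4bit+ExllamaV2"
```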