Commit a1135a9
Parent(s): 8b4dd08

update the leaderboard

Files changed:
- src/bettertransformer.py +3 -8
- src/content.py +16 -10
- src/control_panel.py +7 -7
- src/flashattentionv2.py +3 -8
- src/llm_perf.py +8 -16
- src/utils.py +22 -1
src/bettertransformer.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 BETTERTRANSFORMER_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ BETTERTRANSFORMER_DATA = [
 
 def get_bt_df(llm_perf_df):
     bt_df = llm_perf_df.copy()
-    # process
-    bt_df["Arch 🏛️"] = bt_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from BetterTransformer experiments
-    original_df = bt_df[bt_df["Optimization 🛠️"] == "None"]
+    original_df = bt_df[(bt_df["Optimization 🛠️"] == "None") & (bt_df["DType 📥"] == "float16")]
     bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
     # merge the two dataframes
     bt_df = pd.merge(
@@ -48,10 +43,10 @@ def get_bt_df(llm_perf_df):
     # compute speedups
     bt_df["Prefill Latency Speedup (%)"] = (
         (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
-    ).round(2)
+    ).round(2) - 100
     bt_df["Decode Throughput Speedup (%)"] = (
         (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
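The functional change in this file is the speedup formula: the raw ratio (where an unchanged latency reads as 100%) becomes a relative gain (where unchanged reads as 0%). A minimal sketch with made-up latency numbers, just to illustrate the difference:

```python
import pandas as pd

# Hypothetical measurements: one model benchmarked with and without BetterTransformer.
df = pd.DataFrame(
    {
        "Prefill Latency (s)": [0.80],                    # baseline run (Optimization == "None", float16)
        "Prefill Latency (s) BetterTransformer": [0.50],  # optimized run, merged in with a column suffix
    }
)

# Old formula: a pure ratio, so "no change" shows up as 100%.
old_speedup = ((df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"]) * 100).round(2)

# New formula (this commit): subtract 100 so "no change" shows up as 0% and the value reads as a gain.
new_speedup = ((df["Prefill Latency (s)"] / df["Prefill Latency (s) BetterTransformer"]) * 100).round(2) - 100

print(old_speedup.iloc[0])  # 160.0 -> "1.6x as fast"
print(new_speedup.iloc[0])  # 60.0  -> "60% faster"
```

The decode-throughput column gets the same subtraction, so the table now reports percentage gains over the float16 baseline selected above.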
src/content.py CHANGED
@@ -14,7 +14,7 @@ ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a
+<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.</li>
 <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
 <li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.</li>
 </ul>
@@ -28,19 +28,26 @@ defaults:
   - _base_ # inheriting from base config
   - _self_ # for hydra 1.1 compatibility
 
-experiment_name: pytorch+cuda+float16+
+experiment_name: pytorch+cuda+float16+gptq-4bit+exllama-v1
 device: cuda
 
 backend:
   no_weights: true
   torch_dtype: float16
-
+  quantization_scheme: gptq
+  quantization_config:
+    bits: 4
+    use_cuda_fp16: false
+    use_exllama: true
+    exllama_config:
+      version: 1
 ```
 
 Where the base config is:
 ```yaml
 defaults:
   - benchmark: inference # default benchmark
+  - launcher: process # isolated process launcher
   - experiment # inheriting from experiment config
   - _self_ # for hydra 1.1 compatibility
   - override hydra/job_logging: colorlog # colorful logging
@@ -48,30 +55,29 @@ defaults:
 
 hydra:
   run:
-    dir:
+    dir: dataset/${oc.env:HOSTNAME}/${experiment_name}/${model}
   job:
     chdir: true
     env_set:
+      COUNTRY_ISO_CODE: FRA
+      OVERRIDE_BENCHMARKS: 0
       CUDA_VISIBLE_DEVICES: 0
       CUDA_DEVICE_ORDER: PCI_BUS_ID
 
-model: ???
-experiment_name: ???
-
 backend:
-
-  continous_isolation_check: true
+  continuous_isolation: true
 
 benchmark:
   duration: 10
   memory: true
   energy: true
 
-  new_tokens: 1000
   input_shapes:
     batch_size: 1
     sequence_length: 256
 
+  new_tokens: 256
+
 hub_kwargs:
   trust_remote_code: true
 ```
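The new quantization_config block is the source of the flattened backend.quantization_config.* columns that src/llm_perf.py reads further down. A small sketch of that dotted naming convention using pandas.json_normalize; the use of json_normalize here is purely illustrative of the naming, not a claim about how the dataset is actually produced:

```python
import pandas as pd
import yaml  # PyYAML

# Trimmed-down copy of the backend section shown above (values taken from the diff).
config_yaml = """
backend:
  no_weights: true
  torch_dtype: float16
  quantization_scheme: gptq
  quantization_config:
    bits: 4
    use_cuda_fp16: false
    use_exllama: true
    exllama_config:
      version: 1
"""

config = yaml.safe_load(config_yaml)

# json_normalize flattens nested mappings into dot-separated columns, which matches
# the backend.* column names consumed in src/llm_perf.py.
flat = pd.json_normalize(config, sep=".")
print(flat["backend.quantization_scheme"].iloc[0])                         # gptq
print(flat["backend.quantization_config.exllama_config.version"].iloc[0])  # 1
```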
src/control_panel.py CHANGED
@@ -39,17 +39,17 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             backend_checkboxes = gr.CheckboxGroup(
                 label="Backends 🏭",
-                choices=["pytorch"
-                value=["pytorch"
+                choices=["pytorch"],
+                value=["pytorch"],
                 info="☑️ Select the backends",
                 elem_id="backend-checkboxes",
             )
     with gr.Row():
         with gr.Column(scale=1):
             datatype_checkboxes = gr.CheckboxGroup(
-                label="DTypes 📥",
-                choices=["float32", "float16"],
-                value=["float32", "float16"],
+                label="Load DTypes 📥",
+                choices=["float32", "float16", "bfloat16"],
+                value=["float32", "float16", "bfloat16"],
                 info="☑️ Select the load data types",
                 elem_id="dtype-checkboxes",
             )
@@ -64,8 +64,8 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Column(scale=1):
             quantization_checkboxes = gr.CheckboxGroup(
                 label="Quantizations 🗜️",
-                choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
-                value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                choices=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                value=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
                 info="☑️ Select the quantization schemes",
                 elem_id="quantization-checkboxes",
             )
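For reference, the expanded quantization filter in isolation. This is a stripped-down sketch: the echo Textbox and the change handler are invented for the demo, while the real control panel wires the checkboxes into the leaderboard's dataframe filtering.

```python
import gradio as gr

# Choices mirror the expanded list introduced in this commit.
QUANTS = ["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"]

with gr.Blocks() as demo:
    quantization_checkboxes = gr.CheckboxGroup(
        label="Quantizations 🗜️",
        choices=QUANTS,
        value=QUANTS,  # everything selected by default, as in the control panel
        info="☑️ Select the quantization schemes",
        elem_id="quantization-checkboxes",
    )
    selection = gr.Textbox(label="Current selection")
    # Echo the selection so the standalone demo does something observable.
    quantization_checkboxes.change(lambda x: ", ".join(x), quantization_checkboxes, selection)

if __name__ == "__main__":
    demo.launch()
```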
src/flashattentionv2.py CHANGED
@@ -3,9 +3,6 @@ import pandas as pd
 import plotly.express as px
 
 
-from src.utils import process_arch
-
-
 FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
@@ -33,10 +30,8 @@ FLASHATTENTIONV2_DATA = [
 
 def get_fa2_df(llm_perf_df):
    fa2_df = llm_perf_df.copy()
-    # process
-    fa2_df["Arch 🏛️"] = fa2_df["Arch 🏛️"].apply(process_arch)
     # seperate original model experiments from FlashAttentionV2 experiments
-    original_df = fa2_df[fa2_df["Optimization 🛠️"] == "None"]
+    original_df = fa2_df[(fa2_df["Optimization 🛠️"] == "None") & (fa2_df["DType 📥"] == "float16")]
     fa2_df = fa2_df[fa2_df["Optimization 🛠️"] == "FlashAttentionV2"]
     # merge the two dataframes
     fa2_df = pd.merge(
@@ -48,10 +43,10 @@ def get_fa2_df(llm_perf_df):
     # compute speedups
     fa2_df["Prefill Latency Speedup (%)"] = (
         (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
-    ).round(2)
+    ).round(2) - 100
     fa2_df["Decode Throughput Speedup (%)"] = (
         (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
-    ).round(2)
+    ).round(2) - 100
 
     # filter speedups > 1000%
     fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
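As in get_bt_df above, the original and optimized rows are merged before the speedups are computed. The merge keys and suffixes are not visible in this hunk, so the ones below are assumptions chosen only to reproduce the suffixed column names that the speedup code references:

```python
import pandas as pd

# Hypothetical rows: the same model benchmarked without and with FlashAttentionV2.
original_df = pd.DataFrame({"Model 🤗": ["m"], "Decode Throughput (tokens/s)": [40.0]})
fa2_df = pd.DataFrame({"Model 🤗": ["m"], "Decode Throughput (tokens/s)": [60.0]})

# The suffix is what produces columns such as "Decode Throughput (tokens/s) FlashAttentionV2".
merged = pd.merge(original_df, fa2_df, on="Model 🤗", suffixes=["", " FlashAttentionV2"])

gain = (
    (merged["Decode Throughput (tokens/s) FlashAttentionV2"] / merged["Decode Throughput (tokens/s)"]) * 100
).round(2) - 100
print(gain.iloc[0])  # 50.0 -> 50% higher decode throughput
```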
src/llm_perf.py CHANGED
@@ -3,6 +3,8 @@ import os
 import pandas as pd
 from huggingface_hub import hf_hub_download
 
+from .utils import process_quantization_scheme, process_arch
+
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
@@ -91,24 +93,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
     llm_perf_df["quantization"] = llm_perf_df[
         [
             "backend.quantization_scheme",
+            "backend.quantization_config.bits",
+            "backend.quantization_config.load_in_4bit",
+            "backend.quantization_config.load_in_8bit",
             "backend.quantization_config.exllama_config.version",
         ]
-    ].apply(
-
-
-        else (
-            "GPTQ.4bit+ExllamaV1"
-            if (x["backend.quantization_scheme"] == "gptq")
-            and (x["backend.quantization_config.exllama_config.version"] == 1)
-            else (
-                "GPTQ.4bit+ExllamaV2"
-                if (x["backend.quantization_scheme"] == "gptq")
-                and (x["backend.quantization_config.exllama_config.version"] == 2)
-                else "None"
-            )
-        ),
-        axis=1,
-    )
+    ].apply(lambda x: process_quantization_scheme(x), axis=1)
+    # add arch
+    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # add decode throughput
     llm_perf_df["decode.throughput(tokens/s)"] = (
         1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
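The inline nested conditional expression is replaced by a row-wise apply over the flattened quantization columns, delegating to the process_quantization_scheme helper added to src/utils.py (shown below). A toy example of that pattern, with hypothetical rows and an import path that assumes the repository root is on sys.path:

```python
import pandas as pd

from src.utils import process_quantization_scheme  # assumes running from the repository root

# Hypothetical rows using the flattened column names consumed above.
rows = pd.DataFrame(
    {
        "backend.quantization_scheme": ["bnb", "gptq", None],
        "backend.quantization_config.bits": [4, 4, None],
        "backend.quantization_config.load_in_4bit": [True, None, None],
        "backend.quantization_config.load_in_8bit": [None, None, None],
        "backend.quantization_config.exllama_config.version": [None, 2, None],
    }
)

# axis=1 passes each row to the helper as a Series, indexable by column name like a dict.
labels = rows.apply(lambda x: process_quantization_scheme(x), axis=1)
print(labels.tolist())  # ['BnB.4bit', 'GPTQ.4bit+ExllamaV2', 'None']
```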
src/utils.py CHANGED
@@ -8,14 +8,16 @@ LLM_MODEL_ARCHS = {
     "baichuan": "🌊 Baichuan 百川", # river
     "internlm": "🧑‍🎓 InternLM 书生", # scholar
     "mistral": "Ⓜ️ Mistral",
+    "mixtral": "Ⓜ️ Mixtral",
     "codegen": "♾️ CodeGen",
     "chatglm": "💬 ChatGLM",
     "falcon": "🦅 Falcon",
     "bloom": "🌸 Bloom",
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
+    "deci": "🔵 deci",
+    "Yi": "🫂 Yi 人", # people
     "mpt": "🧱 MPT",
-    "Yi": "🫂 Yi 人" , # people
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
@@ -45,6 +47,25 @@ def process_score(score, quantization):
     return f"{score:.2f} "
 
 
+def process_quantization_scheme(x):
+    if x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_4bit"] == True:
+        return "BnB.4bit"
+    elif x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_8bit"] == True:
+        return "BnB.8bit"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 1
+    ):
+        return "GPTQ.4bit+ExllamaV1"
+    elif (x["backend.quantization_scheme"] == "gptq") and (
+        x["backend.quantization_config.exllama_config.version"] == 2
+    ):
+        return "GPTQ.4bit+ExllamaV2"
+    elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
+        return "GPTQ.4bit"
+    else:
+        return "None"
+
+
 # def change_tab(query_param):
 #     query_param = query_param.replace("'", '"')
 #     query_param = json.loads(query_param)
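process_arch itself is not touched by this commit, only the LLM_MODEL_ARCHS entries it looks up. A plausible minimal version, shown here purely as an illustrative assumption of how the new entries are consumed, not as the repository's actual implementation:

```python
# Excerpt of the dict above, just enough to make the sketch self-contained.
LLM_MODEL_ARCHS = {"mixtral": "Ⓜ️ Mixtral", "deci": "🔵 deci", "llama": "🦙 LLaMA"}


def process_arch(arch: str) -> str:
    # Hypothetical implementation: map known architectures to their display name,
    # fall back to the raw architecture string otherwise.
    return LLM_MODEL_ARCHS.get(arch, arch)


print(process_arch("mixtral"))       # Ⓜ️ Mixtral (entry added in this commit)
print(process_arch("unknown_arch"))  # unknown_arch (unchanged fallback)
```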