IlyasMoutawwakil committed 14d526b (1 parent: 08604d0)

added custom kernels comparison

app.py CHANGED

@@ -7,7 +7,7 @@ from src.latency_score_memory import create_lat_score_mem_plot
 from src.leaderboard import create_leaderboard_table
 from src.bettertransformer import create_bt_plots
 from src.flashattentionv2 import create_fa2_plots
-from src.exllama import create_exllama_plots
+from src.custom_kernels import create_custom_kernels_plots
 from src.llm_perf import get_llm_perf_df
 from src.assets import custom_css
 from src.content import (
@@ -60,8 +60,10 @@ with demo:
             bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
         with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
             fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
-        with gr.TabItem("Exllama Speedup 📈", id=4):
-            exllama_prefill_plot, exllama_decode_plot = create_exllama_plots(llm_perf_df)
+        with gr.TabItem("Custom Quantization Kernels Comparison 🏆", id=4):
+            custom_kernels_prefill_plot, custom_kernels_decode_plot = create_custom_kernels_plots(
+                llm_perf_df
+            )

         ####################### CONTROL CALLBACK #######################
         create_control_callback(
@@ -82,8 +84,8 @@ with demo:
             bt_decode_plot,
             fa2_prefill_plot,
             fa2_decode_plot,
-            exllama_prefill_plot,
-            exllama_decode_plot,
+            custom_kernels_prefill_plot,
+            custom_kernels_decode_plot,
         )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):
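The new tab follows the same pattern as the BetterTransformer and FlashAttentionV2 tabs: a create_*_plots helper builds two Plotly figures and wraps them in Gradio Plot components inside a TabItem. Below is a minimal, self-contained sketch of that pattern; the sample dataframe and the make_fig helper are hypothetical stand-ins, not code from this repository.

import gradio as gr
import pandas as pd
import plotly.express as px

# hypothetical data standing in for the llm-perf dataframe
df = pd.DataFrame({"Arch 🏛️": ["llama", "llama", "mistral"], "Speedup (%)": [120.0, 95.5, 150.2]})

def make_fig(title: str):
    # box plot per architecture, mirroring the px.box usage in src/custom_kernels.py
    return px.box(df, x="Arch 🏛️", y="Speedup (%)", points="all", title=title)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Custom Quantization Kernels Comparison 🏆", id=0):
            prefill_plot = gr.Plot(value=make_fig("Prefill"), show_label=False)
            decode_plot = gr.Plot(value=make_fig("Decode"), show_label=False)

if __name__ == "__main__":
    demo.launch()
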
src/control_panel.py CHANGED

@@ -5,7 +5,7 @@ from src.leaderboard import get_leaderboard_df
 from src.latency_score_memory import get_lat_score_mem_fig
 from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
 from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
-from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
+from src.custom_kernels import get_custom_kernels_prefill_fig, get_custom_kernels_decode_fig


 def create_control_panel(machine: str = "hf-dgx-01"):
@@ -133,8 +133,8 @@ def filter_fn(
     filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
     filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
     filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
-    filtered_exllama_prefill_fig = get_exllama_prefill_fig(filtered_df)
-    filtered_exllama_decode_fig = get_exllama_decode_fig(filtered_df)
+    filtered_custom_kernels_prefill_fig = get_custom_kernels_prefill_fig(filtered_df)
+    filtered_custom_kernels_decode_fig = get_custom_kernels_decode_fig(filtered_df)

     return [
         filtered_leaderboard_df,
@@ -143,8 +143,8 @@ def filter_fn(
         filtered_bt_decode_fig,
         filtered_fa2_prefill_fig,
         filtered_fa2_decode_fig,
-        filtered_exllama_prefill_fig,
-        filtered_exllama_decode_fig,
+        filtered_custom_kernels_prefill_fig,
+        filtered_custom_kernels_decode_fig,
     ]

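filter_fn returns its figures as a positional list, so the two new custom-kernels entries have to appear here in the same order as the corresponding plot components registered as outputs in app.py. A small sketch of that positional wiring in Gradio, with hypothetical component and function names (Textbox placeholders stand in for the real Dataframe and Plot components):

import gradio as gr

def filter_fn(query: str):
    # must return values in the same order as the outputs list below
    return [f"table for {query}", f"prefill fig for {query}", f"decode fig for {query}"]

with gr.Blocks() as demo:
    query = gr.Textbox(label="Filter")
    table = gr.Textbox(label="Leaderboard")          # placeholder for a gr.Dataframe
    prefill_plot = gr.Textbox(label="Prefill plot")  # placeholder for a gr.Plot
    decode_plot = gr.Textbox(label="Decode plot")    # placeholder for a gr.Plot
    # first return value -> table, second -> prefill_plot, third -> decode_plot
    query.submit(fn=filter_fn, inputs=[query], outputs=[table, prefill_plot, decode_plot])
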
 
src/{exllama.py → custom_kernels.py} RENAMED

@@ -3,7 +3,7 @@ import pandas as pd
 import plotly.express as px


-EXLLAMA_DATA = [
+CUSTOM_KERNELS_DATA = [
     # open llm
     "Model 🤗",
     "Arch 🏛️",
@@ -14,71 +14,96 @@ EXLLAMA_DATA = [
     # deployment settings
     "DType 📥",
     "Backend 🏭",
+    "Optimization 🛠️",
     "Quantization 🗜️",
+    "Optimization 🛠️ Custom Kernel",
+    "Quantization 🗜️ Custom Kernel",
     # primary measurements
     "Prefill Latency (s)",
-    "Prefill Latency (s) Exllama",
+    "Prefill Latency (s) Custom Kernel",
     "Decode Throughput (tokens/s)",
-    "Decode Throughput (tokens/s) Exllama",
-    "E2E Throughput (tokens/s)",
-    "E2E Throughput (tokens/s) Exllama",
+    "Decode Throughput (tokens/s) Custom Kernel",
     # speedups
     "Prefill Latency Speedup (%)",
     "Decode Throughput Speedup (%)",
 ]


-def get_exllama_df(llm_perf_df):
+def get_custom_kernels_df(llm_perf_df):
     copy_df = llm_perf_df.copy()
-    # seperate vanilla GPTQ experiments from Exllama experiments
-    gptq_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit")]
+    # seperate vanilla GPTQ experiments from Custom Kernel experiments
+    vanilla_df = copy_df[
+        (copy_df["Backend 🏭"] == "pytorch") &
+        (copy_df["Quantization 🗜️"] == "None") &
+        (copy_df["Optimization 🛠️"] == "None") &
+        (copy_df["DType 📥"] == "float16")
+    ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
+    gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
+    gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
-        gptq_df,
+        vanilla_df,
         exllamav1_df,
         on=["Model 🤗"],
-        suffixes=["", " Exllama"],
+        suffixes=["", " Custom Kernel"],
     )
     exllamav2_df = pd.merge(
-        gptq_df,
+        vanilla_df,
         exllamav2_df,
         on=["Model 🤗"],
-        suffixes=["", " Exllama"],
+        suffixes=["", " Custom Kernel"],
+    )
+    gemm_df = pd.merge(
+        vanilla_df,
+        gemm_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
+    )
+    gemv_df = pd.merge(
+        vanilla_df,
+        gemv_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
     )
     # concat the two dataframes row-wise
-    exllama_df = pd.concat([exllamav1_df, exllamav2_df])
-    exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]
+    custom_kernels_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
-    exllama_df["Prefill Latency Speedup (%)"] = (
-        (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]) * 100
+    custom_kernels_df["Prefill Latency Speedup (%)"] = (
+        (custom_kernels_df["Prefill Latency (s)"] / custom_kernels_df["Prefill Latency (s) Custom Kernel"]) * 100
     ).round(2) - 100
-    exllama_df["Decode Throughput Speedup (%)"] = (
-        (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]) * 100
+    custom_kernels_df["Decode Throughput Speedup (%)"] = (
+        (
+            custom_kernels_df["Decode Throughput (tokens/s) Custom Kernel"]
+            / custom_kernels_df["Decode Throughput (tokens/s)"]
+        )
+        * 100
     ).round(2) - 100
     # filter speedups > 1000%
-    exllama_df = exllama_df[exllama_df["Prefill Latency Speedup (%)"] < 1000]
-    exllama_df = exllama_df[exllama_df["Decode Throughput Speedup (%)"] < 1000]
+    custom_kernels_df = custom_kernels_df[custom_kernels_df["Prefill Latency Speedup (%)"] < 1000]
+    custom_kernels_df = custom_kernels_df[custom_kernels_df["Decode Throughput Speedup (%)"] < 1000]

-    return exllama_df
+    return custom_kernels_df


-def get_exllama_decode_fig(llm_perf_df):
-    exllama_df = get_exllama_df(llm_perf_df)
+def get_custom_kernels_decode_fig(llm_perf_df):
+    custom_kernels_df = get_custom_kernels_df(llm_perf_df)
     # plot
     decode_fig = px.box(
-        exllama_df,
+        custom_kernels_df,
         x="Arch 🏛️",
         y="Decode Throughput Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=EXLLAMA_DATA,
-        color="Quantization 🗜️ Exllama",
+        custom_data=CUSTOM_KERNELS_DATA,
+        color="Quantization 🗜️ Custom Kernel",
        points="all",
     )
     # add hover data
     decode_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
+        )
     )
     # add layout
     decode_fig.update_layout(
@@ -99,21 +124,23 @@ def get_exllama_decode_fig(llm_perf_df):
     return decode_fig


-def get_exllama_prefill_fig(llm_perf_df):
-    exllama_df = get_exllama_df(llm_perf_df)
+def get_custom_kernels_prefill_fig(llm_perf_df):
+    custom_kernels_df = get_custom_kernels_df(llm_perf_df)
     # plot
     prefill_fig = px.box(
-        exllama_df,
+        custom_kernels_df,
         x="Arch 🏛️",
         y="Prefill Latency Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=EXLLAMA_DATA,
-        color="Quantization 🗜️ Exllama",
+        custom_data=CUSTOM_KERNELS_DATA,
+        color="Quantization 🗜️ Custom Kernel",
         points="all",
     )
     # add hover data
     prefill_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
+        )
     )
     # add layout
     prefill_fig.update_layout(
@@ -134,12 +161,12 @@ def get_exllama_prefill_fig(llm_perf_df):
     return prefill_fig


-def create_exllama_plots(llm_perf_df):
+def create_custom_kernels_plots(llm_perf_df):
     # descriptive text
     gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
     # get figures
-    prefill_fig = get_exllama_prefill_fig(llm_perf_df)
-    decode_fig = get_exllama_decode_fig(llm_perf_df)
+    prefill_fig = get_custom_kernels_prefill_fig(llm_perf_df)
+    decode_fig = get_custom_kernels_decode_fig(llm_perf_df)

     # create plots
     prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
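The heart of get_custom_kernels_df is a per-model merge of the float16 PyTorch baseline with each kernel's rows, suffixing the kernel columns with " Custom Kernel", followed by two percentage speedups: baseline prefill latency over kernel prefill latency, and kernel decode throughput over baseline decode throughput, each scaled by 100 and shifted down by 100 so that 0 means no change. A toy sketch of that computation with shortened column names and made-up numbers:

import pandas as pd

# hypothetical baseline (float16 pytorch) and kernel (e.g. GPTQ.4bit+ExllamaV2) measurements
baseline = pd.DataFrame({"Model": ["llama-7b"], "Prefill (s)": [0.50], "Decode (tok/s)": [30.0]})
kernel = pd.DataFrame({"Model": ["llama-7b"], "Prefill (s)": [0.40], "Decode (tok/s)": [60.0]})

# merge on the model name, suffixing the kernel columns as in get_custom_kernels_df
merged = pd.merge(baseline, kernel, on=["Model"], suffixes=["", " Custom Kernel"])

# speedups as defined in the diff: ratio * 100 - 100, so 0 means "same speed"
merged["Prefill Latency Speedup (%)"] = (
    (merged["Prefill (s)"] / merged["Prefill (s) Custom Kernel"]) * 100
).round(2) - 100
merged["Decode Throughput Speedup (%)"] = (
    (merged["Decode (tok/s) Custom Kernel"] / merged["Decode (tok/s)"]) * 100
).round(2) - 100

print(merged[["Model", "Prefill Latency Speedup (%)", "Decode Throughput Speedup (%)"]])
# prints 25.0 (prefill is 25% faster) and 100.0 (decode throughput doubled)

The subsequent filters in the diff simply drop rows whose computed speedup exceeds 1000%, guarding the plots against pathological baseline/kernel pairings.
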
src/llm_perf.py CHANGED

@@ -38,14 +38,16 @@ SORTING_ASCENDING = [False, True, False]


 def get_llm_df():
-    hf_hub_download(
-        repo_id=LLM_PERF_DATASET_REPO,
-        filename="open-llm.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
-    llm_df = pd.read_csv("dataset/open-llm.csv")
+    # commented for now since scraping script is not working
+    # hf_hub_download(
+    #     repo_id=LLM_PERF_DATASET_REPO,
+    #     filename="open-llm.csv",
+    #     local_dir="dataset",
+    #     repo_type="dataset",
+    #     token=HF_TOKEN,
+    # )
+    # llm_df = pd.read_csv("dataset/open-llm.csv")
+    llm_df = pd.read_csv("https://huggingface.co/datasets/optimum/llm-perf-dataset/raw/e8628583f0c31457cd5f8b81352735263117fbb4/open-llm.csv")

     return llm_df
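The replacement read_csv call pins open-llm.csv to a fixed revision of the optimum/llm-perf-dataset repository through its raw URL. If the hf_hub_download path is restored once the scraping script works again, the same pinning could presumably be achieved with the function's revision argument; a sketch under that assumption, with the repo id and commit hash taken from the URL above:

import pandas as pd
from huggingface_hub import hf_hub_download

def get_llm_df() -> pd.DataFrame:
    # download open-llm.csv at a fixed dataset revision instead of reading the raw URL
    csv_path = hf_hub_download(
        repo_id="optimum/llm-perf-dataset",
        filename="open-llm.csv",
        repo_type="dataset",
        revision="e8628583f0c31457cd5f8b81352735263117fbb4",
    )
    return pd.read_csv(csv_path)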