IlyasMoutawwakil committed 14d526b (1 parent: 08604d0)

added custom kernels comparison

app.py CHANGED

@@ -7,7 +7,7 @@ from src.latency_score_memory import create_lat_score_mem_plot
 from src.leaderboard import create_leaderboard_table
 from src.bettertransformer import create_bt_plots
 from src.flashattentionv2 import create_fa2_plots
-from src.exllama import create_exllama_plots
+from src.custom_kernels import create_custom_kernels_plots
 from src.llm_perf import get_llm_perf_df
 from src.assets import custom_css
 from src.content import (
@@ -60,8 +60,10 @@ with demo:
             bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
         with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
             fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
-        with gr.TabItem("Exllama Speedup 📈", id=4):
-            exllama_prefill_plot, exllama_decode_plot = create_exllama_plots(llm_perf_df)
+        with gr.TabItem("Custom Quantization Kernels Comparison 🏆", id=4):
+            custom_kernels_prefill_plot, custom_kernels_decode_plot = create_custom_kernels_plots(
+                llm_perf_df
+            )

         ####################### CONTROL CALLBACK #######################
         create_control_callback(
@@ -82,8 +84,8 @@ with demo:
             bt_decode_plot,
             fa2_prefill_plot,
             fa2_decode_plot,
-            exllama_prefill_plot,
-            exllama_decode_plot,
+            custom_kernels_prefill_plot,
+            custom_kernels_decode_plot,
         )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):
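The new tab follows the same pattern as the BetterTransformer and FlashAttentionV2 tabs: a create_*_plots helper builds two Plotly figures and wraps them in Gradio Plot components inside a TabItem. Below is a minimal, self-contained sketch of that pattern; the sample dataframe and the make_fig helper are hypothetical stand-ins, not code from this repository.

import gradio as gr
import pandas as pd
import plotly.express as px

# hypothetical data standing in for the llm-perf dataframe
df = pd.DataFrame({"Arch 🏛️": ["llama", "llama", "mistral"], "Speedup (%)": [120.0, 95.5, 150.2]})

def make_fig(title: str):
    # box plot per architecture, mirroring the px.box usage in src/custom_kernels.py
    return px.box(df, x="Arch 🏛️", y="Speedup (%)", points="all", title=title)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Custom Quantization Kernels Comparison 🏆", id=0):
            prefill_plot = gr.Plot(value=make_fig("Prefill"), show_label=False)
            decode_plot = gr.Plot(value=make_fig("Decode"), show_label=False)

if __name__ == "__main__":
    demo.launch()
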
src/control_panel.py CHANGED

@@ -5,7 +5,7 @@ from src.leaderboard import get_leaderboard_df
 from src.latency_score_memory import get_lat_score_mem_fig
 from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
 from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
-from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
+from src.custom_kernels import get_custom_kernels_prefill_fig, get_custom_kernels_decode_fig


 def create_control_panel(machine: str = "hf-dgx-01"):
@@ -133,8 +133,8 @@ def filter_fn(
     filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
     filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
     filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
-    filtered_exllama_prefill_fig = get_exllama_prefill_fig(filtered_df)
-    filtered_exllama_decode_fig = get_exllama_decode_fig(filtered_df)
+    filtered_custom_kernels_prefill_fig = get_custom_kernels_prefill_fig(filtered_df)
+    filtered_custom_kernels_decode_fig = get_custom_kernels_decode_fig(filtered_df)

     return [
         filtered_leaderboard_df,
@@ -143,8 +143,8 @@ def filter_fn(
         filtered_bt_decode_fig,
         filtered_fa2_prefill_fig,
         filtered_fa2_decode_fig,
-        filtered_exllama_prefill_fig,
-        filtered_exllama_decode_fig,
+        filtered_custom_kernels_prefill_fig,
+        filtered_custom_kernels_decode_fig,
     ]

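filter_fn returns its figures as a positional list, so the two new custom-kernels entries have to appear here in the same order as the corresponding plot components registered as outputs in app.py. A small sketch of that positional wiring in Gradio, with hypothetical component and function names (Textbox placeholders stand in for the real Dataframe and Plot components):

import gradio as gr

def filter_fn(query: str):
    # must return values in the same order as the outputs list below
    return [f"table for {query}", f"prefill fig for {query}", f"decode fig for {query}"]

with gr.Blocks() as demo:
    query = gr.Textbox(label="Filter")
    table = gr.Textbox(label="Leaderboard")          # placeholder for a gr.Dataframe
    prefill_plot = gr.Textbox(label="Prefill plot")  # placeholder for a gr.Plot
    decode_plot = gr.Textbox(label="Decode plot")    # placeholder for a gr.Plot
    # first return value -> table, second -> prefill_plot, third -> decode_plot
    query.submit(fn=filter_fn, inputs=[query], outputs=[table, prefill_plot, decode_plot])
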
 
src/{exllama.py → custom_kernels.py} RENAMED

@@ -3,7 +3,7 @@ import pandas as pd
 import plotly.express as px


-EXLLAMA_DATA = [
+CUSTOM_KERNELS_DATA = [
     # open llm
     "Model 🤗",
     "Arch 🏛️",
@@ -14,71 +14,96 @@ EXLLAMA_DATA = [
     # deployment settings
     "DType 📥",
     "Backend 🏭",
+    "Optimization 🛠️",
     "Quantization 🗜️",
+    "Optimization 🛠️ Custom Kernel",
+    "Quantization 🗜️ Custom Kernel",
     # primary measurements
     "Prefill Latency (s)",
-    "Prefill Latency (s) Exllama",
+    "Prefill Latency (s) Custom Kernel",
     "Decode Throughput (tokens/s)",
-    "Decode Throughput (tokens/s) Exllama",
-    "E2E Throughput (tokens/s)",
-    "E2E Throughput (tokens/s) Exllama",
+    "Decode Throughput (tokens/s) Custom Kernel",
     # speedups
     "Prefill Latency Speedup (%)",
     "Decode Throughput Speedup (%)",
 ]


-def get_exllama_df(llm_perf_df):
+def get_custom_kernels_df(llm_perf_df):
     copy_df = llm_perf_df.copy()
-    # seperate vanilla GPTQ experiments from Exllama experiments
-    gptq_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit")]
+    # seperate vanilla GPTQ experiments from Custom Kernel experiments
+    vanilla_df = copy_df[
+        (copy_df["Backend 🏭"] == "pytorch") &
+        (copy_df["Quantization 🗜️"] == "None") &
+        (copy_df["Optimization 🛠️"] == "None") &
+        (copy_df["DType 📥"] == "float16")
+    ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
+    gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
+    gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
-        gptq_df,
+        vanilla_df,
         exllamav1_df,
         on=["Model 🤗"],
-        suffixes=["", " Exllama"],
+        suffixes=["", " Custom Kernel"],
     )
     exllamav2_df = pd.merge(
-        gptq_df,
+        vanilla_df,
         exllamav2_df,
         on=["Model 🤗"],
-        suffixes=["", " Exllama"],
+        suffixes=["", " Custom Kernel"],
+    )
+    gemm_df = pd.merge(
+        vanilla_df,
+        gemm_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
+    )
+    gemv_df = pd.merge(
+        vanilla_df,
+        gemv_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
     )
     # concat the two dataframes row-wise
-    exllama_df = pd.concat([exllamav1_df, exllamav2_df])
-    exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]
+    custom_kernels_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
-    exllama_df["Prefill Latency Speedup (%)"] = (
-        (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]) * 100
+    custom_kernels_df["Prefill Latency Speedup (%)"] = (
+        (custom_kernels_df["Prefill Latency (s)"] / custom_kernels_df["Prefill Latency (s) Custom Kernel"]) * 100
     ).round(2) - 100
-    exllama_df["Decode Throughput Speedup (%)"] = (
-        (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]) * 100
+    custom_kernels_df["Decode Throughput Speedup (%)"] = (
+        (
+            custom_kernels_df["Decode Throughput (tokens/s) Custom Kernel"]
+            / custom_kernels_df["Decode Throughput (tokens/s)"]
+        )
+        * 100
     ).round(2) - 100
     # filter speedups > 1000%
-    exllama_df = exllama_df[exllama_df["Prefill Latency Speedup (%)"] < 1000]
-    exllama_df = exllama_df[exllama_df["Decode Throughput Speedup (%)"] < 1000]
+    custom_kernels_df = custom_kernels_df[custom_kernels_df["Prefill Latency Speedup (%)"] < 1000]
+    custom_kernels_df = custom_kernels_df[custom_kernels_df["Decode Throughput Speedup (%)"] < 1000]

-    return exllama_df
+    return custom_kernels_df


-def get_exllama_decode_fig(llm_perf_df):
-    exllama_df = get_exllama_df(llm_perf_df)
+def get_custom_kernels_decode_fig(llm_perf_df):
+    custom_kernels_df = get_custom_kernels_df(llm_perf_df)
     # plot
     decode_fig = px.box(
-        exllama_df,
+        custom_kernels_df,
         x="Arch 🏛️",
         y="Decode Throughput Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=EXLLAMA_DATA,
-        color="Quantization 🗜️ Exllama",
+        custom_data=CUSTOM_KERNELS_DATA,
+        color="Quantization 🗜️ Custom Kernel",
        points="all",
     )
     # add hover data
     decode_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
+        )
     )
     # add layout
     decode_fig.update_layout(
@@ -99,21 +124,23 @@ def get_exllama_decode_fig(llm_perf_df):
     return decode_fig


-def get_exllama_prefill_fig(llm_perf_df):
-    exllama_df = get_exllama_df(llm_perf_df)
+def get_custom_kernels_prefill_fig(llm_perf_df):
+    custom_kernels_df = get_custom_kernels_df(llm_perf_df)
     # plot
     prefill_fig = px.box(
-        exllama_df,
+        custom_kernels_df,
         x="Arch 🏛️",
         y="Prefill Latency Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=EXLLAMA_DATA,
-        color="Quantization 🗜️ Exllama",
+        custom_data=CUSTOM_KERNELS_DATA,
+        color="Quantization 🗜️ Custom Kernel",
         points="all",
     )
     # add hover data
     prefill_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
+        )
     )
     # add layout
     prefill_fig.update_layout(
@@ -134,12 +161,12 @@ def get_exllama_prefill_fig(llm_perf_df):
     return prefill_fig


-def create_exllama_plots(llm_perf_df):
+def create_custom_kernels_plots(llm_perf_df):
     # descriptive text
     gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
     # get figures
-    prefill_fig = get_exllama_prefill_fig(llm_perf_df)
-    decode_fig = get_exllama_decode_fig(llm_perf_df)
+    prefill_fig = get_custom_kernels_prefill_fig(llm_perf_df)
+    decode_fig = get_custom_kernels_decode_fig(llm_perf_df)

     # create plots
     prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
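The heart of get_custom_kernels_df is a per-model merge of the float16 PyTorch baseline with each kernel's rows, suffixing the kernel columns with " Custom Kernel", followed by two percentage speedups: baseline prefill latency over kernel prefill latency, and kernel decode throughput over baseline decode throughput, each scaled by 100 and shifted down by 100 so that 0 means no change. A toy sketch of that computation with shortened column names and made-up numbers:

import pandas as pd

# hypothetical baseline (float16 pytorch) and kernel (e.g. GPTQ.4bit+ExllamaV2) measurements
baseline = pd.DataFrame({"Model": ["llama-7b"], "Prefill (s)": [0.50], "Decode (tok/s)": [30.0]})
kernel = pd.DataFrame({"Model": ["llama-7b"], "Prefill (s)": [0.40], "Decode (tok/s)": [60.0]})

# merge on the model name, suffixing the kernel columns as in get_custom_kernels_df
merged = pd.merge(baseline, kernel, on=["Model"], suffixes=["", " Custom Kernel"])

# speedups as defined in the diff: ratio * 100 - 100, so 0 means "same speed"
merged["Prefill Latency Speedup (%)"] = (
    (merged["Prefill (s)"] / merged["Prefill (s) Custom Kernel"]) * 100
).round(2) - 100
merged["Decode Throughput Speedup (%)"] = (
    (merged["Decode (tok/s) Custom Kernel"] / merged["Decode (tok/s)"]) * 100
).round(2) - 100

print(merged[["Model", "Prefill Latency Speedup (%)", "Decode Throughput Speedup (%)"]])
# prints 25.0 (prefill is 25% faster) and 100.0 (decode throughput doubled)

The subsequent filters in the diff simply drop rows whose computed speedup exceeds 1000%, guarding the plots against pathological baseline/kernel pairings.
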
src/llm_perf.py CHANGED

@@ -38,14 +38,16 @@ SORTING_ASCENDING = [False, True, False]


 def get_llm_df():
-    hf_hub_download(
-        repo_id=LLM_PERF_DATASET_REPO,
-        filename="open-llm.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
-    llm_df = pd.read_csv("dataset/open-llm.csv")
+    # commented for now since scraping script is not working
+    # hf_hub_download(
+    #     repo_id=LLM_PERF_DATASET_REPO,
+    #     filename="open-llm.csv",
+    #     local_dir="dataset",
+    #     repo_type="dataset",
+    #     token=HF_TOKEN,
+    # )
+    # llm_df = pd.read_csv("dataset/open-llm.csv")
+    llm_df = pd.read_csv("https://huggingface.co/datasets/optimum/llm-perf-dataset/raw/e8628583f0c31457cd5f8b81352735263117fbb4/open-llm.csv")

     return llm_df
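The replacement read_csv call pins open-llm.csv to a fixed revision of the optimum/llm-perf-dataset repository through its raw URL. If the hf_hub_download path is restored once the scraping script works again, the same pinning could presumably be achieved with the function's revision argument; a sketch under that assumption, with the repo id and commit hash taken from the URL above:

import pandas as pd
from huggingface_hub import hf_hub_download

def get_llm_df() -> pd.DataFrame:
    # download open-llm.csv at a fixed dataset revision instead of reading the raw URL
    csv_path = hf_hub_download(
        repo_id="optimum/llm-perf-dataset",
        filename="open-llm.csv",
        repo_type="dataset",
        revision="e8628583f0c31457cd5f8b81352735263117fbb4",
    )
    return pd.read_csv(csv_path)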