IlyasMoutawwakil HF staff commited on
Commit
0232cf1
β€’
1 Parent(s): a8a6326
app.py CHANGED
@@ -35,7 +35,6 @@ with demo:
35
  (
36
  filter_button,
37
  machine_textbox,
38
- search_bar,
39
  score_slider,
40
  memory_slider,
41
  backend_checkboxes,
@@ -48,7 +47,7 @@ with demo:
48
  llm_perf_df = get_llm_perf_df(machine=machine)
49
  ####################### LEADERBOARD TAB #######################
50
  with gr.TabItem("Leaderboard πŸ…", id=0):
51
- leaderboard_table, columns_checkboxes = create_leaderboard_table(llm_perf_df)
52
  lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
53
  ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
54
  with gr.TabItem("BetterTransformer πŸ“ˆ", id=2):
@@ -63,14 +62,15 @@ with demo:
63
  filter_button,
64
  # inputs
65
  machine_textbox,
66
- search_bar,
67
  score_slider,
68
  memory_slider,
69
  backend_checkboxes,
70
  datatype_checkboxes,
71
  optimization_checkboxes,
72
  quantization_checkboxes,
 
73
  columns_checkboxes,
 
74
  # outputs
75
  leaderboard_table,
76
  lat_score_mem_plot,
@@ -85,7 +85,9 @@ with demo:
85
  create_select_callback(
86
  # inputs
87
  machine_textbox,
 
88
  columns_checkboxes,
 
89
  # outputs
90
  leaderboard_table,
91
  )
 
35
  (
36
  filter_button,
37
  machine_textbox,
 
38
  score_slider,
39
  memory_slider,
40
  backend_checkboxes,
 
47
  llm_perf_df = get_llm_perf_df(machine=machine)
48
  ####################### LEADERBOARD TAB #######################
49
  with gr.TabItem("Leaderboard πŸ…", id=0):
50
+ search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
51
  lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
52
  ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
53
  with gr.TabItem("BetterTransformer πŸ“ˆ", id=2):
 
62
  filter_button,
63
  # inputs
64
  machine_textbox,
 
65
  score_slider,
66
  memory_slider,
67
  backend_checkboxes,
68
  datatype_checkboxes,
69
  optimization_checkboxes,
70
  quantization_checkboxes,
71
+ # interactive
72
  columns_checkboxes,
73
+ search_bar,
74
  # outputs
75
  leaderboard_table,
76
  lat_score_mem_plot,
 
85
  create_select_callback(
86
  # inputs
87
  machine_textbox,
88
+ # interactive
89
  columns_checkboxes,
90
+ search_bar,
91
  # outputs
92
  leaderboard_table,
93
  )
src/bettertransformer.py CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
6
  BETTERTRANSFORMER_DATA = [
7
  # open llm
8
  "Model πŸ€—",
9
- "Arch πŸ›οΈ",
10
  "DType πŸ“₯",
11
  "Backend 🏭",
12
  "Params (B)",
 
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
@@ -18,15 +18,15 @@ BETTERTRANSFORMER_DATA = [
18
  "Quantization πŸ—œοΈ",
19
  "Optimization πŸ› οΈ BetterTransformer",
20
  # primary measurements
21
- "Prefill Latency (s)",
22
- "Prefill Latency (s) BetterTransformer",
23
- "Decode Throughput (tokens/s)",
24
- "Decode Throughput (tokens/s) BetterTransformer",
25
- "E2E Throughput (tokens/s)",
26
- "E2E Throughput (tokens/s) BetterTransformer",
27
  # speedups
28
- "Prefill Latency Speedup (%)",
29
- "Decode Throughput Speedup (%)",
30
  ]
31
 
32
 
@@ -43,15 +43,15 @@ def get_bt_df(llm_perf_df):
43
  suffixes=["", " BetterTransformer"],
44
  )
45
  # compute speedups
46
- bt_df["Prefill Latency Speedup (%)"] = (
47
- (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
48
  ).round(2) - 100
49
- bt_df["Decode Throughput Speedup (%)"] = (
50
- (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
51
  ).round(2) - 100
52
  # filter speedups > 1000%
53
- bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
54
- bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
55
 
56
  return bt_df
57
 
@@ -61,8 +61,8 @@ def get_bt_prefill_fig(llm_perf_df):
61
  # plot
62
  prefill_fig = px.box(
63
  bt_df,
64
- x="Arch πŸ›οΈ",
65
- y="Prefill Latency Speedup (%)",
66
  color_discrete_sequence=px.colors.qualitative.Light24,
67
  custom_data=BETTERTRANSFORMER_DATA,
68
  color="Quantization πŸ—œοΈ",
@@ -77,7 +77,7 @@ def get_bt_prefill_fig(llm_perf_df):
77
  # add layout
78
  prefill_fig.update_layout(
79
  title={
80
- "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
81
  "y": 0.95,
82
  "x": 0.5,
83
  "xanchor": "center",
@@ -98,8 +98,8 @@ def get_bt_decode_fig(llm_perf_df):
98
  # plot
99
  decode_fig = px.box(
100
  bt_df,
101
- x="Arch πŸ›οΈ",
102
- y="Decode Throughput Speedup (%)",
103
  color_discrete_sequence=px.colors.qualitative.Light24,
104
  custom_data=BETTERTRANSFORMER_DATA,
105
  color="Quantization πŸ—œοΈ",
@@ -114,7 +114,7 @@ def get_bt_decode_fig(llm_perf_df):
114
  # add layout
115
  decode_fig.update_layout(
116
  title={
117
- "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
118
  "y": 0.95,
119
  "x": 0.5,
120
  "xanchor": "center",
 
6
  BETTERTRANSFORMER_DATA = [
7
  # open llm
8
  "Model πŸ€—",
 
9
  "DType πŸ“₯",
10
  "Backend 🏭",
11
  "Params (B)",
12
+ "Architecture πŸ›οΈ",
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
 
18
  "Quantization πŸ—œοΈ",
19
  "Optimization πŸ› οΈ BetterTransformer",
20
  # primary measurements
21
+ "Prefill (s)",
22
+ "Prefill (s) BetterTransformer",
23
+ "Decode (tokens/s)",
24
+ "Decode (tokens/s) BetterTransformer",
25
+ "End-to-End (tokens/s)",
26
+ "End-to-End (tokens/s) BetterTransformer",
27
  # speedups
28
+ "Prefill Speedup (%)",
29
+ "Decode Speedup (%)",
30
  ]
31
 
32
 
 
43
  suffixes=["", " BetterTransformer"],
44
  )
45
  # compute speedups
46
+ bt_df["Prefill Speedup (%)"] = (
47
+ (bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
48
  ).round(2) - 100
49
+ bt_df["Decode Speedup (%)"] = (
50
+ (bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
51
  ).round(2) - 100
52
  # filter speedups > 1000%
53
+ bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
54
+ bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
55
 
56
  return bt_df
57
 
 
61
  # plot
62
  prefill_fig = px.box(
63
  bt_df,
64
+ x="Architecture πŸ›οΈ",
65
+ y="Prefill Speedup (%)",
66
  color_discrete_sequence=px.colors.qualitative.Light24,
67
  custom_data=BETTERTRANSFORMER_DATA,
68
  color="Quantization πŸ—œοΈ",
 
77
  # add layout
78
  prefill_fig.update_layout(
79
  title={
80
+ "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
81
  "y": 0.95,
82
  "x": 0.5,
83
  "xanchor": "center",
 
98
  # plot
99
  decode_fig = px.box(
100
  bt_df,
101
+ x="Architecture πŸ›οΈ",
102
+ y="Decode Speedup (%)",
103
  color_discrete_sequence=px.colors.qualitative.Light24,
104
  custom_data=BETTERTRANSFORMER_DATA,
105
  color="Quantization πŸ—œοΈ",
 
114
  # add layout
115
  decode_fig.update_layout(
116
  title={
117
+ "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
118
  "y": 0.95,
119
  "x": 0.5,
120
  "xanchor": "center",
src/control_panel.py CHANGED
@@ -12,13 +12,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
12
  # controls
13
  machine_textbox = gr.Textbox(value=machine, visible=False)
14
  with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
15
- with gr.Row():
16
- with gr.Column():
17
- search_bar = gr.Textbox(
18
- label="Model πŸ€—",
19
- info="πŸ” Search for a model name",
20
- elem_id="search-bar",
21
- )
22
  with gr.Row():
23
  with gr.Column(scale=1, variant="panel"):
24
  score_slider = gr.Slider(
@@ -98,7 +91,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
98
  return (
99
  filter_button,
100
  machine_textbox,
101
- search_bar,
102
  score_slider,
103
  memory_slider,
104
  backend_checkboxes,
@@ -110,27 +102,28 @@ def create_control_panel(machine: str = "hf-dgx-01"):
110
 
111
  def filter_fn(
112
  machine,
113
- model,
 
 
114
  backends,
115
  datatypes,
116
  optimizations,
117
  quantizations,
 
118
  columns,
119
- score,
120
- memory,
121
  ):
122
  raw_df = get_llm_perf_df(machine=machine)
123
  filtered_df = raw_df[
124
- raw_df["Model πŸ€—"].str.contains(model, case=False)
125
- & raw_df["Backend 🏭"].isin(backends)
126
  & raw_df["DType πŸ“₯"].isin(datatypes)
127
  & raw_df["Optimization πŸ› οΈ"].isin(optimizations)
128
  & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
129
  & (raw_df["Open LLM Score (%)"] >= score)
130
  & (raw_df["Allocated Memory (MB)"] <= memory)
131
  ]
132
- filtered_leaderboard_df = get_leaderboard_df(filtered_df)
133
- filtered_leaderboard_df = filtered_leaderboard_df[columns]
134
  filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
135
  filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
136
  filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
@@ -154,16 +147,18 @@ def filter_fn(
154
  def create_control_callback(
155
  # button
156
  filter_button,
157
- # inputs
158
  machine_textbox,
159
- search_bar,
160
  score_slider,
161
  memory_slider,
162
  backend_checkboxes,
163
  datatype_checkboxes,
164
  optimization_checkboxes,
165
  quantization_checkboxes,
 
166
  columns_checkboxes,
 
167
  # outputs
168
  leaderboard_table,
169
  lat_score_mem_plot,
@@ -177,15 +172,18 @@ def create_control_callback(
177
  filter_button.click(
178
  fn=filter_fn,
179
  inputs=[
 
180
  machine_textbox,
181
- search_bar,
 
 
182
  backend_checkboxes,
183
  datatype_checkboxes,
184
  optimization_checkboxes,
185
  quantization_checkboxes,
 
186
  columns_checkboxes,
187
- score_slider,
188
- memory_slider,
189
  ],
190
  outputs=[
191
  leaderboard_table,
@@ -200,23 +198,33 @@ def create_control_callback(
200
  )
201
 
202
 
203
- def select_fn(machine, columns):
204
  raw_df = get_llm_perf_df(machine=machine)
205
  selected_leaderboard_df = get_leaderboard_df(raw_df)
206
  selected_leaderboard_df = selected_leaderboard_df[columns]
 
 
 
207
 
208
  return selected_leaderboard_df
209
 
210
 
211
  def create_select_callback(
212
- # inputs
213
  machine_textbox,
 
214
  columns_checkboxes,
 
215
  # outputs
216
  leaderboard_table,
217
  ):
218
  columns_checkboxes.change(
219
  fn=select_fn,
220
- inputs=[machine_textbox, columns_checkboxes],
 
 
 
 
 
221
  outputs=[leaderboard_table],
222
  )
 
12
  # controls
13
  machine_textbox = gr.Textbox(value=machine, visible=False)
14
  with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
 
 
 
 
 
 
 
15
  with gr.Row():
16
  with gr.Column(scale=1, variant="panel"):
17
  score_slider = gr.Slider(
 
91
  return (
92
  filter_button,
93
  machine_textbox,
 
94
  score_slider,
95
  memory_slider,
96
  backend_checkboxes,
 
102
 
103
  def filter_fn(
104
  machine,
105
+ # inputs
106
+ score,
107
+ memory,
108
  backends,
109
  datatypes,
110
  optimizations,
111
  quantizations,
112
+ # interactive
113
  columns,
114
+ search,
 
115
  ):
116
  raw_df = get_llm_perf_df(machine=machine)
117
  filtered_df = raw_df[
118
+ # raw_df["Model πŸ€—"].str.contains(model, case=False)
119
+ raw_df["Backend 🏭"].isin(backends)
120
  & raw_df["DType πŸ“₯"].isin(datatypes)
121
  & raw_df["Optimization πŸ› οΈ"].isin(optimizations)
122
  & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
123
  & (raw_df["Open LLM Score (%)"] >= score)
124
  & (raw_df["Allocated Memory (MB)"] <= memory)
125
  ]
126
+ filtered_leaderboard_df = select_fn(machine, columns, search)
 
127
  filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
128
  filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
129
  filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
 
147
  def create_control_callback(
148
  # button
149
  filter_button,
150
+ # fixed
151
  machine_textbox,
152
+ # inputs
153
  score_slider,
154
  memory_slider,
155
  backend_checkboxes,
156
  datatype_checkboxes,
157
  optimization_checkboxes,
158
  quantization_checkboxes,
159
+ # interactive
160
  columns_checkboxes,
161
+ search_bar,
162
  # outputs
163
  leaderboard_table,
164
  lat_score_mem_plot,
 
172
  filter_button.click(
173
  fn=filter_fn,
174
  inputs=[
175
+ # fixed
176
  machine_textbox,
177
+ # inputs
178
+ score_slider,
179
+ memory_slider,
180
  backend_checkboxes,
181
  datatype_checkboxes,
182
  optimization_checkboxes,
183
  quantization_checkboxes,
184
+ # interactive
185
  columns_checkboxes,
186
+ search_bar,
 
187
  ],
188
  outputs=[
189
  leaderboard_table,
 
198
  )
199
 
200
 
201
+ def select_fn(machine, columns, search):
202
  raw_df = get_llm_perf_df(machine=machine)
203
  selected_leaderboard_df = get_leaderboard_df(raw_df)
204
  selected_leaderboard_df = selected_leaderboard_df[columns]
205
+ selected_leaderboard_df = selected_leaderboard_df[
206
+ selected_leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
207
+ ]
208
 
209
  return selected_leaderboard_df
210
 
211
 
212
  def create_select_callback(
213
+ # fixed
214
  machine_textbox,
215
+ # interactive
216
  columns_checkboxes,
217
+ search_bar,
218
  # outputs
219
  leaderboard_table,
220
  ):
221
  columns_checkboxes.change(
222
  fn=select_fn,
223
+ inputs=[machine_textbox, columns_checkboxes, search_bar],
224
+ outputs=[leaderboard_table],
225
+ )
226
+ search_bar.change(
227
+ fn=select_fn,
228
+ inputs=[machine_textbox, columns_checkboxes, search_bar],
229
  outputs=[leaderboard_table],
230
  )
src/flashattentionv2.py CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
6
  FLASHATTENTIONV2_DATA = [
7
  # open llm
8
  "Model πŸ€—",
9
- "Arch πŸ›οΈ",
10
  "DType πŸ“₯",
11
  "Backend 🏭",
12
  "Params (B)",
 
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
@@ -18,15 +18,15 @@ FLASHATTENTIONV2_DATA = [
18
  "Quantization πŸ—œοΈ",
19
  "Optimization πŸ› οΈ FlashAttentionV2",
20
  # primary measurements
21
- "Prefill Latency (s)",
22
- "Prefill Latency (s) FlashAttentionV2",
23
- "Decode Throughput (tokens/s)",
24
- "Decode Throughput (tokens/s) FlashAttentionV2",
25
- "E2E Throughput (tokens/s)",
26
- "E2E Throughput (tokens/s) FlashAttentionV2",
27
  # speedups
28
- "Prefill Latency Speedup (%)",
29
- "Decode Throughput Speedup (%)",
30
  ]
31
 
32
 
@@ -43,15 +43,15 @@ def get_fa2_df(llm_perf_df):
43
  suffixes=["", " FlashAttentionV2"],
44
  )
45
  # compute speedups
46
- fa2_df["Prefill Latency Speedup (%)"] = (
47
- (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
48
- ).round(2) - 100
49
- fa2_df["Decode Throughput Speedup (%)"] = (
50
- (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
51
  ).round(2) - 100
52
  # filter speedups > 1000%
53
- fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
54
- fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]
55
 
56
  return fa2_df
57
 
@@ -61,8 +61,8 @@ def get_fa2_decode_fig(llm_perf_df):
61
  # plot
62
  decode_fig = px.box(
63
  fa2_df,
64
- x="Arch πŸ›οΈ",
65
- y="Decode Throughput Speedup (%)",
66
  color_discrete_sequence=px.colors.qualitative.Light24,
67
  custom_data=FLASHATTENTIONV2_DATA,
68
  color="Quantization πŸ—œοΈ",
@@ -77,7 +77,7 @@ def get_fa2_decode_fig(llm_perf_df):
77
  # add layout
78
  decode_fig.update_layout(
79
  title={
80
- "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
81
  "y": 0.95,
82
  "x": 0.5,
83
  "xanchor": "center",
@@ -98,8 +98,8 @@ def get_fa2_prefill_fig(llm_perf_df):
98
  # plot
99
  prefill_fig = px.box(
100
  fa2_df,
101
- x="Arch πŸ›οΈ",
102
- y="Prefill Latency Speedup (%)",
103
  color_discrete_sequence=px.colors.qualitative.Light24,
104
  custom_data=FLASHATTENTIONV2_DATA,
105
  color="Quantization πŸ—œοΈ",
@@ -114,7 +114,7 @@ def get_fa2_prefill_fig(llm_perf_df):
114
  # add layout
115
  prefill_fig.update_layout(
116
  title={
117
- "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
118
  "y": 0.95,
119
  "x": 0.5,
120
  "xanchor": "center",
 
6
  FLASHATTENTIONV2_DATA = [
7
  # open llm
8
  "Model πŸ€—",
 
9
  "DType πŸ“₯",
10
  "Backend 🏭",
11
  "Params (B)",
12
+ "Architecture πŸ›οΈ",
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
 
18
  "Quantization πŸ—œοΈ",
19
  "Optimization πŸ› οΈ FlashAttentionV2",
20
  # primary measurements
21
+ "Prefill (s)",
22
+ "Prefill (s) FlashAttentionV2",
23
+ "Decode (tokens/s)",
24
+ "Decode (tokens/s) FlashAttentionV2",
25
+ "End-to-End (tokens/s)",
26
+ "End-to-End (tokens/s) FlashAttentionV2",
27
  # speedups
28
+ "Prefill Speedup (%)",
29
+ "Decode Speedup (%)",
30
  ]
31
 
32
 
 
43
  suffixes=["", " FlashAttentionV2"],
44
  )
45
  # compute speedups
46
+ fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
47
+ 2
48
+ ) - 100
49
+ fa2_df["Decode Speedup (%)"] = (
50
+ (fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
51
  ).round(2) - 100
52
  # filter speedups > 1000%
53
+ fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
54
+ fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
55
 
56
  return fa2_df
57
 
 
61
  # plot
62
  decode_fig = px.box(
63
  fa2_df,
64
+ x="Architecture πŸ›οΈ",
65
+ y="Decode Speedup (%)",
66
  color_discrete_sequence=px.colors.qualitative.Light24,
67
  custom_data=FLASHATTENTIONV2_DATA,
68
  color="Quantization πŸ—œοΈ",
 
77
  # add layout
78
  decode_fig.update_layout(
79
  title={
80
+ "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
81
  "y": 0.95,
82
  "x": 0.5,
83
  "xanchor": "center",
 
98
  # plot
99
  prefill_fig = px.box(
100
  fa2_df,
101
+ x="Architecture πŸ›οΈ",
102
+ y="Prefill Speedup (%)",
103
  color_discrete_sequence=px.colors.qualitative.Light24,
104
  custom_data=FLASHATTENTIONV2_DATA,
105
  color="Quantization πŸ—œοΈ",
 
114
  # add layout
115
  prefill_fig.update_layout(
116
  title={
117
+ "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
118
  "y": 0.95,
119
  "x": 0.5,
120
  "xanchor": "center",
src/latency_score_memory.py CHANGED
@@ -4,18 +4,18 @@ import plotly.express as px
4
 
5
  SCORE_MEMORY_LATENCY_DATA = [
6
  "Model πŸ€—",
7
- "Arch πŸ›οΈ",
8
- "Params (B)",
9
  "DType πŸ“₯",
10
  "Backend 🏭",
 
 
11
  "Optimization πŸ› οΈ",
12
  "Quantization πŸ—œοΈ",
13
  "Open LLM Score (%)",
14
- "Prefill Latency (s)",
15
- "Decode Throughput (tokens/s)",
16
- "Allocated Memory (MB)",
17
- "E2E Latency (s)",
18
- # "E2E Throughput (tokens/s)",
19
  ]
20
 
21
 
@@ -24,10 +24,10 @@ def get_lat_score_mem_fig(llm_perf_df):
24
  # plot
25
  fig = px.scatter(
26
  copy_df,
27
- x="E2E Latency (s)",
28
  y="Open LLM Score (%)",
29
- size="Allocated Memory (MB)",
30
- color="Arch πŸ›οΈ",
31
  custom_data=SCORE_MEMORY_LATENCY_DATA,
32
  color_discrete_sequence=px.colors.qualitative.Light24,
33
  )
@@ -38,7 +38,7 @@ def get_lat_score_mem_fig(llm_perf_df):
38
  )
39
  fig.update_layout(
40
  title={
41
- "text": "Latency vs. Score vs. Memory",
42
  "y": 0.95,
43
  "x": 0.5,
44
  "xanchor": "center",
@@ -56,7 +56,7 @@ def get_lat_score_mem_fig(llm_perf_df):
56
 
57
  def create_lat_score_mem_plot(llm_perf_df):
58
  # descriptive text
59
- gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information. ",elem_id="text")
60
  # get figure
61
  fig = get_lat_score_mem_fig(llm_perf_df)
62
  # create plot
 
4
 
5
  SCORE_MEMORY_LATENCY_DATA = [
6
  "Model πŸ€—",
 
 
7
  "DType πŸ“₯",
8
  "Backend 🏭",
9
+ "Params (B)",
10
+ "Architecture πŸ›οΈ",
11
  "Optimization πŸ› οΈ",
12
  "Quantization πŸ—œοΈ",
13
  "Open LLM Score (%)",
14
+ "Prefill (s)",
15
+ "Decode (tokens/s)",
16
+ "Memory (MB)",
17
+ "End-to-End (s)",
18
+ # "End-to-End (tokens/s)",
19
  ]
20
 
21
 
 
24
  # plot
25
  fig = px.scatter(
26
  copy_df,
27
+ x="End-to-End (s)",
28
  y="Open LLM Score (%)",
29
+ size="Memory (MB)",
30
+ color="Architecture πŸ›οΈ",
31
  custom_data=SCORE_MEMORY_LATENCY_DATA,
32
  color_discrete_sequence=px.colors.qualitative.Light24,
33
  )
 
38
  )
39
  fig.update_layout(
40
  title={
41
+ "text": "vs. Score vs. Memory",
42
  "y": 0.95,
43
  "x": 0.5,
44
  "xanchor": "center",
 
56
 
57
  def create_lat_score_mem_plot(llm_perf_df):
58
  # descriptive text
59
+ gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information. ", elem_id="text")
60
  # get figure
61
  fig = get_lat_score_mem_fig(llm_perf_df)
62
  # create plot
src/leaderboard.py CHANGED
@@ -8,9 +8,9 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
8
  "Model πŸ€—": "markdown",
9
  "Experiment πŸ§ͺ": "str",
10
  # primary measurements
11
- "Prefill Latency (s)": "number",
12
- "Decode Throughput (tokens/s)": "number",
13
- "Allocated Memory (MB)": "number",
14
  "Energy (tokens/kWh)": "number",
15
  # deployment settings
16
  "DType πŸ“₯": "str",
@@ -18,15 +18,25 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
18
  "Optimization πŸ› οΈ": "str",
19
  "Quantization πŸ—œοΈ": "str",
20
  # additional measurements
21
- "Arch πŸ›οΈ": "markdown",
22
  "Params (B)": "number",
23
  "Open LLM Score (%)": "number",
24
- "E2E Latency (s)": "number",
25
- "E2E Throughput (tokens/s)": "number",
26
  "Reserved Memory (MB)": "number",
27
  "Used Memory (MB)": "number",
28
  }
29
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def process_model(model_name):
32
  link = f"https://huggingface.co/{model_name}"
@@ -48,20 +58,29 @@ def get_leaderboard_df(llm_perf_df):
48
  def create_leaderboard_table(llm_perf_df):
49
  # get dataframe
50
  leaderboard_df = get_leaderboard_df(llm_perf_df)
 
 
 
 
 
 
 
 
51
  # create checkboxes
52
- columns_checkboxes = gr.CheckboxGroup(
53
- label="Columns πŸ“Š",
54
- choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
55
- value=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
56
- info="β˜‘οΈ Select the columns to display",
57
- elem_id="columns-checkboxes",
58
- )
 
59
  # create table
60
  leaderboard_table = gr.components.Dataframe(
61
- value=leaderboard_df,
62
  datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
63
  headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
64
  elem_id="leaderboard-table",
65
  )
66
 
67
- return leaderboard_table, columns_checkboxes
 
8
  "Model πŸ€—": "markdown",
9
  "Experiment πŸ§ͺ": "str",
10
  # primary measurements
11
+ "Prefill (s)": "number",
12
+ "Decode (tokens/s)": "number",
13
+ "Memory (MB)": "number",
14
  "Energy (tokens/kWh)": "number",
15
  # deployment settings
16
  "DType πŸ“₯": "str",
 
18
  "Optimization πŸ› οΈ": "str",
19
  "Quantization πŸ—œοΈ": "str",
20
  # additional measurements
21
+ "Architecture πŸ›οΈ": "markdown",
22
  "Params (B)": "number",
23
  "Open LLM Score (%)": "number",
24
+ "End-to-End (s)": "number",
25
+ "End-to-End (tokens/s)": "number",
26
  "Reserved Memory (MB)": "number",
27
  "Used Memory (MB)": "number",
28
  }
29
 
30
+ PRIMARY_COLUMNS = [
31
+ "Model πŸ€—",
32
+ "Experiment πŸ§ͺ",
33
+ "Prefill (s)",
34
+ "Decode (tokens/s)",
35
+ "Memory (MB)",
36
+ "Energy (tokens/kWh)",
37
+ "Open LLM Score (%)",
38
+ ]
39
+
40
 
41
  def process_model(model_name):
42
  link = f"https://huggingface.co/{model_name}"
 
58
  def create_leaderboard_table(llm_perf_df):
59
  # get dataframe
60
  leaderboard_df = get_leaderboard_df(llm_perf_df)
61
+
62
+ # create search bar
63
+ with gr.Row():
64
+ search_bar = gr.Textbox(
65
+ label="Model πŸ€—",
66
+ info="πŸ” Search for a model name",
67
+ elem_id="search-bar",
68
+ )
69
  # create checkboxes
70
+ with gr.Row():
71
+ columns_checkboxes = gr.CheckboxGroup(
72
+ label="Columns πŸ“Š",
73
+ value=PRIMARY_COLUMNS,
74
+ choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
75
+ info="β˜‘οΈ Select the columns to display",
76
+ elem_id="columns-checkboxes",
77
+ )
78
  # create table
79
  leaderboard_table = gr.components.Dataframe(
80
+ value=leaderboard_df[PRIMARY_COLUMNS],
81
  datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
82
  headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
83
  elem_id="leaderboard-table",
84
  )
85
 
86
+ return search_bar, columns_checkboxes, leaderboard_table
src/llm_perf.py CHANGED
@@ -12,9 +12,9 @@ COLUMNS_MAPPING = {
12
  "Model": "Model πŸ€—",
13
  "experiment_name": "Experiment πŸ§ͺ",
14
  # primary measurements
15
- "forward.latency(s)": "Prefill Latency (s)",
16
- "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
17
- "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
18
  "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
19
  # deployment settings
20
  "backend.name": "Backend 🏭",
@@ -22,18 +22,18 @@ COLUMNS_MAPPING = {
22
  "optimization": "Optimization πŸ› οΈ",
23
  "quantization": "Quantization πŸ—œοΈ",
24
  # additional measurements
25
- "Arch": "Arch πŸ›οΈ",
26
  "Size": "Params (B)",
 
27
  "Score": "Open LLM Score (%)",
28
- "generate.latency(s)": "E2E Latency (s)",
29
- "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
30
  "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
31
  "generate.max_memory_used(MB)": "Used Memory (MB)",
32
  }
33
  SORTING_COLUMNS = [
34
  "Open LLM Score (%)",
35
- "Prefill Latency (s)",
36
- "Decode Throughput (tokens/s)",
37
  ]
38
  SORTING_ASCENDING = [False, True, False]
39
 
@@ -107,6 +107,13 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
107
  ].apply(lambda x: process_quantization_scheme(x), axis=1)
108
  # process experiment name
109
  llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
 
 
 
 
 
 
 
110
  # add arch
111
  llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
112
  # filter columns
 
12
  "Model": "Model πŸ€—",
13
  "experiment_name": "Experiment πŸ§ͺ",
14
  # primary measurements
15
+ "forward.latency(s)": "Prefill (s)",
16
+ "decode.throughput(tokens/s)": "Decode (tokens/s)",
17
+ "generate.max_memory_allocated(MB)": "Memory (MB)",
18
  "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
19
  # deployment settings
20
  "backend.name": "Backend 🏭",
 
22
  "optimization": "Optimization πŸ› οΈ",
23
  "quantization": "Quantization πŸ—œοΈ",
24
  # additional measurements
 
25
  "Size": "Params (B)",
26
+ "Arch": "Architecture πŸ›οΈ",
27
  "Score": "Open LLM Score (%)",
28
+ "generate.latency(s)": "End-to-End (s)",
29
+ "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
30
  "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
31
  "generate.max_memory_used(MB)": "Used Memory (MB)",
32
  }
33
  SORTING_COLUMNS = [
34
  "Open LLM Score (%)",
35
+ "Decode (tokens/s)",
36
+ "Prefill (s)",
37
  ]
38
  SORTING_ASCENDING = [False, False, True]
39
 
 
107
  ].apply(lambda x: process_quantization_scheme(x), axis=1)
108
  # process experiment name
109
  llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
110
+ llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
111
+ lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
112
+ )
113
+ llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "awq"))
114
+ llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "gptq"))
115
+ llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "sdpa"))
116
+ llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA2"))
117
  # add arch
118
  llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
119
  # filter columns
src/quantization_kernels.py CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
6
  QUANT_DATA = [
7
  # open llm
8
  "Model πŸ€—",
9
- "Arch πŸ›οΈ",
10
  "DType πŸ“₯",
11
  "Backend 🏭",
12
  "Params (B)",
 
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
@@ -19,13 +19,13 @@ QUANT_DATA = [
19
  "Optimization πŸ› οΈ Custom Kernel",
20
  "Quantization πŸ—œοΈ Custom Kernel",
21
  # primary measurements
22
- "Prefill Latency (s)",
23
- "Prefill Latency (s) Custom Kernel",
24
- "Decode Throughput (tokens/s)",
25
- "Decode Throughput (tokens/s) Custom Kernel",
26
  # speedups
27
- "Prefill Latency Speedup (%)",
28
- "Decode Throughput Speedup (%)",
29
  ]
30
 
31
 
@@ -33,10 +33,10 @@ def get_quant_df(llm_perf_df):
33
  copy_df = llm_perf_df.copy()
34
  # separate vanilla GPTQ experiments from Custom Kernel experiments
35
  vanilla_df = copy_df[
36
- (copy_df["Backend 🏭"] == "pytorch") &
37
- (copy_df["Quantization πŸ—œοΈ"] == "None") &
38
- (copy_df["Optimization πŸ› οΈ"] == "None") &
39
- (copy_df["DType πŸ“₯"] == "float16")
40
  ]
41
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
42
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
@@ -70,15 +70,15 @@ def get_quant_df(llm_perf_df):
70
  # concat the two dataframes row-wise
71
  quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
72
  # compute speedups
73
- quant_df["Prefill Latency Speedup (%)"] = (
74
- (quant_df["Prefill Latency (s)"] / quant_df["Prefill Latency (s) Custom Kernel"]) * 100
75
- ).round(2) - 100
76
- quant_df["Decode Throughput Speedup (%)"] = (
77
- (quant_df["Decode Throughput (tokens/s) Custom Kernel"] / quant_df["Decode Throughput (tokens/s)"]) * 100
78
  ).round(2) - 100
79
  # filter speedups > 1000%
80
- quant_df = quant_df[quant_df["Prefill Latency Speedup (%)"] < 1000]
81
- quant_df = quant_df[quant_df["Decode Throughput Speedup (%)"] < 1000]
82
 
83
  return quant_df
84
 
@@ -88,8 +88,8 @@ def get_quant_decode_fig(llm_perf_df):
88
  # plot
89
  decode_fig = px.box(
90
  quant_df,
91
- x="Arch πŸ›οΈ",
92
- y="Decode Throughput Speedup (%)",
93
  color_discrete_sequence=px.colors.qualitative.Light24,
94
  custom_data=QUANT_DATA,
95
  color="Quantization πŸ—œοΈ Custom Kernel",
@@ -102,7 +102,7 @@ def get_quant_decode_fig(llm_perf_df):
102
  # add layout
103
  decode_fig.update_layout(
104
  title={
105
- "text": "Decode Throughput Speedup per Architecture",
106
  "y": 0.95,
107
  "x": 0.5,
108
  "xanchor": "center",
@@ -123,8 +123,8 @@ def get_quant_prefill_fig(llm_perf_df):
123
  # plot
124
  prefill_fig = px.box(
125
  quant_df,
126
- x="Arch πŸ›οΈ",
127
- y="Prefill Latency Speedup (%)",
128
  color_discrete_sequence=px.colors.qualitative.Light24,
129
  custom_data=QUANT_DATA,
130
  color="Quantization πŸ—œοΈ Custom Kernel",
@@ -137,7 +137,7 @@ def get_quant_prefill_fig(llm_perf_df):
137
  # add layout
138
  prefill_fig.update_layout(
139
  title={
140
- "text": "Prefill Latency Speedup per Architecture",
141
  "y": 0.95,
142
  "x": 0.5,
143
  "xanchor": "center",
 
6
  QUANT_DATA = [
7
  # open llm
8
  "Model πŸ€—",
 
9
  "DType πŸ“₯",
10
  "Backend 🏭",
11
  "Params (B)",
12
+ "Architecture πŸ›οΈ",
13
  "Open LLM Score (%)",
14
  # deployment settings
15
  "DType πŸ“₯",
 
19
  "Optimization πŸ› οΈ Custom Kernel",
20
  "Quantization πŸ—œοΈ Custom Kernel",
21
  # primary measurements
22
+ "Prefill (s)",
23
+ "Prefill (s) Custom Kernel",
24
+ "Decode (tokens/s)",
25
+ "Decode (tokens/s) Custom Kernel",
26
  # speedups
27
+ "Prefill Speedup (%)",
28
+ "Decode Speedup (%)",
29
  ]
30
 
31
 
 
33
  copy_df = llm_perf_df.copy()
34
  # separate vanilla GPTQ experiments from Custom Kernel experiments
35
  vanilla_df = copy_df[
36
+ (copy_df["Backend 🏭"] == "pytorch")
37
+ & (copy_df["Quantization πŸ—œοΈ"] == "None")
38
+ & (copy_df["Optimization πŸ› οΈ"] == "None")
39
+ & (copy_df["DType πŸ“₯"] == "float16")
40
  ]
41
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
42
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
 
70
  # concat the two dataframes row-wise
71
  quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
72
  # compute speedups
73
+ quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
74
+ 2
75
+ ) - 100
76
+ quant_df["Decode Speedup (%)"] = (
77
+ (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
78
  ).round(2) - 100
79
  # filter speedups > 1000%
80
+ quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
81
+ quant_df = quant_df[quant_df["Decode Speedup (%)"] < 1000]
82
 
83
  return quant_df
84
 
 
88
  # plot
89
  decode_fig = px.box(
90
  quant_df,
91
+ x="Architecture πŸ›οΈ",
92
+ y="Decode Speedup (%)",
93
  color_discrete_sequence=px.colors.qualitative.Light24,
94
  custom_data=QUANT_DATA,
95
  color="Quantization πŸ—œοΈ Custom Kernel",
 
102
  # add layout
103
  decode_fig.update_layout(
104
  title={
105
+ "text": "Decode Speedup per Architecture",
106
  "y": 0.95,
107
  "x": 0.5,
108
  "xanchor": "center",
 
123
  # plot
124
  prefill_fig = px.box(
125
  quant_df,
126
+ x="Architecture πŸ›οΈ",
127
+ y="Prefill Speedup (%)",
128
  color_discrete_sequence=px.colors.qualitative.Light24,
129
  custom_data=QUANT_DATA,
130
  color="Quantization πŸ—œοΈ Custom Kernel",
 
137
  # add layout
138
  prefill_fig.update_layout(
139
  title={
140
+ "text": "Prefill Speedup per Architecture",
141
  "y": 0.95,
142
  "x": 0.5,
143
  "xanchor": "center",