# Source: llm-perf-leaderboard / src / attention.py
# Page header residue: "IlyasMoutawwakil's picture"
# Last commit message: "update"
# Last commit hash: 76b423c
import gradio as gr
import pandas as pd
import plotly.express as px
# Columns describing the model itself (open LLM leaderboard information).
_ATTN_OPEN_LLM_COLUMNS = [
    "Model πŸ€—",
    "Experiment πŸ§ͺ",
    "Params (B)",
    "Architecture πŸ›οΈ",
    "Open LLM Score (%)",
]

# Columns describing how the model was deployed for the benchmark.
_ATTN_DEPLOYMENT_COLUMNS = [
    "Backend 🏭",
    "Quantization πŸ—œοΈ",
    "Precision πŸ“₯",
    "Attention πŸ‘οΈ",
    "Kernel βš›οΈ",
]

# Raw measurements, followed by the speedups derived from them.
_ATTN_MEASUREMENT_COLUMNS = [
    "Prefill (s)",
    "Decode (tokens/s)",
]
_ATTN_SPEEDUP_COLUMNS = [
    "Prefill Speedup (%)",
    "Decode Speedup (%)",
]

# Ordered list of columns passed to the plots as ``custom_data``; the position
# of a column in this list is the index used by the hover template.
ATTN_DATA = [
    *_ATTN_OPEN_LLM_COLUMNS,
    *_ATTN_DEPLOYMENT_COLUMNS,
    *_ATTN_MEASUREMENT_COLUMNS,
    *_ATTN_SPEEDUP_COLUMNS,
]
def get_attn_df(open_llm_perf_df):
    """Pair each Eager-attention row with its SDPA / FAv2 counterparts.

    The input frame is copied, so the caller's data is never modified. Rows
    are matched on the model name and on a combined "Quantization & Kernel"
    key. The merge uses pandas' default inner join, so an Eager row without a
    matching SDPA / FAv2 row (or the other way round) is not part of the
    result.

    Args:
        open_llm_perf_df: DataFrame that contains at least the model,
            quantization, kernel and attention columns plus "Prefill (s)"
            and "Decode (tokens/s)".

    Returns:
        A DataFrame with the SDPA pairs followed by the FAv2 pairs. Columns
        of the Eager row keep their name, columns of the compared row carry
        the suffix " other". Two columns are added, both rounded to two
        decimals:

        * "Prefill Speedup (%)": Eager prefill time relative to the other
          implementation's prefill time, minus 100.
        * "Decode Speedup (%)": the other implementation's decode throughput
          relative to the Eager throughput, minus 100.
    """
    attention_col = "Attention πŸ‘οΈ"
    merge_keys = ["Model πŸ€—", "Quantization & Kernel"]

    copy_df = open_llm_perf_df.copy()
    copy_df["Quantization & Kernel"] = (
        copy_df["Quantization πŸ—œοΈ"] + " & " + copy_df["Kernel βš›οΈ"]
    )

    eager_df = copy_df[copy_df[attention_col] == "Eager"]

    # One merge per alternative implementation; the order (SDPA, then FAv2)
    # fixes the row order of the concatenated result.
    paired_frames = [
        pd.merge(
            eager_df,
            copy_df[copy_df[attention_col] == attention_name],
            on=merge_keys,
            suffixes=["", " other"],
        )
        for attention_name in ("SDPA", "FAv2")
    ]
    attn_df = pd.concat(paired_frames)

    # Round *after* subtracting 100: rounding first and subtracting afterwards
    # reintroduces floating point noise (e.g. 12.340000000000003) that then
    # shows up verbatim in the hover text.
    attn_df["Prefill Speedup (%)"] = (
        (attn_df["Prefill (s)"] / attn_df["Prefill (s) other"]) * 100 - 100
    ).round(2)
    attn_df["Decode Speedup (%)"] = (
        (attn_df["Decode (tokens/s) other"] / attn_df["Decode (tokens/s)"]) * 100
        - 100
    ).round(2)

    return attn_df
def _get_attn_speedup_fig(open_llm_perf_df, phase):
    """Build the box plot of one speedup metric per architecture.

    Args:
        open_llm_perf_df: DataFrame forwarded unchanged to ``get_attn_df``.
        phase: "Prefill" or "Decode"; selects the "<phase> Speedup (%)"
            column and is used in the title and the y-axis label.

    Returns:
        The plotly figure returned by ``px.box`` after the hover template and
        layout have been applied.
    """
    attn_df = get_attn_df(open_llm_perf_df)
    speedup_column = f"{phase} Speedup (%)"

    # plot: one box per architecture, coloured by the attention
    # implementation that is compared against Eager
    fig = px.box(
        attn_df,
        x="Architecture πŸ›οΈ",
        y=speedup_column,
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=ATTN_DATA,
        color="Attention πŸ‘οΈ other",
        points="all",
    )

    # add hover data: one "<b>column:</b> value" line per ATTN_DATA entry,
    # addressed by its position in custom_data
    hover_lines = [
        f"<b>{column}:</b> %{{customdata[{i}]}}"
        for i, column in enumerate(ATTN_DATA)
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    # add layout
    fig.update_layout(
        title={
            "text": f"{phase} Speedup per Architecture, Compared To Eager Attention",
            "xanchor": "center",
            "yanchor": "top",
            "y": 0.95,
            "x": 0.5,
        },
        yaxis_title=speedup_column,
        xaxis_title="LLM Architecture",
        legend_title="Attention",
        width=1200,
        height=600,
    )
    return fig


def get_attn_prefill_fig(open_llm_perf_df):
    """Return the box plot of the prefill speedup compared to Eager attention."""
    return _get_attn_speedup_fig(open_llm_perf_df, "Prefill")


def get_attn_decode_fig(open_llm_perf_df):
    """Return the box plot of the decode speedup compared to Eager attention.

    Unlike the previous version, this no longer prints the number of rows to
    stdout (a leftover debugging statement).
    """
    return _get_attn_speedup_fig(open_llm_perf_df, "Decode")
def create_attn_plots(open_llm_perf_df):
    """Create the gradio components for the attention speedup plots.

    A short HTML hint is created first, then the prefill and decode figures
    are computed and each one is wrapped in a ``gr.components.Plot``.

    Args:
        open_llm_perf_df: DataFrame handed to the two figure builders.

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of plot components.
    """
    # hint shown next to the plots
    gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")

    # figures, prefill first
    figures = (
        get_attn_prefill_fig(open_llm_perf_df),
        get_attn_decode_fig(open_llm_perf_df),
    )

    # wrap each figure in a plot component, keeping the same order
    prefill_plot, decode_plot = (
        gr.components.Plot(value=figure, elem_id="plot", show_label=False)
        for figure in figures
    )
    return prefill_plot, decode_plot