Spaces:
Running
Running
File size: 4,477 Bytes
ab5f5f1 dc685a9 ab5f5f1 dc685a9 ab5f5f1 dc685a9 ab5f5f1 dc685a9 ab5f5f1 a1135a9 ab5f5f1 a1135a9 ab5f5f1 dc685a9 ab5f5f1 dc685a9 ab5f5f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import gradio as gr
import pandas as pd
import plotly.express as px
# Columns attached to every plotted point as plotly ``customdata``. The figure
# builders in this module enumerate this list to generate the hover template,
# producing one "<b>name:</b> value" line per entry, in this order.
# Each column is listed exactly once: a repeated name would simply show up as
# a repeated line in the tooltip.
FLASHATTENTIONV2_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π",
    "Optimization π οΈ",
    "Quantization ποΈ",
    "Optimization π οΈ FlashAttentionV2",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) FlashAttentionV2",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) FlashAttentionV2",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) FlashAttentionV2",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
def get_fa2_df(llm_perf_df):
    """Pair float16 baseline runs with their FlashAttentionV2 runs and compute speedups.

    The input frame is copied, then split into rows whose optimization is
    "None" and rows whose optimization is "FlashAttentionV2" (float16 only in
    both cases). The two subsets are merged on model and quantization; columns
    coming from the FlashAttentionV2 side that clash with baseline columns get
    the " FlashAttentionV2" suffix.

    Two columns are added to the merged frame, both rounded to 2 decimals:

    * "Prefill Latency Speedup (%)": baseline latency / FA2 latency, as a
      percentage gain (0 means no change).
    * "Decode Throughput Speedup (%)": FA2 throughput / baseline throughput,
      as a percentage gain.

    Rows whose speedup in either metric is not strictly below 1000 % are
    dropped; this also removes NaN and infinite ratios, since those never
    compare as ``< 1000``.

    Args:
        llm_perf_df: DataFrame holding the benchmark results; it is not
            modified.

    Returns:
        The merged DataFrame with the two speedup columns added.
    """
    copy_df = llm_perf_df.copy()

    # separate original model experiments from FlashAttentionV2 experiments
    is_float16 = copy_df["DType π₯"] == "float16"
    original_df = copy_df[(copy_df["Optimization π οΈ"] == "None") & is_float16]
    optimized_df = copy_df[(copy_df["Optimization π οΈ"] == "FlashAttentionV2") & is_float16]

    # merge the two dataframes (default inner join: unmatched rows are dropped)
    fa2_df = pd.merge(
        original_df,
        optimized_df,
        on=["Model π€", "Quantization ποΈ"],
        suffixes=("", " FlashAttentionV2"),
    )

    # compute speedups; round LAST so the stored value really has 2 decimals
    # (rounding before subtracting 100 leaves float noise such as
    # 23.450000000000003, which is shown verbatim in the hover text)
    fa2_df["Prefill Latency Speedup (%)"] = (
        (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100 - 100
    ).round(2)
    fa2_df["Decode Throughput Speedup (%)"] = (
        (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100 - 100
    ).round(2)

    # filter speedups >= 1000% (treated as measurement outliers)
    plausible = (fa2_df["Prefill Latency Speedup (%)"] < 1000) & (fa2_df["Decode Throughput Speedup (%)"] < 1000)
    return fa2_df[plausible]
def get_fa2_decode_fig(llm_perf_df):
    """Return a box plot of the decode throughput speedup, grouped by architecture.

    Boxes are colored by quantization scheme and every individual point is
    drawn. The columns named in ``FLASHATTENTIONV2_DATA`` are attached to
    each point and listed in its hover tooltip.
    """
    speedup_df = get_fa2_df(llm_perf_df)

    # box plot with all points shown
    figure = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization ποΈ",
        points="all",
        custom_data=FLASHATTENTIONV2_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    # hover tooltip: one "<b>column:</b> value" line per custom-data column
    tooltip = "<br>".join(
        "<b>{}:</b> %{{customdata[{}]}}".format(name, position)
        for position, name in enumerate(FLASHATTENTIONV2_DATA)
    )
    figure.update_traces(hovertemplate=tooltip)

    # centered title, axis/legend captions and fixed canvas size
    centered_title = dict(
        text="Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
        x=0.5,
        y=0.95,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
def get_fa2_prefill_fig(llm_perf_df):
    """Return a box plot of the prefill latency speedup, grouped by architecture.

    Boxes are colored by quantization scheme and every individual point is
    drawn. The columns named in ``FLASHATTENTIONV2_DATA`` are attached to
    each point and listed in its hover tooltip.
    """
    speedup_df = get_fa2_df(llm_perf_df)

    # box plot with all points shown
    figure = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color="Quantization ποΈ",
        points="all",
        custom_data=FLASHATTENTIONV2_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    # hover tooltip: one "<b>column:</b> value" line per custom-data column
    tooltip = "<br>".join(
        "<b>{}:</b> %{{customdata[{}]}}".format(name, position)
        for position, name in enumerate(FLASHATTENTIONV2_DATA)
    )
    figure.update_traces(hovertemplate=tooltip)

    # centered title, axis/legend captions and fixed canvas size
    centered_title = dict(
        text="Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
        x=0.5,
        y=0.95,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
def create_fa2_plots(llm_perf_df):
    """Create the FlashAttentionV2 section: a hint text followed by two plots.

    Emits an HTML hint, builds the prefill and decode speedup figures from
    ``llm_perf_df`` and wraps each in a gradio ``Plot`` component.

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of gradio ``Plot`` components.
    """
    # hint shown above the plots
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")

    # build both figures first, prefill before decode
    figures = [
        get_fa2_prefill_fig(llm_perf_df),
        get_fa2_decode_fig(llm_perf_df),
    ]

    # wrap each figure in a Plot component, in the same order
    prefill_plot, decode_plot = [
        gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures
    ]
    return prefill_plot, decode_plot
|