# Source: llm-perf-leaderboard / src/bettertransformer.py
# Last change: commit ab5f5f1 ("update") by IlyasMoutawwakil -- file size 4.34 kB
import gradio as gr
import pandas as pd
import plotly.express as px
from src.utils import process_arch
# Columns attached to every plotted point as Plotly ``custom_data`` and listed,
# in this exact order, by the hover templates built from this list.
# Each column must appear only once: a repeated entry is rendered as a
# repeated line in the hover tooltip.
# Measurement columns without a suffix come from the baseline side of the merge
# done in ``get_bt_df``; the " BetterTransformer" suffix marks the other side.
BETTERTRANSFORMER_DATA = [
    # open llm
    "Model πŸ€—",
    "Arch πŸ›οΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType πŸ“₯",
    "Backend 🏭",
    "Quantization πŸ—œοΈ",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) BetterTransformer",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) BetterTransformer",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) BetterTransformer",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
def get_bt_df(llm_perf_df):
    """Pair baseline runs with BetterTransformer runs and compute speedups.

    The input frame is copied, so the caller's frame is not modified. Rows
    whose optimization is "None" are joined with rows whose optimization is
    "BetterTransformer"; overlapping columns of the latter get the
    " BetterTransformer" suffix. Two ratio columns (scaled by 100 and rounded
    to two decimals) are added, and only rows where both ratios are below
    1000 are returned.

    NOTE(review): the join uses only the model and quantization columns, so a
    model with several baseline or BetterTransformer rows yields every
    pairing of them -- confirm this is intended for the incoming data.
    """
    perf = llm_perf_df.copy()

    # normalise the architecture labels
    perf["Arch πŸ›οΈ"] = perf["Arch πŸ›οΈ"].apply(process_arch)

    # split baseline experiments from BetterTransformer experiments
    optimization = perf["Optimization πŸ› οΈ"]
    baseline_runs = perf[optimization == "None"]
    bt_runs = perf[optimization == "BetterTransformer"]

    # one row per (baseline, BetterTransformer) pairing
    paired = pd.merge(
        baseline_runs,
        bt_runs,
        on=["Model πŸ€—", "Quantization πŸ—œοΈ"],
        suffixes=["", " BetterTransformer"],
    )

    # latency: baseline / optimized; throughput: optimized / baseline
    prefill_ratio = paired["Prefill Latency (s)"] / paired["Prefill Latency (s) BetterTransformer"]
    decode_ratio = (
        paired["Decode Throughput (tokens/s) BetterTransformer"] / paired["Decode Throughput (tokens/s)"]
    )
    paired["Prefill Latency Speedup (%)"] = (prefill_ratio * 100).round(2)
    paired["Decode Throughput Speedup (%)"] = (decode_ratio * 100).round(2)

    # keep only rows whose two speedups are both below 1000%
    plausible = (paired["Prefill Latency Speedup (%)"] < 1000) & (
        paired["Decode Throughput Speedup (%)"] < 1000
    )
    return paired[plausible]
def _make_bt_speedup_fig(llm_perf_df, speedup_column, title_text, yaxis_title):
    """Build the per-architecture box plot for one speedup column.

    Shared implementation of ``get_bt_decode_fig`` and ``get_bt_prefill_fig``,
    which differ only in the plotted column and in their labels.

    Args:
        llm_perf_df: performance frame, passed unchanged to ``get_bt_df``.
        speedup_column: name of the column plotted on the y axis.
        title_text: figure title.
        yaxis_title: y-axis label.

    Returns:
        The Plotly figure produced by ``px.box``.
    """
    bt_df = get_bt_df(llm_perf_df)

    # one box per architecture, coloured by quantization, all points drawn
    fig = px.box(
        bt_df,
        x="Arch πŸ›οΈ",
        y=speedup_column,
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=BETTERTRANSFORMER_DATA,
        color="Quantization πŸ—œοΈ",
        points="all",
    )

    # hover text: one "<b>column:</b> value" line per custom_data column; the
    # index i refers to the column's position in BETTERTRANSFORMER_DATA
    fig.update_traces(
        hovertemplate="<br>".join(
            f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)
        )
    )

    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title=yaxis_title,
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return fig


def get_bt_decode_fig(llm_perf_df):
    """Return the box plot of decode throughput speedup per architecture."""
    return _make_bt_speedup_fig(
        llm_perf_df,
        speedup_column="Decode Throughput Speedup (%)",
        title_text="Decode Throughput Speedup per Architecture",
        yaxis_title="Decode Speedup (%)",
    )


def get_bt_prefill_fig(llm_perf_df):
    """Return the box plot of prefill latency speedup per architecture."""
    return _make_bt_speedup_fig(
        llm_perf_df,
        speedup_column="Prefill Latency Speedup (%)",
        title_text="Prefill Latency Speedup per Architecture",
        yaxis_title="Prefill Speedup (%)",
    )
def create_bt_plots(llm_perf_df):
    """Create the hover hint and the two BetterTransformer plot components.

    Instantiates an HTML hint, builds the prefill and decode figures from
    ``llm_perf_df``, wraps each in a Gradio ``Plot`` and returns them as
    ``(prefill_plot, decode_plot)``.

    NOTE(review): presumably called while a Gradio layout context is open so
    the components are attached to it -- confirm against the caller.
    """
    # hint shown with the plots
    gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")

    # both figures are built before either plot component is created
    figures = [make_fig(llm_perf_df) for make_fig in (get_bt_prefill_fig, get_bt_decode_fig)]

    # wrap the figures, prefill first, then decode
    plots = [gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures]
    return tuple(plots)