Spaces:

optimum
/

llm-perf-leaderboard

Running

File size: 4,477 Bytes

import gradio as gr
import pandas as pd
import plotly.express as px


# Columns attached to every plotted point as plotly ``custom_data``. The hover
# template is built by enumerating this list, so each entry yields exactly one
# hover line, in this order. Entries must be unique: a repeated name would
# simply repeat the same hover line.
FLASHATTENTIONV2_DATA = [
    # open llm
    "Model 🤗",
    "Arch 🏛️",
    "DType 📥",
    "Backend 🏭",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings ("DType 📥" and "Backend 🏭" are already listed above)
    "Optimization 🛠️",
    "Quantization 🗜️",
    "Optimization 🛠️ FlashAttentionV2",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) FlashAttentionV2",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) FlashAttentionV2",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) FlashAttentionV2",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]


def get_fa2_df(llm_perf_df):
    """Pair float16 baseline runs with their FlashAttentionV2 runs and compute speedups.

    Rows whose "Optimization 🛠️" equals the string "None" are the baseline and
    rows whose value is "FlashAttentionV2" are the optimized runs; in both cases
    only rows with "DType 📥" equal to "float16" are kept. The two groups are
    inner-joined on "Model 🤗" and "Quantization 🗜️". Overlapping baseline columns
    keep their names, the FlashAttentionV2 ones get a " FlashAttentionV2" suffix.

    Args:
        llm_perf_df: DataFrame containing at least the columns "Model 🤗",
            "Quantization 🗜️", "Optimization 🛠️", "DType 📥",
            "Prefill Latency (s)" and "Decode Throughput (tokens/s)".

    Returns:
        A new DataFrame (the input is not modified) with two extra columns,
        "Prefill Latency Speedup (%)" and "Decode Throughput Speedup (%)",
        rounded to 2 decimals. Rows where either speedup is not strictly below
        1000 (including NaN and infinite values) are dropped.
    """
    is_float16 = llm_perf_df["DType 📥"] == "float16"
    optimization = llm_perf_df["Optimization 🛠️"]
    # separate original model experiments from FlashAttentionV2 experiments
    # (boolean indexing already yields new frames, so no upfront copy is needed)
    original_df = llm_perf_df[(optimization == "None") & is_float16]
    optimized_df = llm_perf_df[(optimization == "FlashAttentionV2") & is_float16]
    # merge the two dataframes
    fa2_df = pd.merge(
        original_df,
        optimized_df,
        on=["Model 🤗", "Quantization 🗜️"],
        suffixes=("", " FlashAttentionV2"),
    )
    # compute speedups; round LAST so the stored value really has 2 decimals
    # (rounding before subtracting 100 reintroduces float noise such as
    # 23.459999999999994 instead of 23.46)
    fa2_df["Prefill Latency Speedup (%)"] = (
        fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"] * 100 - 100
    ).round(2)
    fa2_df["Decode Throughput Speedup (%)"] = (
        fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"] * 100 - 100
    ).round(2)
    # keep only speedups below 1000%; NaN/inf compare False and are dropped too
    fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
    fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]

    return fa2_df


def _get_fa2_speedup_fig(llm_perf_df, speedup_column, title_text, yaxis_title):
    """Build a per-architecture box plot of one FlashAttentionV2 speedup column.

    Shared implementation behind ``get_fa2_decode_fig`` and
    ``get_fa2_prefill_fig``, which differ only in the plotted column and labels.

    Args:
        llm_perf_df: raw performance DataFrame, passed through ``get_fa2_df``.
        speedup_column: name of the speedup column to put on the y axis.
        title_text: figure title.
        yaxis_title: y axis label.

    Returns:
        The plotly figure produced by ``px.box`` with hover template and
        layout applied.
    """
    fa2_df = get_fa2_df(llm_perf_df)
    # plot: one box per architecture, colored by quantization scheme
    fig = px.box(
        fa2_df,
        x="Arch 🏛️",
        y=speedup_column,
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization 🗜️",
        points="all",
    )
    # add hover data: one "<b>column:</b> value" line per custom_data column,
    # where index i refers to position i in FLASHATTENTIONV2_DATA
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
        )
    )
    # add layout
    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title=yaxis_title,
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return fig


def get_fa2_decode_fig(llm_perf_df):
    """Return the box plot of "Decode Throughput Speedup (%)" per architecture."""
    return _get_fa2_speedup_fig(
        llm_perf_df,
        speedup_column="Decode Throughput Speedup (%)",
        title_text="Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
        yaxis_title="Decode Speedup (%)",
    )


def get_fa2_prefill_fig(llm_perf_df):
    """Return the box plot of "Prefill Latency Speedup (%)" per architecture."""
    return _get_fa2_speedup_fig(
        llm_perf_df,
        speedup_column="Prefill Latency Speedup (%)",
        title_text="Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
        yaxis_title="Prefill Speedup (%)",
    )


def create_fa2_plots(llm_perf_df):
    """Create the hover hint and the two FlashAttentionV2 speedup plot components.

    Instantiates, in order, a ``gr.HTML`` hint, a ``Plot`` for the prefill
    figure and a ``Plot`` for the decode figure.
    NOTE(review): presumably called inside an active gradio layout context so
    the components get rendered — confirm against the caller.

    Args:
        llm_perf_df: performance DataFrame forwarded to the figure builders.

    Returns:
        Tuple ``(prefill_plot, decode_plot)`` of the two Plot components.
    """
    # hint shown above the plots
    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")

    # build both figures before instantiating any plot component
    figures = (
        get_fa2_prefill_fig(llm_perf_df),
        get_fa2_decode_fig(llm_perf_df),
    )

    # both components share the same element id and hide their label
    shared_options = {"elem_id": "plot", "show_label": False}
    prefill_component = gr.components.Plot(value=figures[0], **shared_options)
    decode_component = gr.components.Plot(value=figures[1], **shared_options)

    return prefill_component, decode_component