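"""Gradio plots of FlashAttentionV2 speedups for an LLM performance leaderboard.

FlashAttentionV2 runs are compared against non-optimized float16 runs of the
same model and quantization scheme, and the resulting prefill latency and
decode throughput speedups are rendered as per-architecture box plots.
"""
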
import gradio as gr
import pandas as pd
import plotly.express as px


FLASHATTENTIONV2_DATA = [
    # open llm
    "Model 🤗",
    "Arch 🏛️",
    "DType 📥",
    "Backend 🏭",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType 📥",
    "Backend 🏭",
    "Optimization 🛠️",
    "Quantization 🗜️",
    "Optimization 🛠️ FlashAttentionV2",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) FlashAttentionV2",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) FlashAttentionV2",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) FlashAttentionV2",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
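# this list doubles as the plotly custom_data selection and as the row order of
# the hovertemplates built below, where each entry expands to a line like
# "<b>Model 🤗:</b> %{customdata[0]}"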


def get_fa2_df(llm_perf_df):
    copy_df = llm_perf_df.copy()
    # separate the original model experiments from the FlashAttentionV2 experiments
    original_df = copy_df[(copy_df["Optimization 🛠️"] == "None") & (copy_df["DType 📥"] == "float16")]
    fa2_df = copy_df[(copy_df["Optimization 🛠️"] == "FlashAttentionV2") & (copy_df["DType 📥"] == "float16")]
    # merge the two dataframes on model and quantization scheme
    fa2_df = pd.merge(
        original_df,
        fa2_df,
        on=["Model 🤗", "Quantization 🗜️"],
        suffixes=["", " FlashAttentionV2"],
    )
    # compute speedups relative to the non-optimized baseline
    fa2_df["Prefill Latency Speedup (%)"] = (
        (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
    ).round(2) - 100
    fa2_df["Decode Throughput Speedup (%)"] = (
        (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
    ).round(2) - 100
    # drop outlier rows with speedups above 1000%
    fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
    fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]

    return fa2_df
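

# A minimal, made-up stand-in for the leaderboard dataframe, illustrating the
# input schema get_fa2_df() assumes: one row per (model, optimization)
# experiment, where the FlashAttentionV2 and non-optimized float16 runs share
# a model and quantization scheme. All names and numbers here are invented.
# For this frame, get_fa2_df() yields a single merged row with
# Prefill Latency Speedup (%) = (1.20 / 0.80) * 100 - 100 = 50.0 and
# Decode Throughput Speedup (%) = (42.0 / 30.0) * 100 - 100 = 40.0.
def _example_llm_perf_df():
    return pd.DataFrame(
        {
            "Model 🤗": ["org/model-7b"] * 2,
            "Arch 🏛️": ["llama"] * 2,
            "DType 📥": ["float16"] * 2,
            "Backend 🏭": ["pytorch"] * 2,
            "Params (B)": [7.0] * 2,
            "Open LLM Score (%)": [55.0] * 2,
            "Optimization 🛠️": ["None", "FlashAttentionV2"],
            "Quantization 🗜️": ["None"] * 2,
            "Prefill Latency (s)": [1.20, 0.80],
            "Decode Throughput (tokens/s)": [30.0, 42.0],
            "E2E Throughput (tokens/s)": [28.0, 39.0],
        }
    )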


def get_fa2_decode_fig(llm_perf_df):
    fa2_df = get_fa2_df(llm_perf_df)
    # plot
    decode_fig = px.box(
        fa2_df,
        x="Arch πŸ›οΈ",
        y="Decode Throughput Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization πŸ—œοΈ",
        points="all",
    )
    # add hover data
    decode_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
        )
    )
    # add layout
    decode_fig.update_layout(
        title={
            "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return decode_fig


def get_fa2_prefill_fig(llm_perf_df):
    fa2_df = get_fa2_df(llm_perf_df)
    # plot
    prefill_fig = px.box(
        fa2_df,
        x="Arch πŸ›οΈ",
        y="Prefill Latency Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization πŸ—œοΈ",
        points="all",
    )
    # add hover data
    prefill_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
        )
    )
    # add layout
    prefill_fig.update_layout(
        title={
            "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return prefill_fig


def create_fa2_plots(llm_perf_df):
    # descriptive text
    gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
    # get figures
    prefill_fig = get_fa2_prefill_fig(llm_perf_df)
    decode_fig = get_fa2_decode_fig(llm_perf_df)

    # create plots
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)

    return prefill_plot, decode_plot
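

# A minimal sketch of how these components might be mounted for local testing,
# assuming no other app wraps them; the real leaderboard presumably calls
# create_fa2_plots() from a larger Blocks layout with actual benchmark data,
# so the invented _example_llm_perf_df() above is only a stand-in.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        create_fa2_plots(_example_llm_perf_df())
    demo.launch()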