import os
import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub.file_download import hf_hub_download
from src.utils import process_model_name, process_model_arch
from src.assets.css_html_js import custom_css
from src.assets.text_content import (
TITLE,
ABOUT_TEXT,
INTRODUCTION_TEXT,
EXAMPLE_CONFIG_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
)
# access token for the Hugging Face Hub, read from the environment (None when unset)
HF_TOKEN = os.environ.get("HF_TOKEN", None)
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"

# source column name -> label displayed in the leaderboard.
# The insertion order is the column order of the table; ALL_COLUMNS_DATATYPES
# below must hold exactly one datatype per entry, in the same order.
ALL_COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "Arch": "Arch 🏛️",
    "Size": "Params (B) 📏",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Dtype 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # measurements
    "Score": "Open LLM Score (%) ⬆️",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
    "forward.latency(s)": "Prefill Latency (s) ⬇️",
    "generate.latency(s)": "E2E Latency (s) ⬇️",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
    "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
    # tokens per kWh is an efficiency: more tokens for the same energy is better
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬆️",
}

# sort keys (source column names) and their directions, paired by position
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
SORTING_ASCENDING = [False, False]

# one datatype per column of ALL_COLUMNS_MAPPING, in the same order
ALL_COLUMNS_DATATYPES = [
    # open llm
    "markdown",  # Model
    "markdown",  # Arch
    "number",  # Size
    # deployment settings
    "str",  # backend.name
    "str",  # backend.torch_dtype
    "str",  # optimization
    "str",  # quantization
    # measurements
    "number",  # Score
    "number",  # decode.throughput(tokens/s)
    "number",  # generate.throughput(tokens/s)
    "number",  # forward.latency(s)
    "number",  # generate.latency(s)
    "number",  # generate.max_memory_allocated(MB)
    "number",  # generate.max_memory_reserved(MB)
    "number",  # generate.max_memory_used(MB)
    "number",  # generate.energy_consumption(tokens/kWh)
]
# download data
def _load_dataset_csv(filename):
    """Download `filename` from the LLM-Perf dataset repo and read it as a DataFrame."""
    local_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=filename,
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    # read from the path reported by the download itself, so the read location
    # can never drift from where the file was actually stored
    return pd.read_csv(local_path)


OPEN_LLM_DF = _load_dataset_csv("open-llm.csv")

# machine identifier (folder name in the dataset repo) -> hardware label used as tab title
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
# machine identifier -> performance report of that machine
MACHINE_TO_PERF = {
    machine: _load_dataset_csv(f"{machine}/perf-report.csv")
    for machine in MACHINE_TO_HARDWARE
}
def get_benchmark_df(machine="hf-dgx-01"):
    """Build the leaderboard dataframe for one benchmark machine.

    Joins the Open LLM scores with the machine's performance report, derives
    the energy, optimization, quantization and decode-throughput columns,
    sorts by SORTING_COLUMN and keeps only the columns listed in
    ALL_COLUMNS_MAPPING, renamed to their display labels.

    Args:
        machine: key of MACHINE_TO_PERF selecting the performance report.

    Returns:
        A new DataFrame; the module-level dataframes are left untouched.

    Raises:
        KeyError: if `machine` has no entry in MACHINE_TO_PERF.
    """
    # merge on model
    machine_perf_df = MACHINE_TO_PERF[machine].copy()
    merged_df = OPEN_LLM_DF.merge(machine_perf_df, left_on="Model", right_on="model")

    # transpose energy consumption (kWh/token -> tokens/kWh)
    energy_per_token = merged_df["generate.energy_consumption(kWh/token)"]
    # missing or non-positive measurements have no meaningful inverse: mask them
    # so they end up as <NA> instead of relying on a sentinel value or hitting
    # an int conversion of inf
    energy_per_token = energy_per_token.where(energy_per_token > 0)
    # floor to whole tokens; the nullable integer dtype keeps <NA> next to ints
    merged_df["generate.energy_consumption(tokens/kWh)"] = (
        (1 / energy_per_token) // 1
    ).astype("Int64")

    # add optimization column
    # compare against True explicitly: a missing flag is NaN, which is truthy
    # in Python and would otherwise be reported as an enabled optimization
    uses_bettertransformer = merged_df["backend.to_bettertransformer"].eq(True)
    uses_flash_attention_2 = merged_df["backend.use_flash_attention_2"].eq(True)
    merged_df["optimization"] = "None"
    merged_df.loc[uses_flash_attention_2, "optimization"] = "FlashAttentionV2"
    # assigned last so that BetterTransformer takes precedence when both are set
    merged_df.loc[uses_bettertransformer, "optimization"] = "BetterTransformer"

    # add quantization scheme
    scheme = merged_df["backend.quantization_scheme"]
    exllama_version = merged_df["backend.quantization_config.exllama_config.version"]
    is_gptq = scheme == "gptq"
    merged_df["quantization"] = "None"
    merged_df.loc[scheme == "bnb", "quantization"] = "BnB.4bit"
    # GPTQ without a recognised Exllama kernel is still a quantized model
    merged_df.loc[is_gptq, "quantization"] = "GPTQ.4bit"
    merged_df.loc[is_gptq & (exllama_version == 1), "quantization"] = "GPTQ.4bit+ExllamaV1"
    merged_df.loc[is_gptq & (exllama_version == 2), "quantization"] = "GPTQ.4bit+ExllamaV2"

    # add decode throughput
    # NOTE(review): 1000 is presumably the number of generated tokens (the chart
    # labels the latency axis "Per 1000 Tokens Latency") — confirm with the benchmark
    decode_latency = merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"]
    merged_df["decode.throughput(tokens/s)"] = (1000 / decode_latency).round(2)

    # sort by metric
    merged_df = merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING)
    # filter columns, then rename them to their display labels
    return merged_df[list(ALL_COLUMNS_MAPPING)].rename(columns=ALL_COLUMNS_MAPPING)
def get_benchmark_table(bench_df):
    """Prepare a benchmark dataframe for display in the leaderboard table.

    Args:
        bench_df: dataframe with display-labelled columns, as returned by
            get_benchmark_df (optionally filtered; may be empty).

    Returns:
        A copy of `bench_df` whose model and architecture cells went through
        process_model_name / process_model_arch, and whose score is suffixed
        with "**" on rows using a BnB or GPTQ quantization scheme.
    """
    copy_df = bench_df.copy()
    # transform
    copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
    # process quantization
    # prefix match so that kernel variants such as "GPTQ.4bit+ExllamaV2" are
    # flagged as well, not only the bare scheme names
    is_quantized = (
        copy_df["Quantization 🗜️"]
        .astype(str)
        .map(lambda scheme: scheme.startswith(("BnB.4bit", "GPTQ.4bit")))
        .astype(bool)
    )
    scores = copy_df["Open LLM Score (%) ⬆️"]
    # column-wise instead of a row-wise apply, which returns a DataFrame (and
    # breaks the assignment) when the frame has no rows
    copy_df["Open LLM Score (%) ⬆️"] = scores.astype(object).where(
        ~is_quantized, scores.astype(str) + "**"
    )
    return copy_df
def get_benchmark_chart(bench_df):
    """Build the scatter plot of score against end-to-end latency.

    Each point is one row of `bench_df`: x is the E2E latency, y the Open LLM
    score, the marker size follows the allocated memory and the color the
    model architecture. All leaderboard columns are attached as custom data
    and listed in the hover box.

    Args:
        bench_df: dataframe with display-labelled columns, as returned by
            get_benchmark_df (optionally filtered).

    Returns:
        The figure produced by plotly express.
    """
    copy_df = bench_df.copy()
    # transform
    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
    # plot
    fig = px.scatter(
        copy_df,
        y="Open LLM Score (%) ⬆️",
        x="E2E Latency (s) ⬇️",
        size="Allocated Memory (MB) ⬇️",
        color="Arch 🏛️",
        custom_data=list(ALL_COLUMNS_MAPPING.values()),
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_layout(
        title={
            "text": "Latency vs. Score vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )
    # one hover line per leaderboard column; customdata[i] follows the order of
    # the custom_data list passed to px.scatter above
    hover_lines = [
        f"{column}: %{{customdata[{i}]}}"
        for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
    ]
    # hover text is rendered as HTML, so lines are separated with <br>
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    return fig
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    quantizations,
    score,
    memory,
    machine,
):
    """Filter a machine's leaderboard according to the control panel values.

    Args:
        text: search string matched case-insensitively against the model
            name; treated as a regular expression when it is a valid one,
            otherwise as literal text.
        backends: backend names to keep.
        datatypes: load dtypes to keep.
        optimizations: optimization labels to keep.
        quantizations: quantization labels to keep.
        score: minimum Open LLM score (inclusive).
        memory: maximum allocated memory in MB (inclusive).
        machine: key of the machine whose benchmark data is filtered.

    Returns:
        A (table dataframe, chart figure) pair built from the matching rows.
    """
    import re  # local import: only needed to validate the search pattern

    raw_df = get_benchmark_df(machine=machine)

    text = text or ""
    # the search box is free text: something like "(" or "c++" is not a valid
    # pattern and would raise; fall back to a plain substring search then
    try:
        re.compile(text)
        use_regex = True
    except re.error:
        use_regex = False

    # na=False: rows without a model name simply do not match
    matches_text = raw_df["Model 🤗"].str.contains(
        text, case=False, regex=use_regex, na=False
    )
    filtered_df = raw_df[
        matches_text
        & raw_df["Backend 🏭"].isin(backends)
        & raw_df["Dtype 📥"].isin(datatypes)
        & raw_df["Optimization 🛠️"].isin(optimizations)
        & raw_df["Quantization 🗜️"].isin(quantizations)
        & (raw_df["Open LLM Score (%) ⬆️"] >= score)
        & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
    ]
    filtered_table = get_benchmark_table(filtered_df)
    filtered_chart = get_benchmark_chart(filtered_df)
    return filtered_table, filtered_chart
# Demo interface
demo = gr.Blocks(css=custom_css)
with demo:
    # logo (rendered from LOGO_URL)
    gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
    # leaderboard title
    gr.HTML(TITLE)
    # introduction text
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")

    with gr.Tabs(elem_classes="leaderboard-tabs"):
        # per-machine components, looked up again when wiring the filter button
        machine_placeholders = {}
        machine_tables = {}
        machine_plots = {}

        ####################### HARDWARE TABS #######################
        for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
            # dummy placeholder of the machine name
            # (hidden textbox so the name can be passed as an event input)
            machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)

            with gr.TabItem(hardware, id=i):
                with gr.Tabs(elem_classes="machine-tabs"):
                    # placeholder for full dataframe
                    machine_df = get_benchmark_df(machine=machine)

                    with gr.TabItem("Leaderboard 🏅", id=0):
                        gr.HTML(
                            "👉 Scroll to the right 👉 for additional columns.",
                            elem_id="descriptive-text",
                        )
                        # Original leaderboard table
                        machine_tables[machine] = gr.components.Dataframe(
                            value=get_benchmark_table(machine_df),
                            headers=list(ALL_COLUMNS_MAPPING.values()),
                            datatype=ALL_COLUMNS_DATATYPES,
                            elem_id="machine-table",
                        )

                    with gr.TabItem("Plot 📊", id=1):
                        gr.HTML(
                            "👆 Hover over the points 👆 for additional information.",
                            elem_id="descriptive-text",
                        )
                        # Original leaderboard plot
                        machine_plots[machine] = gr.components.Plot(
                            value=get_benchmark_chart(machine_df),
                            elem_id="machine-plot",
                            show_label=False,
                        )

        ###################### CONTROL PANEL #######################
        with gr.TabItem("Control Panel 🎛️", id=2):
            gr.HTML(
                "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
                elem_id="descriptive-text",
            )
            with gr.Row():
                with gr.Column():
                    search_bar = gr.Textbox(
                        label="Model 🤗",
                        info="🔍 Search for a model name",
                        elem_id="search-bar",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    score_slider = gr.Slider(
                        label="Open LLM Score (%) 📈",
                        info="🎚️ Slide to minimum Open LLM score",
                        value=0,
                        elem_id="threshold-slider",
                    )
                with gr.Column(scale=1):
                    memory_slider = gr.Slider(
                        label="Peak Memory (MB) 📈",
                        info="🎚️ Slide to maximum Peak Memory",
                        minimum=0,
                        maximum=80 * 1024,
                        value=80 * 1024,
                        elem_id="memory-slider",
                    )
                with gr.Column(scale=1):
                    backend_checkboxes = gr.CheckboxGroup(
                        label="Backends 🏭",
                        choices=["pytorch", "onnxruntime"],
                        value=["pytorch", "onnxruntime"],
                        info="☑️ Select the backends",
                        elem_id="backend-checkboxes",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    datatype_checkboxes = gr.CheckboxGroup(
                        label="Load Dtypes 📥",
                        choices=["float32", "float16"],
                        value=["float32", "float16"],
                        info="☑️ Select the load dtypes",
                        elem_id="dtype-checkboxes",
                    )
                with gr.Column(scale=1):
                    optimization_checkboxes = gr.CheckboxGroup(
                        label="Optimizations 🛠️",
                        choices=["None", "BetterTransformer", "FlashAttentionV2"],
                        value=["None", "BetterTransformer", "FlashAttentionV2"],
                        info="☑️ Select the optimization",
                        elem_id="optimization-checkboxes",
                    )
                with gr.Column(scale=1):
                    # the choices must spell out every label that get_benchmark_df
                    # can put in the quantization column: filter_query keeps a row
                    # only if its label is among the selected values
                    quantization_checkboxes = gr.CheckboxGroup(
                        label="Quantizations 🗜️",
                        choices=[
                            "None",
                            "BnB.4bit",
                            "GPTQ.4bit",
                            "GPTQ.4bit+ExllamaV1",
                            "GPTQ.4bit+ExllamaV2",
                        ],
                        value=[
                            "None",
                            "BnB.4bit",
                            "GPTQ.4bit",
                            "GPTQ.4bit+ExllamaV1",
                            "GPTQ.4bit+ExllamaV2",
                        ],
                        info="☑️ Select the quantization schemes",
                        elem_id="quantization-checkboxes",
                    )
            with gr.Row():
                filter_button = gr.Button(
                    value="Filter 🚀",
                    elem_id="filter-button",
                )
            # one click handler per machine: the same control values refresh
            # that machine's table and plot
            for machine in MACHINE_TO_HARDWARE:
                filter_button.click(
                    filter_query,
                    [
                        search_bar,
                        backend_checkboxes,
                        datatype_checkboxes,
                        optimization_checkboxes,
                        quantization_checkboxes,
                        score_slider,
                        memory_slider,
                        machine_placeholders[machine],
                    ],
                    [machine_tables[machine], machine_plots[machine]],
                )

        ####################### ABOUT TAB #######################
        with gr.TabItem("About 📖", id=3):
            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
            gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")

    ####################### CITATION #######################
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Launch demo
demo.queue().launch()