# NOTE: the following lines are file-viewer page residue, not part of the program.
# BenchmarkBot's picture
# new runs
# ec9f1c7
# raw
# history blame
# No virus
# 11.4 kB
import os
import gradio as gr
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler
from src.assets.css_html_js import custom_css, custom_js
from src.assets.text_content import (
TITLE,
INTRODUCTION_TEXT,
ABOUT_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
)
from src.utils import (
change_tab,
restart_space,
load_dataset_repo,
process_model_name,
process_model_type,
process_weight_class,
)
# Repository identifiers: the first is handed to restart_space by the hourly job,
# the second to load_dataset_repo just below.
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
# Token read from the environment; None when OPTIMUM_TOKEN is not set.
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
# Raw report column -> header displayed in the leaderboard table.
# The key order defines the column order of the table.
# NOTE(review): the emoji in these labels look mis-encoded (mojibake) in this copy
# of the file. They are left untouched because get_benchmark_table refers to the
# exact same literal strings.
ALL_COLUMNS_MAPPING = {
    "model_type": "Type πŸ€—",
    "weight_class": "Class πŸ‹οΈ",
    # backend configuration
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Dtype πŸ“₯",
    "optimizations": "Optimizations πŸ› οΈ",
    # measurements
    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
    "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
    # quality score
    "best_scored_model": "Best Scored Model πŸ†",
    "best_score": "Best Score (%) ⬆️",
}
# Datatype of each displayed column, positionally aligned with ALL_COLUMNS_MAPPING
# (one entry per key, same order); passed as `datatype` to the Dataframe component.
ALL_COLUMNS_DATATYPES = [
    "str",
    "str",
    # backend configuration
    "str",
    "str",
    "str",
    # measurements
    "number",
    "number",
    # quality score
    "markdown",
    "number",
]
# Column(s) the leaderboard is sorted by (ascending) in get_benchmark_table.
SORTING_COLUMN = ["tradeoff"]
# Handle returned by load_dataset_repo; get_benchmark_df checks its truthiness
# before calling git_pull() on it, so it may be falsy.
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
# Boolean report columns and the optimization label each one contributes.
_OPTIMIZATION_LABELS = {
    "backend.bettertransformer": "BetterTransformer",
    "backend.load_in_8bit": "LLM.int8",
    "backend.load_in_4bit": "LLM.fp4",
}


def get_benchmark_df(benchmark="1xA100-80GB"):
    """Load a benchmark report, merge it with the score report and derive columns.

    Args:
        benchmark: Name of the report; ``./llm-perf-dataset/reports/{benchmark}.csv``
            is read.

    Returns:
        The merged DataFrame with three derived columns:

        * ``optimizations``: comma-separated labels of the enabled backend
          optimizations, or ``"None"`` when none is enabled.
        * ``best_score``: left as loaded, except that rows using 8-bit or 4-bit
          loading are blanked to NaN (quantized models are not evaluated).
        * ``tradeoff``: Euclidean distance built from ``100 - best_score`` and
          ``generate.latency(s)``, rounded to 2 decimals; NaN where the score is
          missing.
    """
    # Refresh the local clone when a repo handle is available.
    if llm_perf_dataset_repo:
        llm_perf_dataset_repo.git_pull()

    # load and merge
    bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
    scores_df = pd.read_csv(
        "./llm-perf-dataset/reports/Weighted+Classed-Open-LLM-Leaderboard.csv"
    )
    # The text after "_1_1000_" in the experiment name is matched against
    # "<weight_class>_<model_type>" from the score report.
    bench_df["merge_id"] = bench_df.experiment_name.str.split("_1_1000_").str[-1]
    scores_df["merge_id"] = scores_df.weight_class + "_" + scores_df.model_type
    merged_df = bench_df.merge(scores_df, on="merge_id")

    # add optimizations
    # `.eq(True)` keeps the original `== True` semantics: missing or non-boolean
    # values count as "not enabled".
    flags = merged_df[list(_OPTIMIZATION_LABELS)].eq(True)
    labels = list(_OPTIMIZATION_LABELS.values())
    merged_df["optimizations"] = [
        ", ".join(label for label, enabled in zip(labels, row) if enabled) or "None"
        for row in flags.itertuples(index=False, name=None)
    ]

    # remove score for quantized models
    # The column must stay numeric: writing a placeholder string here makes the
    # arithmetic below (and later numeric comparisons) raise TypeError.
    quantized = flags["backend.load_in_8bit"] | flags["backend.load_in_4bit"]
    merged_df["best_score"] = merged_df["best_score"].where(~quantized)

    # create composite score
    score_distance = 100 - merged_df["best_score"]
    # latency is used as-is (seconds); no rescaling is applied
    latency_distance = merged_df["generate.latency(s)"]
    merged_df["tradeoff"] = (score_distance**2 + latency_distance**2) ** 0.5
    merged_df["tradeoff"] = merged_df["tradeoff"].round(2)

    return merged_df
def get_benchmark_table(bench_df):
    """Build the leaderboard table shown in the UI from a benchmark frame.

    Args:
        bench_df: Frame containing every key of ``ALL_COLUMNS_MAPPING`` plus the
            column(s) named in ``SORTING_COLUMN``. It is sorted **in place**, so
            later consumers of the same frame see the sorted row order.

    Returns:
        A new DataFrame restricted to the mapped columns, with model type, weight
        class and model name passed through their ``process_*`` formatters and the
        columns renamed to their display headers.
    """
    # sort (deliberately in place, see docstring)
    bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
    # filter: explicit copy, so the edits below never act on a slice of the
    # caller's frame (avoids chained-assignment warnings)
    table_df = bench_df[list(ALL_COLUMNS_MAPPING)].copy()
    # transform while the columns still carry their raw names, so the display
    # labels live in ALL_COLUMNS_MAPPING only
    table_df["model_type"] = table_df["model_type"].apply(process_model_type)
    table_df["weight_class"] = table_df["weight_class"].apply(process_weight_class)
    table_df["best_scored_model"] = table_df["best_scored_model"].apply(
        process_model_name
    )
    # rename
    return table_df.rename(columns=ALL_COLUMNS_MAPPING)
def get_benchmark_plot(bench_df):
    """Build the score / latency / memory scatter plot.

    Rows whose ``generate.latency(s)`` is 150 or more are left out. Each remaining
    row becomes one point: latency on x, ``best_score`` on y, colored by
    ``model_type`` and sized by ``forward.peak_memory(MB)``.

    Args:
        bench_df: Benchmark frame holding the columns referenced below.

    Returns:
        The configured plotly figure.
    """
    # until falcon gets fixed / natively supported
    plotted_df = bench_df[bench_df["generate.latency(s)"] < 150]

    # Columns exposed to the hover template; the indices in `hover_lines` below
    # refer to positions in this list.
    hover_columns = [
        "best_scored_model",
        "backend.name",
        "backend.torch_dtype",
        "optimizations",
        "forward.peak_memory(MB)",
        "generate.throughput(tokens/s)",
    ]
    hover_lines = [
        "Model: %{customdata[0]}",
        "Backend: %{customdata[1]}",
        "Load Datatype: %{customdata[2]}",
        "Optimizations: %{customdata[3]}",
        "Peak Memory (MB): %{customdata[4]}",
        "Throughput (tokens/s): %{customdata[5]}",
        "Per 1000 Tokens Latency (s): %{x}",
        "Open LLM Score (%): %{y}",
    ]
    # Title centered horizontally, anchored near the top of the figure.
    centered_title = dict(
        text="Model Score vs. Latency vs. Memory",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )

    fig = px.scatter(
        plotted_df,
        x="generate.latency(s)",
        y="best_score",
        color="model_type",
        size="forward.peak_memory(MB)",
        custom_data=hover_columns,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_layout(
        title=centered_title,
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="Model Type",
        width=1200,
        height=600,
    )
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    return fig
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    score,
    memory,
    benchmark="1xA100-80GB",
):
    """Filter the benchmark by the control-panel values and rebuild table and plot.

    Args:
        text: Free-text search, matched as a case-insensitive literal substring of
            ``best_scored_model``. Empty or ``None`` matches every model.
        backends: Accepted values of ``backend.name``.
        datatypes: Accepted values of ``backend.torch_dtype``.
        optimizations: Optimization labels; a row passes when its
            ``optimizations`` text contains at least one of them. An empty
            selection disables this filter.
        score: Minimum ``best_score``. Rows without a numeric score pass only when
            this minimum is 0 or less.
        memory: Maximum ``forward.peak_memory(MB)``.
        benchmark: Name of the report to load.

    Returns:
        ``(table, plot)`` built from the rows that pass every filter.
    """
    raw_df = get_benchmark_df(benchmark=benchmark)

    # The search box holds user-typed text: match it literally so characters such
    # as "(" or "+" are not parsed as a regular expression (which would raise).
    needle = (text or "").lower()
    mask = (
        raw_df["best_scored_model"]
        .str.lower()
        .str.contains(needle, regex=False, na=False)
    )
    mask &= raw_df["backend.name"].isin(backends)
    mask &= raw_df["backend.torch_dtype"].isin(datatypes)

    if optimizations:
        applied = raw_df["optimizations"]
        matches_any = pd.Series(False, index=raw_df.index)
        for optimization in optimizations:
            # literal match: labels such as "LLM.int8" contain a regex wildcard
            matches_any |= applied.str.contains(optimization, regex=False, na=False)
        mask &= matches_any

    # Coerce so that a missing or non-numeric score cannot break the comparison.
    scores = pd.to_numeric(raw_df["best_score"], errors="coerce")
    score_ok = scores >= score
    if score <= 0:
        # A minimum of 0 means "no score requirement": keep unscored rows too.
        score_ok |= scores.isna()
    mask &= score_ok
    mask &= raw_df["forward.peak_memory(MB)"] <= memory

    # Independent copy: get_benchmark_table sorts the frame it receives in place.
    filtered_df = raw_df[mask].copy()

    filtered_table = get_benchmark_table(filtered_df)
    filtered_plot = get_benchmark_plot(filtered_df)
    return filtered_table, filtered_plot
# Dataframes
# Built once at import time; these are the initial (unfiltered) table and plot
# handed to the UI components below.
A100_df = get_benchmark_df(benchmark="1xA100-80GB")
# Order matters: get_benchmark_table sorts A100_df in place, so the plot on the
# next line is built from the already-sorted frame.
A100_table = get_benchmark_table(A100_df)
A100_plot = get_benchmark_plot(A100_df)
# Demo interface
# NOTE(review): indentation was lost in this copy of the file; the nesting below
# is reconstructed from the component order — confirm against the deployed layout.
demo = gr.Blocks(css=custom_css)
with demo:
    # leaderboard title
    gr.HTML(TITLE)
    # introduction text
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")
    # leaderboard tabs
    with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
        # Tab 0: leaderboard table
        with gr.TabItem("πŸ–₯️ A100-80GB Benchmark πŸ†", id=0):
            gr.HTML(
                "πŸ‘‰ Scroll to the right πŸ‘‰ for more columns.", elem_id="descriptive-text"
            )
            # Original leaderboard table
            A100_leaderboard = gr.components.Dataframe(
                value=A100_table,
                datatype=ALL_COLUMNS_DATATYPES,
                headers=list(ALL_COLUMNS_MAPPING.values()),
                elem_id="1xA100-table",
            )
        # Tab 1: scatter plot
        with gr.TabItem("πŸ–₯️ A100-80GB Plot πŸ“Š", id=1):
            gr.HTML(
                "πŸ‘† Hover over the points πŸ‘† for additional information.",
                elem_id="descriptive-text",
            )
            # Original leaderboard plot
            A100_plotly = gr.components.Plot(
                value=A100_plot,
                elem_id="1xA100-plot",
                show_label=False,
            )
        # Tab 2: filter controls; their values are the inputs of filter_query
        with gr.TabItem("Control Panel πŸŽ›οΈ", id=2):
            gr.HTML(
                "Use this control panel to filter the leaderboard's table and plot.",
                elem_id="descriptive-text",
            )
            # control panel interface
            # first row: model search, minimum score, maximum peak memory
            with gr.Row():
                with gr.Column(scale=1):
                    search_bar = gr.Textbox(
                        label="Model πŸ€—",
                        info="πŸ” Search for a model name",
                        elem_id="search-bar",
                    )
                with gr.Column(scale=1):
                    with gr.Box():
                        score_slider = gr.Slider(
                            label="Open LLM Score πŸ“ˆ",
                            info="🎚️ Slide to minimum Open LLM score",
                            value=0,
                            elem_id="threshold-slider",
                        )
                with gr.Column(scale=1):
                    with gr.Box():
                        # upper bound and default are 80 * 1024 MB
                        memory_slider = gr.Slider(
                            label="Peak Memory (MB) πŸ“ˆ",
                            info="🎚️ Slide to maximum Peak Memory",
                            minimum=0,
                            maximum=80 * 1024,
                            value=80 * 1024,
                            elem_id="memory-slider",
                        )
            # second row: categorical filters, every choice selected by default
            with gr.Row():
                with gr.Column(scale=1):
                    backend_checkboxes = gr.CheckboxGroup(
                        label="Backends 🏭",
                        choices=["pytorch", "onnxruntime"],
                        value=["pytorch", "onnxruntime"],
                        info="β˜‘οΈ Select the backends",
                        elem_id="backend-checkboxes",
                    )
                with gr.Column(scale=1):
                    datatype_checkboxes = gr.CheckboxGroup(
                        label="Dtypes πŸ“₯",
                        choices=["float32", "float16"],
                        value=["float32", "float16"],
                        info="β˜‘οΈ Select the load dtypes",
                        elem_id="dtype-checkboxes",
                    )
                with gr.Column(scale=2):
                    # choices mirror the labels written to the "optimizations"
                    # column by get_benchmark_df
                    optimizations_checkboxes = gr.CheckboxGroup(
                        label="Optimizations πŸ› οΈ",
                        choices=["None", "BetterTransformer", "LLM.int8", "LLM.fp4"],
                        value=["None", "BetterTransformer", "LLM.int8", "LLM.fp4"],
                        info="β˜‘οΈ Select the optimizations",
                        elem_id="optimizations-checkboxes",
                    )
            # third row: button that triggers the filtering
            with gr.Row():
                filter_button = gr.Button(
                    value="Filter πŸš€",
                    elem_id="filter-button",
                )
        # Tab 3: about text
        with gr.TabItem("About πŸ“–", id=3):
            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
    # On page load, call change_tab with the Tabs component as input; custom_js
    # is supplied as the accompanying client-side script.
    demo.load(
        change_tab,
        A100_tabs,
        _js=custom_js,
    )
    # Input order matches filter_query's positional parameters (text, backends,
    # datatypes, optimizations, score, memory); the returned (table, plot) pair
    # replaces the leaderboard table and the plot.
    filter_button.click(
        filter_query,
        [
            search_bar,
            backend_checkboxes,
            datatype_checkboxes,
            optimizations_checkboxes,
            score_slider,
            memory_slider,
        ],
        [A100_leaderboard, A100_plotly],
    )
    # Collapsed accordion holding the citation text with a copy button
    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)
# Restart space every hour (3600 seconds) through a background scheduler
scheduler = BackgroundScheduler()
scheduler.add_job(
    func=restart_space,
    trigger="interval",
    args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
    seconds=3600,
)
scheduler.start()

# Launch demo: enable the request queue first, then start serving
queued_demo = demo.queue(concurrency_count=40)
queued_demo.launch()