Commit a894537 — BenchmarkBot committed: experiments by model type and weight class
Parent: 5608bf7

Files changed:
- app.py (+46, -46)
- src/assets/css_html_js.py (+2, -2)
- src/assets/text_content.py (+6, -8)
- src/utils.py (+27, -42)
app.py
CHANGED
@@ -4,11 +4,12 @@ import pandas as pd
 import plotly.express as px
 from apscheduler.schedulers.background import BackgroundScheduler
 
+from src.assets.css_html_js import custom_css, custom_js
 from src.assets.text_content import (
     TITLE,
     INTRODUCTION_TEXT,
     A100_TEXT,
-    …
+    ABOUT_TEXT,
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
 )
@@ -16,45 +17,49 @@ from src.utils import (
     change_tab,
     restart_space,
     load_dataset_repo,
-    …
-    …
-    …
+    process_model_name,
+    process_model_type,
+    process_weight_class,
 )
-from src.assets.css_html_js import custom_css, custom_js
 
 
 LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
-…
-…
+
+ALL_COLUMNS_MAPPING = {
+    "model_type": "Model Type 🤗",
+    "weight_class": "Weight Class 🏋️",
+    "best_scored_model": "Best Scored Model 🏆",
+    #
     "backend.name": "Backend 🏭",
-    "backend.torch_dtype": "…
+    "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     #
-    "tradeoff": "Tradeoff* ⬇️",
+    # "tradeoff": "Tradeoff* ⬇️",
     #
-    "score": "Open LLM Score ⬆️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
-    "…
+    "best_score": "Score (%) ⬆️",
+    #
 }
-…
+ALL_COLUMNS_DATATYPES = [
+    "str",
+    "str",
     "markdown",
+    #
     "str",
     "str",
     "str",
     #
-    "number",
+    # "number",
     #
     "number",
     "number",
     "number",
-    "number",
 ]
-SORTING_COLUMN = ["…
-…
+SORTING_COLUMN = ["Score (%) ⬆️"]
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
 
@@ -65,17 +70,10 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
 
     # load and merge
     bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
-    scores_df = pd.read_csv(
-        …
-    …
-    …
-    bench_df = bench_df[bench_df["score"].notna()]
-    …
-    # create composite score
-    score_distance = 100 - bench_df["score"]
-    latency_distance = bench_df["generate.latency(s)"]
-    bench_df["tradeoff"] = (score_distance**2 + latency_distance**2) ** 0.5
-    bench_df["tradeoff"] = bench_df["tradeoff"].round(2)
+    scores_df = pd.read_csv(
+        f"./llm-perf-dataset/reports/Grouped-Open-LLM-Leaderboard.csv"
+    )
+    bench_df = bench_df.merge(scores_df, left_on="model", right_on="best_scored_model")
 
     # add optimizations
     bench_df["optimizations"] = bench_df[
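Note: the hunk above replaces the old per-model "tradeoff" computation with a join against the grouped Open LLM Leaderboard scores. A minimal sketch of that join, with invented rows; only the column names are taken from the diff, the real data lives in the llm-perf-dataset CSVs:

```python
import pandas as pd

# toy stand-ins for the two CSVs read in get_benchmark_df
bench_df = pd.DataFrame({
    "model": ["meta-llama/Llama-2-7b-hf", "tiiuae/falcon-7b"],
    "generate.latency(s)": [12.3, 14.1],
})
scores_df = pd.DataFrame({
    "best_scored_model": ["meta-llama/Llama-2-7b-hf", "tiiuae/falcon-7b"],
    "model_type": ["llama", "RefinedWebModel"],
    "weight_class": [7_000_000_000, 7_000_000_000],
    "best_score": [54.3, 47.9],
})

# inner join: only benchmarked models that also have a grouped Open LLM score
# survive, which subsumes the old bench_df["score"].notna() filter
merged = bench_df.merge(scores_df, left_on="model", right_on="best_scored_model")
print(merged[["model", "model_type", "best_score"]])
```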
@@ -101,17 +99,19 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
 
 def get_benchmark_table(bench_df):
     # filter
-    bench_df = bench_df[list(…
+    bench_df = bench_df[list(ALL_COLUMNS_MAPPING.keys())]
     # rename
-    bench_df.rename(columns=…
+    bench_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
     # sort
     bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
     # transform
-    bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(…
-    bench_df["…
-    …
+    bench_df["Model Type 🤗"] = bench_df["Model Type 🤗"].apply(process_model_type)
+    bench_df["Weight Class 🏋️"] = bench_df["Weight Class 🏋️"].apply(
+        process_weight_class
+    )
+    bench_df["Best Scored Model 🏆"] = bench_df["Best Scored Model 🏆"].apply(
+        process_model_name
     )
-    …
     return bench_df
 
 
@@ -122,12 +122,12 @@ def get_benchmark_plot(bench_df):
     fig = px.scatter(
         bench_df,
         x="generate.latency(s)",
-        y="score",
+        y="best_score",
         color="model_type",
         symbol="backend.name",
         size="forward.peak_memory(MB)",
         custom_data=[
-            "model",
+            "best_scored_model",
             "backend.name",
             "backend.torch_dtype",
             "optimizations",
@@ -158,12 +158,12 @@ def get_benchmark_plot(bench_df):
         [
             "Model: %{customdata[0]}",
             "Backend: %{customdata[1]}",
-            "Datatype: %{customdata[2]}",
+            "Load Datatype: %{customdata[2]}",
             "Optimizations: %{customdata[3]}",
             "Peak Memory (MB): %{customdata[4]}",
             "Throughput (tokens/s): %{customdata[5]}",
             "Per 1000 Tokens Latency (s): %{x}",
-            "Open LLM Score: %{y}",
+            "Open LLM Score (%): %{y}",
         ]
     )
 )
@@ -183,7 +183,7 @@ def filter_query(
     raw_df = get_benchmark_df(benchmark=benchmark)
 
     filtered_df = raw_df[
-        raw_df["model"].str.lower().str.contains(text.lower())
+        raw_df["best_scored_model"].str.lower().str.contains(text.lower())
         & raw_df["backend.name"].isin(backends)
         & raw_df["backend.torch_dtype"].isin(datatypes)
         & (
@@ -197,7 +197,7 @@ def filter_query(
             if len(optimizations) > 0
             else True
         )
-        & (raw_df["score"] >= score)
+        & (raw_df["best_score"] >= score)
         & (raw_df["forward.peak_memory(MB)"] <= memory)
     ]
 
@@ -223,18 +223,18 @@ with demo:
 
     # leaderboard tabs
     with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
-        with gr.TabItem("🖥️ A100-80GB Leaderboard Table …
+        with gr.TabItem("🖥️ A100-80GB Leaderboard Table 🏅", id=0):
             gr.HTML(A100_TEXT)
 
             # Original leaderboard table
             A100_leaderboard = gr.components.Dataframe(
                 value=A100_table,
-                datatype=…
-                headers=list(…
+                datatype=ALL_COLUMNS_DATATYPES,
+                headers=list(ALL_COLUMNS_MAPPING.values()),
                 elem_id="1xA100-table",
             )
 
-        with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=…
+        with gr.TabItem("🖥️ A100-80GB Interactive Plot 📊", id=2):
             gr.HTML(A100_TEXT)
 
             # Original leaderboard plot
@@ -244,7 +244,7 @@ with demo:
                 show_label=False,
             )
 
-        with gr.TabItem("🎮 Control Panel 🎛️", id=…
+        with gr.TabItem("🎮 Control Panel 🎛️", id=3):
            # control panel interface
            with gr.Row():
                with gr.Column(scale=1):
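Note: the tab hunks in this file re-number the explicit TabItem ids (0, 2, 3, 4, leaving 1 unused), which is what helpers like change_tab select against. A stripped-down sketch of the tab skeleton using the same ids; labels and bodies are placeholders, not the app's real content:

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
        with gr.TabItem("Leaderboard Table", id=0):
            gr.HTML("<p>leaderboard table goes here</p>")
        with gr.TabItem("Interactive Plot", id=2):
            gr.HTML("<p>scatter plot goes here</p>")
        with gr.TabItem("Control Panel", id=3):
            gr.HTML("<p>filter controls go here</p>")
        with gr.TabItem("About", id=4):
            gr.Markdown("about text")

# demo.launch()
```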
@@ -304,8 +304,8 @@ with demo:
                 elem_id="filter-button",
             )
 
-        with gr.TabItem("❔ About 📖", id=…
-            gr.Markdown(…
+        with gr.TabItem("❔ About 📖", id=4):
+            gr.Markdown(ABOUT_TEXT)
 
     demo.load(
         change_tab,
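Note: the reworked get_benchmark_table above is a four-step pipeline: select the mapped columns, rename them to display headers, sort by SORTING_COLUMN, then prettify cells. A self-contained sketch with a two-column mapping and invented values; str.title stands in for the process_* helpers defined in src/utils.py:

```python
import pandas as pd

ALL_COLUMNS_MAPPING = {
    "model_type": "Model Type 🤗",
    "best_score": "Score (%) ⬆️",
}
SORTING_COLUMN = ["Score (%) ⬆️"]

df = pd.DataFrame({
    "model_type": ["llama", "mpt"],
    "best_score": [54.3, 47.9],
    "extra_column": [1, 2],  # dropped by the filter step
})

df = df[list(ALL_COLUMNS_MAPPING.keys())].copy()  # filter
df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)  # rename
df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)  # sort
df["Model Type 🤗"] = df["Model Type 🤗"].apply(str.title)  # transform (stand-in)
print(df)
```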
src/assets/css_html_js.py
CHANGED
@@ -25,8 +25,8 @@ custom_css = """
     border: none;
 }
 
-table td:…
-table th:…
+table td:nth-child(3),
+table th:nth-child(3) {
     max-width: 300px;
     overflow: auto;
     white-space: nowrap;
src/assets/text_content.py
CHANGED
@@ -15,14 +15,12 @@ A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
 </ul>
 """
 
-…
-…
-…
-…
-…
-…
-<li>Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.</li>
-</ul>
+ABOUT_TEXT = """<h3>About the benchmarks:</h3>
+- The performance benchmarks were obtained using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
+- Throughput is measured in tokens per second when generating 1000 tokens with a batch size of 1.
+- Peak memory is measured in MB during the first forward pass of the model (no warmup).
+- Open LLM Score is an average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+- Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy) translating the tradeoff between latency and accuracy.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
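Note: the "Open LLM Tradeoff" bullet kept in ABOUT_TEXT describes the composite score this commit drops from the table. As a worked sketch, matching the formula removed from get_benchmark_df (input values invented):

```python
def open_llm_tradeoff(score, latency_s):
    """Euclidean distance to the "perfect LLM" at (0 s latency, 100% score); lower is better."""
    score_distance = 100 - score
    latency_distance = latency_s
    return round((score_distance**2 + latency_distance**2) ** 0.5, 2)

print(open_llm_tradeoff(score=54.3, latency_s=12.3))  # 47.33
```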
src/utils.py
CHANGED
@@ -22,70 +22,55 @@ def restart_space(LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN):
 
 
 def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
-    …
+    llm_perf_dataset_repo = None
     if OPTIMUM_TOKEN:
         print("Loading LLM-Perf-Dataset from Hub...")
-        …
+        llm_perf_dataset_repo = Repository(
             local_dir="./llm-perf-dataset",
             clone_from=LLM_PERF_DATASET_REPO,
             token=OPTIMUM_TOKEN,
             repo_type="dataset",
         )
-        …
+        llm_perf_dataset_repo.git_pull()
 
-    return …
+    return llm_perf_dataset_repo
 
 
-…
-    "…
-    "…
-    "…
-    "…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+LLM_MODEL_TYPES = {
+    "gpt_bigcode": "GPT-BigCode 🌸",
+    "RefinedWebModel": "Falcon 🦅",
+    "RefinedWeb": "Falcon 🦅",
+    "baichuan": "Baichuan 🌊",
+    "llama": "LLaMA 🦙",
+    "gpt_neox": "GPT-NeoX",
+    "gpt_neo": "GPT-Neo",
+    "codegen": "CodeGen",
+    "chatglm": "ChatGLM",
+    "gpt2": "GPT-2",
+    "gptj": "GPT-J",
+    "xglm": "XGLM",
+    "opt": "OPT",
+    "mpt": "MPT",
+}
 
 
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def …
+def process_model_name(model_name):
     link = f"https://huggingface.co/{model_name}"
-    …
-    if model_name in LLAMAS:
-        link = LLAMA_LINK
-        model_name = model_name.split("/")[1]
-    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = VICUNA_LINK
-        model_name = "stable-vicuna-13b"
-    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = ALPACA_LINK
-        model_name = "alpaca-13b"
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
-    …
     return model_hyperlink(link, model_name)
 
 
-def …
-    …
-    …
+def process_model_type(model_type):
+    if model_type in LLM_MODEL_TYPES:
+        return LLM_MODEL_TYPES[model_type]
+    else:
+        return model_type
 
 
-def …
+def process_weight_class(num):
     if num < 1000:
         return str(int(num))
     elif num < 1000000:
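Note: the diff cuts off inside process_weight_class. The visible branches suggest a parameter-count humanizer; a hedged guess at the full function, where everything past the shown elif is an assumption:

```python
def process_weight_class(num):
    # shown in the diff
    if num < 1000:
        return str(int(num))
    elif num < 1000000:
        return str(int(num / 1000)) + "K"  # assumed suffix
    # assumed continuation
    elif num < 1000000000:
        return str(int(num / 1000000)) + "M"
    else:
        return str(int(num / 1000000000)) + "B"

print(process_weight_class(7_000_000_000))  # 7B
```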