BenchmarkBot committed • commit 9e3eaf4 • 1 parent: df1a500

remove about

Changed files:
- app.py (+6 -14)
- src/assets/text_content.py (+5 -12)
app.py
CHANGED
@@ -9,7 +9,6 @@ from src.assets.text_content import (
     TITLE,
     INTRODUCTION_TEXT,
     A100_TEXT,
-    ABOUT_TEXT,
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
 )
@@ -29,16 +28,14 @@ OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
 
 ALL_COLUMNS_MAPPING = {
-    "best_scored_model": "Best Scored Model 🏆",
-    "model_type": "Type 🤗",
-    "weight_class": "Class 🏋️",
+    "best_scored_model": "Best Scored LLM 🏆",
+    "model_type": "LLM Type 🤗",
+    "weight_class": "Weight Class 🏋️",
     #
     "backend.name": "Backend 🏭",
-    "backend.torch_dtype": "Datatype 📥",
+    "backend.torch_dtype": "Load Datatype 📥",
     "optimizations": "Optimizations 🛠️",
     #
-    # "tradeoff": "Tradeoff* ⬇️",
-    #
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "best_score": "Score (%) ⬆️",
@@ -53,8 +50,6 @@ ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
     #
-    # "number",
-    #
     "number",
     "number",
     "number",
@@ -112,8 +107,8 @@ def get_benchmark_table(bench_df):
     # rename
     bench_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
     # transform
-    bench_df["Type 🤗"] = bench_df["Type 🤗"].apply(process_model_type)
-    bench_df["Class 🏋️"] = bench_df["Class 🏋️"].apply(
+    bench_df["LLM Type 🤗"] = bench_df["LLM Type 🤗"].apply(process_model_type)
+    bench_df["Weight Class 🏋️"] = bench_df["Weight Class 🏋️"].apply(
         process_weight_class
     )
     bench_df["Best Scored Model 🏆"] = bench_df["Best Scored Model 🏆"].apply(
@@ -309,9 +304,6 @@ with demo:
                 elem_id="filter-button",
             )
 
-        with gr.TabItem("❔ About 📖", id=4):
-            gr.HTML(ABOUT_TEXT)
-
     demo.load(
         change_tab,
         A100_tabs,
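For context on the get_benchmark_table hunk above: the display columns come from ALL_COLUMNS_MAPPING via pandas' rename, then get post-processed with apply. Below is a minimal sketch of that rename/transform step; the sample data and the process_model_type stand-in are illustrative assumptions, since the real helper is not part of this diff.

```python
import pandas as pd

# Illustrative subset of the mapping introduced by this commit.
ALL_COLUMNS_MAPPING = {
    "model_type": "LLM Type 🤗",
    "weight_class": "Weight Class 🏋️",
}

# Hypothetical stand-in: the real process_model_type is defined elsewhere in app.py.
def process_model_type(model_type: str) -> str:
    return model_type.replace("_", " ").title()

bench_df = pd.DataFrame({"model_type": ["fine_tuned"], "weight_class": ["7B"]})

# rename: raw benchmark columns -> display names
bench_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)

# transform: post-process a display column in place
bench_df["LLM Type 🤗"] = bench_df["LLM Type 🤗"].apply(process_model_type)

print(bench_df)  # LLM Type 🤗 = "Fine Tuned", Weight Class 🏋️ = "7B"
```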
src/assets/text_content.py
CHANGED
@@ -10,18 +10,11 @@ Anyone from the community can request a model or a hardware/backend/optimization
 
 A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
 <ul>
-    <li>
-    <li>
-</ul>
-"""
-
-ABOUT_TEXT = """<h3>About the benchmarks:</h3>
-<ul>
-    <li>The performance benchmarks were obtained using <a href="https://github.com/huggingface/optimum-benchmark">Optimum-Benchmark</a>.</li>
-    <li>Throughput is measured in tokens per second when generating 1000 tokens with a batch size of 1.</li>
-    <li>Peak memory is measured in MB during the first forward pass of the model (no warmup).</li>
-    <li>Open LLM Score is an average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-    <li>Open LLM Tradeoff is the euclidean distance between an LLM and the "perfect LLM" (i.e. 0 latency and 100% accuracy), translating the tradeoff between latency and accuracy.</li>
+    <li>LLMs are evaluated on a singleton batch, generating a thousand tokens.</li>
+    <li>Peak memory is measured in MB during the first forward pass of the LLM (no warmup).</li>
+    <li>Each pair of (LLM Type, Weight Class) is represented by the best scored LLM; this LLM is the one used for all the hardware/backend/optimization experiments.</li>
+    <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
+    <li>Ranking is based on the euclidean distance from the "perfect LLM" (i.e. 0 latency and 100% accuracy).</li>
 </ul>
 """
 
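The "Ranking" bullet above refers to the euclidean distance from a "perfect LLM" sitting at 0 latency and 100% accuracy. A minimal sketch of that computation, assuming latency is the time to generate the 1000 benchmark tokens; the leaderboard's exact units and any normalization are not shown in this diff.

```python
import math

def perfect_llm_distance(latency_s: float, score_pct: float) -> float:
    # The "perfect LLM" sits at (0 latency, 100% accuracy);
    # a smaller distance means a better latency/accuracy tradeoff.
    return math.sqrt(latency_s**2 + (100.0 - score_pct) ** 2)

# e.g. 1000 tokens at 40 tokens/s -> 25 s latency, with a 60% Open LLM score
latency_s = 1000 / 40
print(perfect_llm_distance(latency_s, 60.0))  # ≈ 47.2
```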