BenchmarkBot committed
Commit e2e1ee9 • 1 Parent(s): 5a7b8dd
move model name to the end of table
- app.py +8 -8
- src/assets/css_html_js.py +0 -7
- src/assets/text_content.py +5 -9
app.py CHANGED
@@ -27,38 +27,38 @@ LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
 ALL_COLUMNS_MAPPING = {
-    "weight_class": "Class 🏋️",
-    "model_type": "Type 🤗",
-    "best_scored_model": "Best Scored LLM 🏆",
+    "weight_class": "Class 🏋️",
+    "model_type": "Type 🤗",
     #
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 🔥",
     "quantization": "Quantization 🗜️",
     "optimizations": "Optimizations 🛠️",
     #
-    "best_score": "Best Score (%) ⬆️",
     "generate.peak_memory(MB)": "Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "generate.energy_consumption(kWh/token)": "Energy (kWh/token) ⬇️",
+    "best_score": "Best Score (%) ⬆️",
     #
+    "best_scored_model": "Best Scored LLM 🏆",
 }
 ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
-    "markdown",
     #
     "str",
     "str",
     "str",
     "str",
     #
-    "str",
     "number",
     "number",
     "number",
+    "str",
     #
+    "markdown",
 ]
-SORTING_COLUMN = ["
+SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
 
@@ -110,7 +110,7 @@ def get_benchmark_table(bench_df):
     # add * to quantized models score since we can't garantee the score is the same
     copy_df["best_score"] = copy_df.apply(
         lambda x: f"{x['best_score']}**"
-        if x["backend.quantization_strategy"]
+        if x["backend.quantization_strategy"] in ["bnb", "gptq"]
         else x["best_score"],
         axis=1,
     )
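For context, here is a minimal, hypothetical sketch of what the reordered columns and the tightened quantization check amount to, assuming a pandas DataFrame with the same column names as the benchmark dataset (the rows and the model names below are made up for illustration):

```python
import pandas as pd

# Hypothetical benchmark rows (not real leaderboard data).
copy_df = pd.DataFrame(
    {
        "best_scored_model": ["org/model-a", "org/model-b"],
        "best_score": [47.2, 45.9],
        "backend.quantization_strategy": [None, "gptq"],
        "generate.throughput(tokens/s)": [35.1, 52.4],
    }
)

# Same idea as the hunk above: flag the score of quantized runs with "**",
# since quantization may change the evaluation score.
copy_df["best_score"] = copy_df.apply(
    lambda x: f"{x['best_score']}**"
    if x["backend.quantization_strategy"] in ["bnb", "gptq"]
    else x["best_score"],
    axis=1,
)

# Sort by the new SORTING_COLUMN, then select/rename for display so that
# "Best Scored LLM" ends up as the last column, mirroring ALL_COLUMNS_MAPPING.
display_mapping = {
    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
    "best_score": "Best Score (%) ⬆️",
    "best_scored_model": "Best Scored LLM 🏆",
}
table = (
    copy_df.sort_values(by="generate.throughput(tokens/s)", ascending=False)
    [list(display_mapping.keys())]
    .rename(columns=display_mapping)
)
print(table)
```

The app itself presumably drives this through the module-level constants above; the snippet only mirrors the ordering change, not the full table-building code.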
src/assets/css_html_js.py CHANGED
@@ -16,13 +16,6 @@ custom_css = """
     transform: scale(1.3);
 }
 
-table td:first-child,
-table th:first-child {
-    max-width: 300px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
 .hardware-tabs button {
     font-size: 20px;
 }
src/assets/text_content.py CHANGED
@@ -1,7 +1,7 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency &
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput & memory) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically.
@@ -11,11 +11,10 @@ Anyone from the community can request a model or a hardware/backend/optimization
 ABOUT_TEXT = """<h3>About the 🤗 Open LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
-    <li>LLMs are evaluated on a singleton batch and generating 1000 tokens.</li>
-    <li>Peak memory is measured in MB during the
+    <li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
+    <li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
     <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-    <li>Ranking is based on a composite metric which is the euclidean distance from the "Perfect LLM" (i.e. 0 latency and 100% accuracy).</li>
 </ul>
 """
 
@@ -39,9 +38,6 @@ hydra:
 experiment_name: {experiment_name}
 
 model: {model}
-hub_kwargs:
-  revision: {revision}
-  trust_remote_code: {trust_remote_code}
 
 device: cuda
 
@@ -49,7 +45,7 @@ backend:
   no_weights: true
   delete_cache: true
   torch_dtype: float16
-
+  quantization_strategy: gptq
   bettertransformer: true
 
 benchmark:
@@ -57,7 +53,7 @@ benchmark:
 
   input_shapes:
     batch_size: 1
-    sequence_length:
+    sequence_length: 512
 
   new_tokens: 1000
 ```
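The updated about text says that peak memory is measured in MB with py3nvml during the generate pass. Below is a minimal sketch of that kind of measurement, assuming a single CUDA device and any callable workload; the function name and the polling approach are illustrative only, not the leaderboard's actual implementation (which lives in Optimum-Benchmark):

```python
import threading
import time

from py3nvml import py3nvml  # pip install py3nvml


def measure_peak_memory_mb(workload, device_index: int = 0, interval_s: float = 0.01) -> float:
    """Poll GPU memory with py3nvml while `workload()` runs and return the peak in MB."""
    py3nvml.nvmlInit()
    handle = py3nvml.nvmlDeviceGetHandleByIndex(device_index)
    peak_bytes = 0
    stop = threading.Event()

    def poll():
        nonlocal peak_bytes
        while not stop.is_set():
            info = py3nvml.nvmlDeviceGetMemoryInfo(handle)
            peak_bytes = max(peak_bytes, info.used)
            time.sleep(interval_s)

    poller = threading.Thread(target=poll, daemon=True)
    poller.start()
    try:
        workload()  # e.g. model.generate(**inputs, max_new_tokens=1000)
    finally:
        stop.set()
        poller.join()
        py3nvml.nvmlShutdown()

    return peak_bytes / 1e6  # bytes -> MB
```

The GPU isolation mentioned in the same bullet would additionally mean verifying that no other process is using the device, for example by checking nvmlDeviceGetComputeRunningProcesses before the run.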