BenchmarkBot committed
Commit e2e1ee9 • 1 Parent(s): 5a7b8dd

move model name to the end of table

app.py CHANGED
@@ -27,38 +27,38 @@ LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
 ALL_COLUMNS_MAPPING = {
-    "weight_class": "Weight Class 🏋️",
-    "model_type": "LLM Type 🤗",
-    "best_scored_model": "Best Scored LLM 🏆",
+    "weight_class": "Class 🏋️",
+    "model_type": "Type 🤗",
     #
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "quantization": "Quantization 🗜️",
     "optimizations": "Optimizations 🛠️",
     #
-    "best_score": "Best Score (%) ⬆️",
     "generate.peak_memory(MB)": "Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "generate.energy_consumption(kWh/token)": "Energy (kWh/token) ⬇️",
+    "best_score": "Best Score (%) ⬆️",
     #
+    "best_scored_model": "Best Scored LLM 🏆",
 }
 ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
-    "markdown",
     #
     "str",
     "str",
     "str",
     "str",
     #
-    "str",
     "number",
     "number",
     "number",
+    "str",
     #
+    "markdown",
 ]
-SORTING_COLUMN = ["perf_distance"]
+SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
 
@@ -110,7 +110,7 @@ def get_benchmark_table(bench_df):
     # add * to quantized models score since we can't garantee the score is the same
     copy_df["best_score"] = copy_df.apply(
         lambda x: f"{x['best_score']}**"
-        if x["backend.quantization_strategy"]
+        if x["backend.quantization_strategy"] in ["bnb", "gptq"]
         else x["best_score"],
         axis=1,
     )
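For context, a minimal sketch of how the reordered ALL_COLUMNS_MAPPING presumably moves the model name to the end of the table: the mapping's insertion order drives column selection and renaming, so placing "best_scored_model" last pushes it to the last column. Only the mapping, SORTING_COLUMN, and the score-starring lambda appear in the diff above; the column-selection/rename step and the descending sort are assumptions for illustration.

```python
import pandas as pd

# Display names, in the order the table should show them; "best_scored_model"
# now comes last, which moves the model name to the end of the table.
ALL_COLUMNS_MAPPING = {
    "weight_class": "Class 🏋️",
    "model_type": "Type 🤗",
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Dtype 📥",
    "quantization": "Quantization 🗜️",
    "optimizations": "Optimizations 🛠️",
    "generate.peak_memory(MB)": "Memory (MB) ⬇️",
    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
    "generate.energy_consumption(kWh/token)": "Energy (kWh/token) ⬇️",
    "best_score": "Best Score (%) ⬆️",
    "best_scored_model": "Best Scored LLM 🏆",
}
SORTING_COLUMN = ["generate.throughput(tokens/s)"]


def get_benchmark_table(bench_df: pd.DataFrame) -> pd.DataFrame:
    copy_df = bench_df.copy()
    # add * to quantized models' score since the score is not guaranteed to match
    copy_df["best_score"] = copy_df.apply(
        lambda x: f"{x['best_score']}**"
        if x["backend.quantization_strategy"] in ["bnb", "gptq"]
        else x["best_score"],
        axis=1,
    )
    # assumed: sort by throughput (descending), then select and rename columns
    # in the mapping's insertion order
    copy_df = copy_df.sort_values(by=SORTING_COLUMN, ascending=False)
    return copy_df[list(ALL_COLUMNS_MAPPING.keys())].rename(columns=ALL_COLUMNS_MAPPING)
```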
src/assets/css_html_js.py CHANGED
@@ -16,13 +16,6 @@ custom_css = """
     transform: scale(1.3);
 }
 
-table td:first-child,
-table th:first-child {
-    max-width: 300px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
 .hardware-tabs button {
     font-size: 20px;
 }
src/assets/text_content.py CHANGED
@@ -1,7 +1,7 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput & memory) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically.
@@ -11,11 +11,10 @@ Anyone from the community can request a model or a hardware/backend/optimization
 ABOUT_TEXT = """<h3>About the 🤗 Open LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
-<li>LLMs are evaluated on a singleton batch and generating 1000 tokens.</li>
-<li>Peak memory is measured in MB during the first forward pass of the LLM (no warmup).</li>
+<li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
+<li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
 <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>Ranking is based on a composite metric which is the euclidean distance from the "Perfect LLM" (i.e. 0 latency and 100% accuracy).</li>
 </ul>
 """
@@ -39,9 +38,6 @@ hydra:
 experiment_name: {experiment_name}
 
 model: {model}
-hub_kwargs:
-  revision: {revision}
-  trust_remote_code: {trust_remote_code}
 
 device: cuda
 
@@ -49,7 +45,7 @@ backend:
   no_weights: true
   delete_cache: true
   torch_dtype: float16
-  load_in_4bit: true
+  quantization_strategy: gptq
   bettertransformer: true
 
 benchmark:
@@ -57,7 +53,7 @@ benchmark:
 
   input_shapes:
     batch_size: 1
-    sequence_length: 1
+    sequence_length: 512
 
   new_tokens: 1000
 ```
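The updated ABOUT_TEXT states that peak memory is measured in MB during the generate pass with py3nvml on an isolated GPU. Below is a minimal sketch of that kind of measurement, assuming GPU index 0 and a caller-supplied `generate_fn` (e.g. a lambda wrapping `model.generate`); the polling-thread approach and interval are illustrative, not necessarily how Optimum-Benchmark implements it.

```python
import threading
import time

from py3nvml import py3nvml as nvml


def peak_memory_during(generate_fn, device_index: int = 0, interval_s: float = 0.01) -> float:
    """Run generate_fn() while polling GPU memory with py3nvml; return peak usage in MB."""
    nvml.nvmlInit()
    handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
    peak_bytes = 0
    stop = threading.Event()

    def poll():
        nonlocal peak_bytes
        while not stop.is_set():
            info = nvml.nvmlDeviceGetMemoryInfo(handle)
            peak_bytes = max(peak_bytes, info.used)
            time.sleep(interval_s)

    poller = threading.Thread(target=poll, daemon=True)
    poller.start()
    try:
        generate_fn()  # e.g. lambda: model.generate(**inputs, max_new_tokens=1000)
    finally:
        stop.set()
        poller.join()
        nvml.nvmlShutdown()
    # report in MB, matching the leaderboard's "Memory (MB)" column
    return peak_bytes / 1e6
```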