IlyasMoutawwakil committed
Commit f3dc796 • 1 Parent(s): 29e37fd

Files changed (2):
  1. app.py +4 -6
  2. src/assets/text_content.py +8 -6
app.py CHANGED

@@ -24,12 +24,14 @@ ALL_COLUMNS_MAPPING = {
     # model
     "Model": "Model 🤗",
     "Arch": "Arch 🏛️",
-    "Size": "Size 🏋️",
+    "Size": "Size 📏",
     # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     "quantization": "Quantization 🗜️",
+    # quality measurements
+    "Score": "Avg Score (%) ⬆️",
     # throughput measurements
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
@@ -42,8 +44,6 @@ ALL_COLUMNS_MAPPING = {
     "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
     # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
-    # quality measurements
-    "Score": "Avg Score (%) ⬆️",
 }
 SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
 SORTING_ASCENDING = [False, True]
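Note: the two constants above drive the table view. Below is a minimal sketch of how the mapping and sort settings would plausibly be applied to a raw results DataFrame; the dummy rows and the "sort on raw names, then rename" wiring are assumptions, since app.py's table-building code is not part of this diff.

```python
import pandas as pd

# Subset of ALL_COLUMNS_MAPPING: raw optimum-benchmark report columns
# on the left, leaderboard display names on the right.
COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "Score": "Avg Score (%) ⬆️",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
}
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
SORTING_ASCENDING = [False, True]

# Hypothetical raw rows, for illustration only.
raw_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Score": [61.2, 57.8],
        "generate.throughput(tokens/s)": [34.5, 41.0],
    }
)

# Sort on the raw column names, then rename them to display names.
table_df = raw_df.sort_values(
    by=SORTING_COLUMN, ascending=SORTING_ASCENDING
).rename(columns=COLUMNS_MAPPING)
print(table_df)
```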
@@ -148,9 +148,7 @@ def get_benchmark_chart(bench_df):
     copy_df = bench_df.copy()
     # transform
     copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
-    # filter latency bigger than 100s
-    # copy_df = copy_df[copy_df["E2E Latency (s) ⬇️"] <= 100]
-
+    # plot
     fig = px.scatter(
         copy_df,
         y="Avg Score (%) ⬆️",
src/assets/text_content.py CHANGED

@@ -12,7 +12,7 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>LLMs are running on a singleton batch with a prompt size of 512 and generating a 1000 tokens.</li>
+<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
 <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
 <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
 <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
@@ -44,19 +44,21 @@ device: cuda
 
 backend:
   no_weights: true
-  delete_cache: true
   torch_dtype: float16
-  quantization_strategy: gptq
   bettertransformer: true
+  quantization_scheme: gptq
+
 
 benchmark:
   memory: true
-
+  energy: true
+
+  new_tokens: 1000
   input_shapes:
     batch_size: 1
-    sequence_length: 512
+    sequence_length: 256
+
 
-  new_tokens: 1000
 ```
 """
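Note: the YAML above is consumed by optimum-benchmark's own config machinery; as a quick standalone check that the edited snippet parses and has the intended structure, it can be loaded with PyYAML. The constant name and the use of PyYAML here are illustrative, not part of the repository.

```python
import yaml  # pip install pyyaml

# The example config as it reads after this commit; `device` comes from
# the hunk context, the rest from the added and kept lines above.
EXAMPLE_CONFIG = """
device: cuda

backend:
  no_weights: true
  torch_dtype: float16
  bettertransformer: true
  quantization_scheme: gptq

benchmark:
  memory: true
  energy: true

  new_tokens: 1000
  input_shapes:
    batch_size: 1
    sequence_length: 256
"""

config = yaml.safe_load(EXAMPLE_CONFIG)

# Sanity-check the settings this commit touches.
assert config["backend"]["quantization_scheme"] == "gptq"
assert config["benchmark"]["energy"] is True
assert config["benchmark"]["input_shapes"]["sequence_length"] == 256
print(config)
```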