BenchmarkBot committed on
Commit
d574374
·
1 Parent(s): 3c37eb3

sort by score

Browse files
Files changed (2) hide show
  1. app.py +3 -2
  2. src/assets/text_content.py +4 -3
app.py CHANGED
@@ -58,7 +58,8 @@ ALL_COLUMNS_DATATYPES = [
58
  #
59
  "markdown",
60
  ]
61
- SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 
62
 
63
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
64
 
@@ -110,7 +111,7 @@ def get_benchmark_table(bench_df):
110
  axis=1,
111
  )
112
  # sort
113
- copy_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
114
  # filter
115
  copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
116
  # rename
 
58
  #
59
  "markdown",
60
  ]
61
+ SORTING_COLUMN = ["best_score"]
62
+ SORTING_ASCENDING = [False]
63
 
64
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
65
 
 
111
  axis=1,
112
  )
113
  # sort
114
+ copy_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
115
  # filter
116
  copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
117
  # rename
src/assets/text_content.py CHANGED
@@ -11,10 +11,11 @@ Anyone from the community can request a model or a hardware/backend/optimization
11
  ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
12
  <ul>
13
  <li>To avoid communication-dependent results, only one GPU is used.</li>
14
- <li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
15
- <li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
16
- <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
17
  <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
 
 
 
 
18
  </ul>
19
  """
20
 
 
11
  ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
12
  <ul>
13
  <li>To avoid communication-dependent results, only one GPU is used.</li>
 
 
 
14
  <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
15
+ <li>LLMs are running on a singleton batch with a prompt size of 512 and generating a 1000 tokens.</li>
16
+ <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
17
+ <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
18
+ <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
19
  </ul>
20
  """
21