BenchmarkBot committed on
Commit
b3a1bf0
·
1 Parent(s): 81f5492

filtering plot

Browse files
Files changed (2) hide show
  1. app.py +62 -49
  2. src/utils.py +0 -20
app.py CHANGED
@@ -5,7 +5,7 @@ import plotly.express as px
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
  from src.assets.text_content import TITLE, INTRODUCTION_TEXT, SINGLE_A100_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
8
- from src.utils import restart_space, load_dataset_repo, make_clickable_model, make_clickable_score, submit_query
9
  from src.assets.css_html_js import custom_css
10
 
11
 
@@ -16,10 +16,11 @@ OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
16
  COLUMNS_MAPPING = {
17
  "model": "Model 🤗",
18
  "backend.name": "Backend 🏭",
19
- "backend.torch_dtype": "Datatype 📥",
20
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
21
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
22
- "h4_score": "Average H4 Score ⬆️",
 
23
  }
24
  COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "markdown"]
25
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
@@ -28,7 +29,7 @@ SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
28
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
29
 
30
 
31
- def get_benchmark_df(benchmark):
32
  if llm_perf_dataset_repo:
33
  llm_perf_dataset_repo.git_pull()
34
 
@@ -39,41 +40,38 @@ def get_benchmark_df(benchmark):
39
  f"./llm-perf-dataset/reports/additional_data.csv")
40
  bench_df = bench_df.merge(scores_df, on="model", how="left")
41
 
42
- # preprocess
43
- bench_df["model"] = bench_df["model"].apply(make_clickable_model)
44
- bench_df["h4_score"] = bench_df["h4_score"].apply(make_clickable_score)
 
 
45
  # filter
46
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
47
  # rename
48
  bench_df.rename(columns=COLUMNS_MAPPING, inplace=True)
49
  # sort
50
  bench_df.sort_values(by=SORTING_COLUMN, ascending=False, inplace=True)
 
 
 
 
51
 
52
  return bench_df
53
 
54
 
55
- # Dataframes
56
- single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
57
-
58
-
59
- def get_benchmark_plot(benchmark):
60
- if llm_perf_dataset_repo:
61
- llm_perf_dataset_repo.git_pull()
62
-
63
- # load
64
- bench_df = pd.read_csv(
65
- f"./llm-perf-dataset/reports/{benchmark}.csv")
66
- scores_df = pd.read_csv(
67
- f"./llm-perf-dataset/reports/additional_data.csv")
68
- bench_df = bench_df.merge(scores_df, on="model", how="left")
69
 
 
70
  bench_df = bench_df[bench_df["generate.latency(s)"] < 100]
71
 
72
  fig = px.scatter(
73
- bench_df, x="h4_score", y="generate.latency(s)",
74
  color='model_type', symbol='backend.name', size='forward.peak_memory(MB)',
75
  custom_data=['model', 'backend.name', 'backend.torch_dtype',
76
  'forward.peak_memory(MB)', 'generate.throughput(tokens/s)'],
 
 
 
77
  )
78
 
79
  fig.update_layout(
@@ -83,11 +81,18 @@ def get_benchmark_plot(benchmark):
83
  'xanchor': 'center',
84
  'yanchor': 'top'
85
  },
86
- xaxis_title="Average H4 Score",
87
- yaxis_title="Latency per 1000 Tokens (s)",
88
- legend_title="Model Type, Backend",
89
- width=1200,
90
  height=600,
 
 
 
 
 
 
 
91
  )
92
 
93
  fig.update_traces(
@@ -97,16 +102,35 @@ def get_benchmark_plot(benchmark):
97
  "Datatype: %{customdata[2]}",
98
  "Peak Memory (MB): %{customdata[3]}",
99
  "Throughput (tokens/s): %{customdata[4]}",
100
- "Latency per 1000 Tokens (s): %{y}",
101
- "Average H4 Score: %{x}"
102
  ])
103
  )
104
 
105
  return fig
106
 
107
 
108
- # Plots
109
- single_A100_plot = get_benchmark_plot(benchmark="1xA100-80GB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  # Demo interface
112
  demo = gr.Blocks(css=custom_css)
@@ -142,7 +166,7 @@ with demo:
142
  elem_id="datatype-checkboxes",
143
  )
144
  threshold_slider = gr.Slider(
145
- label="Average H4 Score 📈",
146
  info="filter by minimum average H4 score",
147
  value=0.0,
148
  elem_id="threshold-slider",
@@ -161,28 +185,11 @@ with demo:
161
 
162
  # Original leaderboard table
163
  single_A100_leaderboard = gr.components.Dataframe(
164
- value=single_A100_df,
165
  datatype=COLUMNS_DATATYPES,
166
  headers=list(COLUMNS_MAPPING.values()),
167
  elem_id="1xA100-table",
168
  )
169
- # Dummy dataframe for search
170
- single_A100_for_search = gr.components.Dataframe(
171
- value=single_A100_df,
172
- datatype=COLUMNS_DATATYPES,
173
- headers=list(COLUMNS_MAPPING.values()),
174
- max_rows=None,
175
- visible=False,
176
- )
177
-
178
- submit_button.click(
179
- submit_query,
180
- [
181
- search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
182
- single_A100_for_search
183
- ],
184
- [single_A100_leaderboard]
185
- )
186
 
187
  with gr.TabItem("🖥️ A100-80GB Plot 📊", id=1):
188
  # Original leaderboard plot
@@ -195,6 +202,12 @@ with demo:
195
  show_label=False,
196
  )
197
 
 
 
 
 
 
 
198
  with gr.Row():
199
  with gr.Accordion("📙 Citation", open=False):
200
  citation_button = gr.Textbox(
 
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
  from src.assets.text_content import TITLE, INTRODUCTION_TEXT, SINGLE_A100_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
8
+ from src.utils import restart_space, load_dataset_repo, make_clickable_model, make_clickable_score
9
  from src.assets.css_html_js import custom_css
10
 
11
 
 
16
  COLUMNS_MAPPING = {
17
  "model": "Model 🤗",
18
  "backend.name": "Backend 🏭",
19
+ "backend.torch_dtype": "Load Dtype 📥",
20
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
21
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
22
+ "h4_score": "Average Open LLM Score ⬆️",
23
+
24
  }
25
  COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "markdown"]
26
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
 
29
  llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
30
 
31
 
32
+ def get_benchmark_df(benchmark="1xA100-80GB"):
33
  if llm_perf_dataset_repo:
34
  llm_perf_dataset_repo.git_pull()
35
 
 
40
  f"./llm-perf-dataset/reports/additional_data.csv")
41
  bench_df = bench_df.merge(scores_df, on="model", how="left")
42
 
43
+ return bench_df
44
+
45
+
46
+ def get_benchmark_table(bench_df):
47
+
48
  # filter
49
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
50
  # rename
51
  bench_df.rename(columns=COLUMNS_MAPPING, inplace=True)
52
  # sort
53
  bench_df.sort_values(by=SORTING_COLUMN, ascending=False, inplace=True)
54
+ # transform
55
+ bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(make_clickable_model)
56
+ bench_df["Average Open LLM Score ⬆️"] = bench_df["Average Open LLM Score ⬆️"].apply(
57
+ make_clickable_score)
58
 
59
  return bench_df
60
 
61
 
62
+ def get_benchmark_plot(bench_df):
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ # until falcon gets fixed / natively supported
65
  bench_df = bench_df[bench_df["generate.latency(s)"] < 100]
66
 
67
  fig = px.scatter(
68
+ bench_df, x="generate.latency(s)", y="h4_score",
69
  color='model_type', symbol='backend.name', size='forward.peak_memory(MB)',
70
  custom_data=['model', 'backend.name', 'backend.torch_dtype',
71
  'forward.peak_memory(MB)', 'generate.throughput(tokens/s)'],
72
+ symbol_sequence=['triangle-up', 'circle'],
73
+ # as many distinct colors as there are model_type,backend.name couples
74
+ color_discrete_sequence=px.colors.qualitative.Light24,
75
  )
76
 
77
  fig.update_layout(
 
81
  'xanchor': 'center',
82
  'yanchor': 'top'
83
  },
84
+ xaxis_title="Per 1000 Tokens Latency (s)",
85
+ yaxis_title="Average Open LLM Score",
86
+ legend_title="Model Type and Backend",
87
+ width=1000,
88
  height=600,
89
+ legend=dict(
90
+ orientation="h",
91
+ yanchor="bottom",
92
+ y=-0.35,
93
+ xanchor="center",
94
+ x=0.5
95
+ )
96
  )
97
 
98
  fig.update_traces(
 
102
  "Datatype: %{customdata[2]}",
103
  "Peak Memory (MB): %{customdata[3]}",
104
  "Throughput (tokens/s): %{customdata[4]}",
105
+ "Per 1000 Tokens Latency (s): %{y}",
106
+ "Average Open LLM Score: %{x}",
107
  ])
108
  )
109
 
110
  return fig
111
 
112
 
113
+ def filter_query(text, backends, datatypes, threshold, benchmark="1xA100-80GB"):
114
+
115
+ raw_df = get_benchmark_df(benchmark=benchmark)
116
+
117
+ filtered_df = raw_df[
118
+ raw_df["model"].str.lower().str.contains(text.lower()) &
119
+ raw_df["backend.name"].isin(backends) &
120
+ raw_df["Dbackend.torch_dtype"].isin(datatypes) &
121
+ (raw_df["h4_score"] >= threshold)
122
+ ]
123
+
124
+ filtered_table = get_benchmark_table(filtered_df)
125
+ filtered_plot = get_benchmark_plot(filtered_df)
126
+
127
+ return filtered_table, filtered_plot
128
+
129
+
130
+ # Dataframes
131
+ single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
132
+ single_A100_table = get_benchmark_table(single_A100_df)
133
+ single_A100_plot = get_benchmark_plot(single_A100_df)
134
 
135
  # Demo interface
136
  demo = gr.Blocks(css=custom_css)
 
166
  elem_id="datatype-checkboxes",
167
  )
168
  threshold_slider = gr.Slider(
169
+ label="Average Open LLM Score 📈",
170
  info="filter by minimum average H4 score",
171
  value=0.0,
172
  elem_id="threshold-slider",
 
185
 
186
  # Original leaderboard table
187
  single_A100_leaderboard = gr.components.Dataframe(
188
+ value=single_A100_table,
189
  datatype=COLUMNS_DATATYPES,
190
  headers=list(COLUMNS_MAPPING.values()),
191
  elem_id="1xA100-table",
192
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  with gr.TabItem("🖥️ A100-80GB Plot 📊", id=1):
195
  # Original leaderboard plot
 
202
  show_label=False,
203
  )
204
 
205
+ submit_button.click(
206
+ filter_query,
207
+ [search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider],
208
+ [single_A100_leaderboard]
209
+ )
210
+
211
  with gr.Row():
212
  with gr.Accordion("📙 Citation", open=False):
213
  citation_button = gr.Textbox(
src/utils.py CHANGED
@@ -66,23 +66,3 @@ def make_clickable_model(model_name):
66
  def make_clickable_score(score):
67
  link = f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
68
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{score}</a>'
69
-
70
-
71
- def extract_score_from_clickable(clickable_score) -> float:
72
- return float(re.findall(r"\d+\.\d+", clickable_score)[-1])
73
-
74
-
75
- def submit_query(text, backends, datatypes, threshold, raw_df):
76
- raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
77
- extract_score_from_clickable)
78
-
79
- filtered_df = raw_df[
80
- raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
81
- raw_df["Backend 🏭"].isin(backends) &
82
- raw_df["Datatype 📥"].isin(datatypes) &
83
- (raw_df["Average H4 Score ⬆️"] >= threshold)
84
- ]
85
-
86
- filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
87
- make_clickable_score)
88
- return filtered_df
 
66
  def make_clickable_score(score):
67
  link = f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
68
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{score}</a>'