
hanhainebula committed
Commit 93fda91
1 Parent(s): 394f64e

fix bugs - 0819


1) No hyperlink for "BM25" -> add one: src/envs.py now defines a shared BM25_LINK constant (BM25 rendered as a link to the Pyserini repository via model_hyperlink), and the leaderboard filters compare against it instead of the bare string "BM25" (a minimal sketch of the idea follows below).
2) Unique key of the results df -> set to the timestamp: get_raw_eval_results now keys loaded results by eval_result.timestamp instead of eval_name, so submissions that share an eval name no longer overwrite one another (see the sketch after the src/read_evals.py diff).
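A minimal sketch of the idea behind fix 1, not the Space's exact code: model_hyperlink is assumed here to wrap a name in a plain HTML anchor, and the "Retrieval Method" column name is a hypothetical stand-in for COL_NAME_RETRIEVAL_MODEL. Because the table cells store the rendered anchor rather than the plain name, the "Reranking Only" tabs have to filter on BM25_LINK.

import pandas as pd

def model_hyperlink(link: str, model_name: str) -> str:
    # Assumed shape of the helper in src/display/formatting.py: wrap a model
    # name in an HTML anchor so the leaderboard cell becomes a clickable link.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

# Shared constant, as added to src/envs.py in this commit.
BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")

# Toy leaderboard frame; "Retrieval Method" stands in for COL_NAME_RETRIEVAL_MODEL.
df = pd.DataFrame({
    "Retrieval Method": [
        BM25_LINK,
        model_hyperlink("https://huggingface.co/BAAI/bge-m3", "bge-m3"),
    ],
})

# Comparing against the bare string "BM25" would match nothing once the cell
# holds the rendered link, so the filter uses BM25_LINK.
reranking_only = df[df["Retrieval Method"] == BM25_LINK]
assert len(reranking_only) == 1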

app.py CHANGED
@@ -8,15 +8,57 @@ from src.about import (
     TITLE,
     EVALUATION_QUEUE_TEXT
 )
-from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
-    DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
+from src.benchmarks import (
+    DOMAIN_COLS_QA,
+    LANG_COLS_QA,
+    DOMAIN_COLS_LONG_DOC,
+    LANG_COLS_LONG_DOC,
+    METRIC_LIST,
+    DEFAULT_METRIC_QA,
+    DEFAULT_METRIC_LONG_DOC
+)
 from src.display.css_html_js import custom_css
-from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
-from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
-from src.read_evals import get_raw_eval_results, get_leaderboard_df
-from src.utils import update_metric, upload_file, get_default_cols, submit_results, reset_rank, remove_html
-from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, \
-    get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table, get_noreranking_dropdown
+from src.display.utils import (
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL
+)
+from src.envs import (
+    API,
+    EVAL_RESULTS_PATH,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN,
+    BM25_LINK,
+    BENCHMARK_VERSION_LIST,
+    LATEST_BENCHMARK_VERSION
+)
+from src.read_evals import (
+    get_raw_eval_results,
+    get_leaderboard_df
+)
+from src.utils import (
+    update_metric,
+    upload_file,
+    get_default_cols,
+    submit_results,
+    reset_rank,
+    remove_html
+)
+from src.display.gradio_formatting import (
+    get_version_dropdown,
+    get_search_bar,
+    get_reranking_dropdown,
+    get_metric_dropdown,
+    get_domain_dropdown,
+    get_language_dropdown,
+    get_anonymous_checkbox,
+    get_revision_and_ts_checkbox,
+    get_leaderboard_table,
+    get_noreranking_dropdown
+)
 from src.display.gradio_listener import set_listeners
 
 def restart_space():
@@ -32,7 +74,7 @@ except Exception as e:
     print(f'failed to download')
     restart_space()
 
-raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
+raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{LATEST_BENCHMARK_VERSION}")
 
 original_df_qa = get_leaderboard_df(
     raw_data, task='qa', metric=DEFAULT_METRIC_QA)
@@ -190,7 +232,7 @@ with demo:
                 queue=True
             )
         with gr.TabItem("Reranking Only", id=12):
-            lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == "BM25"]
+            lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             lb_df_reranker = reset_rank(lb_df_reranker)
             reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
             with gr.Row():
@@ -199,7 +241,7 @@ with demo:
                 with gr.Column(scale=1):
                     search_bar_reranker = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
-            hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == "BM25"]
+            hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
             hidden_lb_table_reranker = get_leaderboard_table(
                 hidden_lb_df_reranker, types_qa, visible=False
@@ -345,7 +387,7 @@ with demo:
             )
         with gr.TabItem("Reranking Only", id=22):
             lb_df_reranker_ldoc = leaderboard_df_long_doc[
-                leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == "BM25"
+                leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
             ]
             lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
             reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -355,7 +397,7 @@ with demo:
                 with gr.Column(scale=1):
                     search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
-            hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == "BM25"]
+            hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
             hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                 hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
@@ -414,8 +456,8 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     benchmark_version = gr.Dropdown(
-                        ["AIR-Bench_24.04", ],
-                        value="AIR-Bench_24.04",
+                        BENCHMARK_VERSION_LIST,
+                        value=LATEST_BENCHMARK_VERSION,
                         interactive=True,
                         label="AIR-Bench Version")
             with gr.Row():
src/display/formatting.py CHANGED
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
 
 def make_clickable_model(model_name: str, model_link: str):
     # link = f"https://huggingface.co/{model_name}"
-    if not model_link or not model_link.startswith("https://") or model_name == "BM25":
+    if not model_link or not model_link.startswith("https://"):
         return model_name
     return model_hyperlink(model_link, model_name)
 
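Dropping the model_name == "BM25" special case in make_clickable_model is the counterpart of the BM25_LINK change: BM25 entries are no longer forced back to plain text, so any row that carries a valid https:// link, now including BM25 with its Pyserini URL, is rendered as a hyperlink.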
src/display/gradio_formatting.py CHANGED
@@ -1,10 +1,10 @@
 import gradio as gr
-
+from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
 
 def get_version_dropdown():
     return gr.Dropdown(
-        choices=["AIR-Bench_24.04", ],
-        value="AIR-Bench_24.04",
+        choices=BENCHMARK_VERSION_LIST,
+        value=LATEST_BENCHMARK_VERSION,
         label="Select the version of AIR-Bench",
         interactive=True
     )
src/envs.py CHANGED
@@ -1,5 +1,5 @@
 import os
-
+from display.formatting import model_hyperlink
 from huggingface_hub import HfApi
 
 # Info to change for your repository
@@ -22,3 +22,12 @@ CACHE_PATH = os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval_results")
 
 API = HfApi(token=TOKEN)
+
+BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
+
+BENCHMARK_VERSION_LIST = [
+    "AIR-Bench_24.04",
+    # "AIR-Bench_24.05",
+]
+
+LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
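Keeping the supported versions in one BENCHMARK_VERSION_LIST and deriving LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1] keeps the version dropdowns, the default results path in app.py, and the default version in submit_results in sync; releasing a new version (such as the commented-out AIR-Bench_24.05) only requires appending it to the list.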
src/read_evals.py CHANGED
@@ -174,8 +174,8 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
             print(f"loading file failed. {model_result_filepath}")
             continue
         print(f'file loaded: {model_result_filepath}')
-        eval_name = eval_result.eval_name
-        eval_results[eval_name] = eval_result
+        timestamp = eval_result.timestamp
+        eval_results[timestamp] = eval_result
 
     results = []
     for k, v in eval_results.items():
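To illustrate fix 2, a self-contained sketch (EvalResultStub and the values below are hypothetical stand-ins for FullEvalResult and real submissions): keying the collected results by eval_name keeps only the last file loaded for a given name, while keying by the per-submission timestamp keeps every entry.

from dataclasses import dataclass

@dataclass
class EvalResultStub:
    # Hypothetical stand-in for FullEvalResult, reduced to the two fields that matter here.
    eval_name: str
    timestamp: str

loaded = [
    EvalResultStub("bge-m3_NoReranker", "2024-08-18T10:00:00Z"),
    EvalResultStub("bge-m3_NoReranker", "2024-08-19T09:30:00Z"),  # resubmission with the same eval_name
]

by_name = {r.eval_name: r for r in loaded}       # old behaviour: the second entry overwrites the first
by_timestamp = {r.timestamp: r for r in loaded}  # new behaviour: both entries survive
assert len(by_name) == 1
assert len(by_timestamp) == 2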
src/utils.py CHANGED
@@ -10,7 +10,7 @@ from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, Benchmark
 from src.display.formatting import styled_message, styled_error
 from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
     COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
-from src.envs import API, SEARCH_RESULTS_REPO
+from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
 from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
 
 import re
@@ -251,7 +251,7 @@ def submit_results(
         model_url: str,
         reranking_model: str="",
         reranking_model_url: str="",
-        version: str="AIR-Bench_24.04",
+        version: str=LATEST_BENCHMARK_VERSION,
         is_anonymous=False):
     if not filepath.endswith(".zip"):
         return styled_error(f"file uploading aborted. wrong file type: {filepath}")