Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
hanhainebula
committed on
Commit
·
93fda91
1
Parent(s):
394f64e
fix bugs - 0819
Browse files
1) no hyperlink for "BM25" -> add hyperlink
2) unique key of df -> set to timestamp
- app.py +57 -15
- src/display/formatting.py +1 -1
- src/display/gradio_formatting.py +3 -3
- src/envs.py +10 -1
- src/read_evals.py +2 -2
- src/utils.py +2 -2
app.py
CHANGED
@@ -8,15 +8,57 @@ from src.about import (
|
|
8 |
TITLE,
|
9 |
EVALUATION_QUEUE_TEXT
|
10 |
)
|
11 |
-
from src.benchmarks import
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
from src.display.css_html_js import custom_css
|
14 |
-
from src.display.utils import
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from src.display.gradio_listener import set_listeners
|
21 |
|
22 |
def restart_space():
|
@@ -32,7 +74,7 @@ except Exception as e:
|
|
32 |
print(f'failed to download')
|
33 |
restart_space()
|
34 |
|
35 |
-
raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/
|
36 |
|
37 |
original_df_qa = get_leaderboard_df(
|
38 |
raw_data, task='qa', metric=DEFAULT_METRIC_QA)
|
@@ -190,7 +232,7 @@ with demo:
|
|
190 |
queue=True
|
191 |
)
|
192 |
with gr.TabItem("Reranking Only", id=12):
|
193 |
-
lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] ==
|
194 |
lb_df_reranker = reset_rank(lb_df_reranker)
|
195 |
reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
196 |
with gr.Row():
|
@@ -199,7 +241,7 @@ with demo:
|
|
199 |
with gr.Column(scale=1):
|
200 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
201 |
lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
|
202 |
-
hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] ==
|
203 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
204 |
hidden_lb_table_reranker = get_leaderboard_table(
|
205 |
hidden_lb_df_reranker, types_qa, visible=False
|
@@ -345,7 +387,7 @@ with demo:
|
|
345 |
)
|
346 |
with gr.TabItem("Reranking Only", id=22):
|
347 |
lb_df_reranker_ldoc = leaderboard_df_long_doc[
|
348 |
-
leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] ==
|
349 |
]
|
350 |
lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
|
351 |
reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
@@ -355,7 +397,7 @@ with demo:
|
|
355 |
with gr.Column(scale=1):
|
356 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
357 |
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
|
358 |
-
hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] ==
|
359 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
360 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
361 |
hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
|
@@ -414,8 +456,8 @@ with demo:
|
|
414 |
with gr.Row():
|
415 |
with gr.Column():
|
416 |
benchmark_version = gr.Dropdown(
|
417 |
-
|
418 |
-
value=
|
419 |
interactive=True,
|
420 |
label="AIR-Bench Version")
|
421 |
with gr.Row():
|
|
|
8 |
TITLE,
|
9 |
EVALUATION_QUEUE_TEXT
|
10 |
)
|
11 |
+
from src.benchmarks import (
|
12 |
+
DOMAIN_COLS_QA,
|
13 |
+
LANG_COLS_QA,
|
14 |
+
DOMAIN_COLS_LONG_DOC,
|
15 |
+
LANG_COLS_LONG_DOC,
|
16 |
+
METRIC_LIST,
|
17 |
+
DEFAULT_METRIC_QA,
|
18 |
+
DEFAULT_METRIC_LONG_DOC
|
19 |
+
)
|
20 |
from src.display.css_html_js import custom_css
|
21 |
+
from src.display.utils import (
|
22 |
+
COL_NAME_IS_ANONYMOUS,
|
23 |
+
COL_NAME_REVISION,
|
24 |
+
COL_NAME_TIMESTAMP,
|
25 |
+
COL_NAME_RERANKING_MODEL,
|
26 |
+
COL_NAME_RETRIEVAL_MODEL
|
27 |
+
)
|
28 |
+
from src.envs import (
|
29 |
+
API,
|
30 |
+
EVAL_RESULTS_PATH,
|
31 |
+
REPO_ID,
|
32 |
+
RESULTS_REPO,
|
33 |
+
TOKEN,
|
34 |
+
BM25_LINK,
|
35 |
+
BENCHMARK_VERSION_LIST,
|
36 |
+
LATEST_BENCHMARK_VERSION
|
37 |
+
)
|
38 |
+
from src.read_evals import (
|
39 |
+
get_raw_eval_results,
|
40 |
+
get_leaderboard_df
|
41 |
+
)
|
42 |
+
from src.utils import (
|
43 |
+
update_metric,
|
44 |
+
upload_file,
|
45 |
+
get_default_cols,
|
46 |
+
submit_results,
|
47 |
+
reset_rank,
|
48 |
+
remove_html
|
49 |
+
)
|
50 |
+
from src.display.gradio_formatting import (
|
51 |
+
get_version_dropdown,
|
52 |
+
get_search_bar,
|
53 |
+
get_reranking_dropdown,
|
54 |
+
get_metric_dropdown,
|
55 |
+
get_domain_dropdown,
|
56 |
+
get_language_dropdown,
|
57 |
+
get_anonymous_checkbox,
|
58 |
+
get_revision_and_ts_checkbox,
|
59 |
+
get_leaderboard_table,
|
60 |
+
get_noreranking_dropdown
|
61 |
+
)
|
62 |
from src.display.gradio_listener import set_listeners
|
63 |
|
64 |
def restart_space():
|
|
|
74 |
print(f'failed to download')
|
75 |
restart_space()
|
76 |
|
77 |
+
raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{LATEST_BENCHMARK_VERSION}")
|
78 |
|
79 |
original_df_qa = get_leaderboard_df(
|
80 |
raw_data, task='qa', metric=DEFAULT_METRIC_QA)
|
|
|
232 |
queue=True
|
233 |
)
|
234 |
with gr.TabItem("Reranking Only", id=12):
|
235 |
+
lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
236 |
lb_df_reranker = reset_rank(lb_df_reranker)
|
237 |
reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
238 |
with gr.Row():
|
|
|
241 |
with gr.Column(scale=1):
|
242 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
243 |
lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
|
244 |
+
hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
245 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
246 |
hidden_lb_table_reranker = get_leaderboard_table(
|
247 |
hidden_lb_df_reranker, types_qa, visible=False
|
|
|
387 |
)
|
388 |
with gr.TabItem("Reranking Only", id=22):
|
389 |
lb_df_reranker_ldoc = leaderboard_df_long_doc[
|
390 |
+
leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
|
391 |
]
|
392 |
lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
|
393 |
reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
|
|
397 |
with gr.Column(scale=1):
|
398 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
399 |
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
|
400 |
+
hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
401 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
402 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
403 |
hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
|
|
|
456 |
with gr.Row():
|
457 |
with gr.Column():
|
458 |
benchmark_version = gr.Dropdown(
|
459 |
+
BENCHMARK_VERSION_LIST,
|
460 |
+
value=LATEST_BENCHMARK_VERSION,
|
461 |
interactive=True,
|
462 |
label="AIR-Bench Version")
|
463 |
with gr.Row():
|
src/display/formatting.py
CHANGED
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
|
|
4 |
|
5 |
def make_clickable_model(model_name: str, model_link: str):
|
6 |
# link = f"https://huggingface.co/{model_name}"
|
7 |
-
if not model_link or not model_link.startswith("https://")
|
8 |
return model_name
|
9 |
return model_hyperlink(model_link, model_name)
|
10 |
|
|
|
4 |
|
5 |
def make_clickable_model(model_name: str, model_link: str):
|
6 |
# link = f"https://huggingface.co/{model_name}"
|
7 |
+
if not model_link or not model_link.startswith("https://"):
|
8 |
return model_name
|
9 |
return model_hyperlink(model_link, model_name)
|
10 |
|
src/display/gradio_formatting.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
|
4 |
def get_version_dropdown():
|
5 |
return gr.Dropdown(
|
6 |
-
choices=
|
7 |
-
value=
|
8 |
label="Select the version of AIR-Bench",
|
9 |
interactive=True
|
10 |
)
|
|
|
1 |
import gradio as gr
|
2 |
+
from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
|
3 |
|
4 |
def get_version_dropdown():
|
5 |
return gr.Dropdown(
|
6 |
+
choices=BENCHMARK_VERSION_LIST,
|
7 |
+
value=LATEST_BENCHMARK_VERSION,
|
8 |
label="Select the version of AIR-Bench",
|
9 |
interactive=True
|
10 |
)
|
src/envs.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import os
|
2 |
-
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
# Info to change for your repository
|
@@ -22,3 +22,12 @@ CACHE_PATH = os.getenv("HF_HOME", ".")
|
|
22 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval_results")
|
23 |
|
24 |
API = HfApi(token=TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
from display.formatting import model_hyperlink
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
# Info to change for your repository
|
|
|
22 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval_results")
|
23 |
|
24 |
API = HfApi(token=TOKEN)
|
25 |
+
|
26 |
+
BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
|
27 |
+
|
28 |
+
BENCHMARK_VERSION_LIST = [
|
29 |
+
"AIR-Bench_24.04",
|
30 |
+
# "AIR-Bench_24.05",
|
31 |
+
]
|
32 |
+
|
33 |
+
LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
|
src/read_evals.py
CHANGED
@@ -174,8 +174,8 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
|
|
174 |
print(f"loading file failed. {model_result_filepath}")
|
175 |
continue
|
176 |
print(f'file loaded: {model_result_filepath}')
|
177 |
-
|
178 |
-
eval_results[
|
179 |
|
180 |
results = []
|
181 |
for k, v in eval_results.items():
|
|
|
174 |
print(f"loading file failed. {model_result_filepath}")
|
175 |
continue
|
176 |
print(f'file loaded: {model_result_filepath}')
|
177 |
+
timestamp = eval_result.timestamp
|
178 |
+
eval_results[timestamp] = eval_result
|
179 |
|
180 |
results = []
|
181 |
for k, v in eval_results.items():
|
src/utils.py
CHANGED
@@ -10,7 +10,7 @@ from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, Benchmark
|
|
10 |
from src.display.formatting import styled_message, styled_error
|
11 |
from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
|
12 |
COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
|
13 |
-
from src.envs import API, SEARCH_RESULTS_REPO
|
14 |
from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
|
15 |
|
16 |
import re
|
@@ -251,7 +251,7 @@ def submit_results(
|
|
251 |
model_url: str,
|
252 |
reranking_model: str="",
|
253 |
reranking_model_url: str="",
|
254 |
-
version: str=
|
255 |
is_anonymous=False):
|
256 |
if not filepath.endswith(".zip"):
|
257 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|
|
|
10 |
from src.display.formatting import styled_message, styled_error
|
11 |
from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
|
12 |
COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
|
13 |
+
from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
|
14 |
from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
|
15 |
|
16 |
import re
|
|
|
251 |
model_url: str,
|
252 |
reranking_model: str="",
|
253 |
reranking_model_url: str="",
|
254 |
+
version: str=LATEST_BENCHMARK_VERSION,
|
255 |
is_anonymous=False):
|
256 |
if not filepath.endswith(".zip"):
|
257 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|