Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
9c49811
·
1 Parent(s): 3d59d51

feat: adapt the utils in app.py

Browse files
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import gradio as gr
2
- import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
@@ -10,18 +9,15 @@ from src.about import (
10
  )
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import (
13
- BENCHMARK_COLS,
14
  COLS,
15
- EVAL_COLS,
16
- NUMERIC_INTERVALS,
17
  TYPES,
18
- AutoEvalColumn,
19
- ModelType,
20
- fields,
21
- Precision
22
  )
23
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
24
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
25
 
26
 
27
  def restart_space():
@@ -45,9 +41,9 @@ try:
45
  except Exception:
46
  restart_space()
47
 
48
- raw_data, original_df = get_leaderboard_df(
49
- EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
50
- leaderboard_df = original_df.copy()
51
 
52
  # (
53
  # finished_eval_queue_df,
@@ -56,78 +52,6 @@ leaderboard_df = original_df.copy()
56
  # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
57
 
58
 
59
- # Searching and filtering
60
- def update_table(
61
- hidden_df: pd.DataFrame,
62
- columns: list,
63
- type_query: list,
64
- precision_query: str,
65
- size_query: list,
66
- show_deleted: bool,
67
- query: str,
68
- ):
69
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
70
- filtered_df = filter_queries(query, filtered_df)
71
- df = select_columns(filtered_df, columns)
72
- return df
73
-
74
-
75
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
76
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
77
-
78
-
79
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
80
- always_here_cols = [
81
- AutoEvalColumn.model_type_symbol.name,
82
- AutoEvalColumn.model.name,
83
- ]
84
- # We use COLS to maintain sorting
85
- filtered_df = df[
86
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
87
- ]
88
- return filtered_df
89
-
90
-
91
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
92
- final_df = []
93
- if query != "":
94
- queries = [q.strip() for q in query.split(";")]
95
- for _q in queries:
96
- _q = _q.strip()
97
- if _q != "":
98
- temp_filtered_df = search_table(filtered_df, _q)
99
- if len(temp_filtered_df) > 0:
100
- final_df.append(temp_filtered_df)
101
- if len(final_df) > 0:
102
- filtered_df = pd.concat(final_df)
103
- filtered_df = filtered_df.drop_duplicates(
104
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
105
- )
106
-
107
- return filtered_df
108
-
109
-
110
- def filter_models(
111
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
112
- ) -> pd.DataFrame:
113
- # Show all models
114
- if show_deleted:
115
- filtered_df = df
116
- else: # Show only still on the hub models
117
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
118
-
119
- type_emoji = [t[0] for t in type_query]
120
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
121
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
122
-
123
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
124
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
125
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
126
- filtered_df = filtered_df.loc[mask]
127
-
128
- return filtered_df
129
-
130
-
131
  demo = gr.Blocks(css=custom_css)
132
  with demo:
133
  gr.HTML(TITLE)
@@ -147,12 +71,12 @@ with demo:
147
  shown_columns = gr.CheckboxGroup(
148
  choices=[
149
  c.name
150
- for c in fields(AutoEvalColumn)
151
  if not c.hidden and not c.never_hidden
152
  ],
153
  value=[
154
  c.name
155
- for c in fields(AutoEvalColumn)
156
  if c.displayed_by_default and not c.hidden and not c.never_hidden
157
  ],
158
  label="Select columns to show",
@@ -189,10 +113,10 @@ with demo:
189
 
190
  leaderboard_table = gr.components.Dataframe(
191
  value=leaderboard_df[
192
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
193
  + shown_columns.value
194
  ],
195
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
196
  datatype=TYPES,
197
  elem_id="leaderboard-table",
198
  interactive=False,
@@ -201,7 +125,7 @@ with demo:
201
 
202
  # Dummy leaderboard for handling the case when the user uses backspace key
203
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
204
- value=original_df[COLS],
205
  headers=COLS,
206
  datatype=TYPES,
207
  visible=False,
 
1
  import gradio as gr
 
2
  from apscheduler.schedulers.background import BackgroundScheduler
3
  from huggingface_hub import snapshot_download
4
 
 
9
  )
10
  from src.display.css_html_js import custom_css
11
  from src.display.utils import (
12
+ QA_BENCHMARK_COLS,
13
  COLS,
 
 
14
  TYPES,
15
+ AutoEvalColumnQA,
16
+ fields
 
 
17
  )
18
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
19
+ from src.populate import get_leaderboard_df
20
+ from utils import update_table
21
 
22
 
23
  def restart_space():
 
41
  except Exception:
42
  restart_space()
43
 
44
+ raw_data_qa, original_df_qa = get_leaderboard_df(
45
+ EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_1')
46
+ leaderboard_df = original_df_qa.copy()
47
 
48
  # (
49
  # finished_eval_queue_df,
 
52
  # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  demo = gr.Blocks(css=custom_css)
56
  with demo:
57
  gr.HTML(TITLE)
 
71
  shown_columns = gr.CheckboxGroup(
72
  choices=[
73
  c.name
74
+ for c in fields(AutoEvalColumnQA)
75
  if not c.hidden and not c.never_hidden
76
  ],
77
  value=[
78
  c.name
79
+ for c in fields(AutoEvalColumnQA)
80
  if c.displayed_by_default and not c.hidden and not c.never_hidden
81
  ],
82
  label="Select columns to show",
 
113
 
114
  leaderboard_table = gr.components.Dataframe(
115
  value=leaderboard_df[
116
+ [c.name for c in fields(AutoEvalColumnQA) if c.never_hidden]
117
  + shown_columns.value
118
  ],
119
+ headers=[c.name for c in fields(AutoEvalColumnQA) if c.never_hidden] + shown_columns.value,
120
  datatype=TYPES,
121
  elem_id="leaderboard-table",
122
  interactive=False,
 
125
 
126
  # Dummy leaderboard for handling the case when the user uses backspace key
127
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
128
+ value=original_df_qa[COLS],
129
  headers=COLS,
130
  datatype=TYPES,
131
  visible=False,
src/benchmarks.py CHANGED
@@ -106,9 +106,12 @@ metric_list = [
106
 
107
  @dataclass
108
  class Benchmark:
109
- name: str # [task]_[domain]_[language]_[metric], task_key in the json file,
110
  metric: str # ndcg_at_1 ,metric_key in the json file
111
  col_name: str # [domain]_[language], name to display in the leaderboard
 
 
 
112
 
113
  qa_benchmark_dict = {}
114
  long_doc_benchmark_dict = {}
@@ -116,18 +119,20 @@ for task, domain_dict in dataset_dict.items():
116
  for domain, lang_dict in domain_dict.items():
117
  for lang, dataset_list in lang_dict.items():
118
  if task == "qa":
119
- benchmark_name = f"{task}_{domain}_{lang}"
120
  benchmark_name = get_safe_name(benchmark_name)
121
  col_name = f"{domain}_{lang}"
122
  for metric in dataset_list:
123
- qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
124
  elif task == "long_doc":
125
  for dataset in dataset_list:
126
  col_name = f"{domain}_{lang}_{dataset}"
127
  for metric in metric_list:
128
- benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
129
  benchmark_name = get_safe_name(benchmark_name)
130
- long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
131
 
132
  BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
133
  BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
 
 
 
106
 
107
  @dataclass
108
  class Benchmark:
109
+ name: str # safe name built from [domain]_[language] (qa) or [domain]_[language]_[dataset]_[metric] (long_doc); task_key in the json file
110
  metric: str # ndcg_at_1 ,metric_key in the json file
111
  col_name: str # [domain]_[language], name to display in the leaderboard
112
+ domain: str
113
+ lang: str
114
+ task: str
115
 
116
  qa_benchmark_dict = {}
117
  long_doc_benchmark_dict = {}
 
119
  for domain, lang_dict in domain_dict.items():
120
  for lang, dataset_list in lang_dict.items():
121
  if task == "qa":
122
+ benchmark_name = f"{domain}_{lang}"
123
  benchmark_name = get_safe_name(benchmark_name)
124
  col_name = f"{domain}_{lang}"
125
  for metric in dataset_list:
126
+ qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
127
  elif task == "long_doc":
128
  for dataset in dataset_list:
129
  col_name = f"{domain}_{lang}_{dataset}"
130
  for metric in metric_list:
131
+ benchmark_name = f"{domain}_{lang}_{dataset}_{metric}"
132
  benchmark_name = get_safe_name(benchmark_name)
133
+ long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
134
 
135
  BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
136
  BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
137
+
138
+ BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
src/populate.py CHANGED
@@ -9,16 +9,17 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
9
  from typing import Tuple
10
 
11
 
12
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
  all_data_json = []
16
  for v in raw_data:
17
- all_data_json += v.to_dict()
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
- df["Average ⬆️"] = df[benchmark_cols].mean(axis=1)
21
- # df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
 
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
 
9
  from typing import Tuple
10
 
11
 
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
  all_data_json = []
16
  for v in raw_data:
17
+ all_data_json += v.to_dict(task=task, metric=metric)
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
+ df[AutoEvalColumnQA.average.name] = df[benchmark_cols].mean(axis=1)
21
+ df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
22
+ df.reset_index(inplace=True)
23
  df = df[cols].round(decimals=2)
24
 
25
  # filter out if any of the benchmarks have not been produced
tests/src/test_populate.py CHANGED
@@ -11,10 +11,13 @@ def test_get_leaderboard_df():
11
  benchmark_cols = ['wiki_en', 'wiki_zh',]
12
  raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
13
  assert df.shape[0] == 2
14
- assert df["Retrieval Model"][0] == "bge-m3"
15
- assert df["Retrieval Model"][1] == "bge-m3"
16
- assert df["Reranking Model"][0] == "NoReranker"
17
- assert df["Reranking Model"][1] == "bge-reranker-v2-m3"
 
 
 
18
  assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
19
 
20
 
 
11
  benchmark_cols = ['wiki_en', 'wiki_zh',]
12
  raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
13
  assert df.shape[0] == 2
14
+ # the results contain only one embedding model
15
+ for i in range(2):
16
+ assert df["Retrieval Model"][i] == "bge-m3"
17
+ # the results contain only two reranking models
18
+ assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
19
+ assert df["Reranking Model"][1] == "NoReranker"
20
+ assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
21
  assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
22
 
23
 
tests/test_utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pytest
3
+
4
+ from utils import filter_models, search_table, filter_queries, select_columns
5
+
6
+
7
+ @pytest.fixture
8
+ def toy_df():
9
+ return pd.DataFrame(
10
+ {
11
+ "Retrieval Model": [
12
+ "bge-m3",
13
+ "bge-m3",
14
+ "jina-embeddings-v2-base",
15
+ "jina-embeddings-v2-base"
16
+ ],
17
+ "Reranking Model": [
18
+ "bge-reranker-v2-m3",
19
+ "NoReranker",
20
+ "bge-reranker-v2-m3",
21
+ "NoReranker"
22
+ ],
23
+ "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
24
+ "wiki_en": [0.8, 0.7, 0.2, 0.1],
25
+ "wiki_zh": [0.4, 0.1, 0.4, 0.3],
26
+ "news_en": [0.8, 0.7, 0.2, 0.1],
27
+ "news_zh": [0.4, 0.1, 0.4, 0.3],
28
+ }
29
+ )
30
+
31
+
32
+ def test_filter_models(toy_df):
33
+ df_result = filter_models(toy_df, ["bge-reranker-v2-m3", ])
34
+ assert len(df_result) == 2
35
+ assert df_result.iloc[0]["Reranking Model"] == "bge-reranker-v2-m3"
36
+
37
+
38
+ def test_search_table(toy_df):
39
+ df_result = search_table(toy_df, "jina")
40
+ assert len(df_result) == 2
41
+ assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
42
+
43
+
44
+ def test_filter_queries(toy_df):
45
+ df_result = filter_queries("jina", toy_df)
46
+ assert len(df_result) == 2
47
+ assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
48
+
49
+
50
+ def test_select_columns(toy_df):
51
+ df_result = select_columns(toy_df, ['news',], ['zh',])
52
+ assert len(df_result.columns) == 4
53
+ assert df_result['Average ⬆️'].equals(df_result['news_zh'])
tests/toydata/test_results/bge-m3/NoReranker/results_demo_2023-12-21T18-10-08.json CHANGED
@@ -27,7 +27,7 @@
27
  "domain": "wiki",
28
  "lang": "en",
29
  "dataset": "unknown",
30
- "value": 0.69083
31
  }
32
  ]
33
  },
 
27
  "domain": "wiki",
28
  "lang": "en",
29
  "dataset": "unknown",
30
+ "value": 0.39083
31
  }
32
  ]
33
  },
utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from src.display.utils import AutoEvalColumnQA, COLS
4
+ from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
5
+
6
+
7
+ def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
8
+ return df.loc[df["Reranking Model"].isin(reranking_query)]
9
+
10
+
11
+ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
12
+ final_df = []
13
+ if query != "":
14
+ queries = [q.strip() for q in query.split(";")]
15
+ for _q in queries:
16
+ _q = _q.strip()
17
+ if _q != "":
18
+ temp_filtered_df = search_table(filtered_df, _q)
19
+ if len(temp_filtered_df) > 0:
20
+ final_df.append(temp_filtered_df)
21
+ if len(final_df) > 0:
22
+ filtered_df = pd.concat(final_df)
23
+ filtered_df = filtered_df.drop_duplicates(
24
+ subset=[
25
+ AutoEvalColumnQA.retrieval_model.name,
26
+ AutoEvalColumnQA.reranking_model.name,
27
+ ]
28
+ )
29
+
30
+ return filtered_df
31
+
32
+
33
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
34
+ return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
35
+
36
+
37
+ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list) -> pd.DataFrame:
38
+ always_here_cols = [
39
+ AutoEvalColumnQA.retrieval_model.name,
40
+ AutoEvalColumnQA.reranking_model.name,
41
+ AutoEvalColumnQA.average.name
42
+ ]
43
+ selected_cols = []
44
+ for c in COLS:
45
+ if c not in df.columns:
46
+ continue
47
+ if c not in BENCHMARK_COLS_QA:
48
+ continue
49
+ eval_col = BenchmarksQA[c].value
50
+ if eval_col.domain not in domain_query:
51
+ continue
52
+ if eval_col.lang not in language_query:
53
+ continue
54
+ selected_cols.append(c)
55
+ # We use COLS to maintain sorting
56
+ filtered_df = df[always_here_cols + selected_cols]
57
+ filtered_df[AutoEvalColumnQA.average.name] = filtered_df[selected_cols].mean(axis=1)
58
+ return filtered_df
59
+
60
+
61
+ def update_table(
62
+ hidden_df: pd.DataFrame,
63
+ columns: list,
64
+ reranking_query: list,
65
+ query: str,
66
+ ):
67
+ filtered_df = filter_models(hidden_df, reranking_query)
68
+ filtered_df = filter_queries(query, filtered_df)
69
+ df = select_columns(filtered_df, columns)
70
+ return df