Commit 5808d8f (1 parent: 61eca2d), committed by nan

feat: add metric selector

Files changed (4):
  1. app.py +35 -5
  2. src/populate.py +8 -9
  3. tests/src/test_populate.py +3 -1
  4. utils.py +24 -1

app.py CHANGED

@@ -17,10 +17,12 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_leaderboard_df
-from utils import update_table
+from utils import update_table, update_metric
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list
 
 
+from functools import partial
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -41,11 +43,21 @@ def restart_space():
 # except Exception:
 #     restart_space()
 
-raw_data_qa, original_df_qa = get_leaderboard_df(
-    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
+from src.leaderboard.read_evals import get_raw_eval_results
+raw_data_qa = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+original_df_qa = get_leaderboard_df(raw_data_qa, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
 print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
 leaderboard_df = original_df_qa.copy()
 
+
+def update_metric_qa(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+):
+    return update_metric(raw_data_qa, metric, domains, langs, reranking_model, query)
 # (
 #     finished_eval_queue_df,
 #     running_eval_queue_df,
@@ -99,7 +111,7 @@ with demo:
             with gr.Column(min_width=320):
                 selected_metric = gr.Dropdown(
                     choices=metric_list,
-                    value=metric_list[0],
+                    value=metric_list[1],
                     label="Select the metric",
                     interactive=True,
                     elem_id="metric-select",
@@ -117,11 +129,13 @@ with demo:
 
         # Dummy leaderboard for handling the case when the user uses backspace key
         hidden_leaderboard_table_for_search = gr.components.Dataframe(
-            value=original_df_qa,
+            value=leaderboard_df,
            # headers=COLS,
            # datatype=TYPES,
            visible=False,
         )
+
+        # Set search_bar listener
        search_bar.submit(
            update_table,
            [
@@ -133,6 +147,8 @@ with demo:
            ],
            leaderboard_table,
        )
+
+        # Set column-wise listener
        for selector in [
            selected_domains, selected_langs, selected_rerankings
        ]:
@@ -149,6 +165,20 @@ with demo:
                queue=True,
            )
 
+        # set metric listener
+        selected_metric.change(
+            update_metric_qa,
+            [
+                selected_metric,
+                selected_domains,
+                selected_langs,
+                selected_rerankings,
+                search_bar,
+            ],
+            leaderboard_table,
+            queue=True
+        )
+
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
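
Note on the wiring above: the new selected_metric.change listener follows the usual Gradio pattern of feeding component values into a callback and rendering its return value into an output component. Below is a minimal, self-contained sketch of that pattern; the metric names and scores are made up for illustration and are not the leaderboard's real data or layout.

# Minimal sketch of the dropdown-driven refresh pattern (toy data only).
import gradio as gr
import pandas as pd

# Hypothetical per-metric tables standing in for the recomputed leaderboard.
SCORES = {
    "ndcg_at_1": pd.DataFrame({"Retrieval Model": ["model-a", "model-b"], "Average ⬆️": [0.41, 0.38]}),
    "ndcg_at_3": pd.DataFrame({"Retrieval Model": ["model-a", "model-b"], "Average ⬆️": [0.47, 0.45]}),
}

def refresh_table(metric: str) -> pd.DataFrame:
    # In the real app this role is played by update_metric_qa, which rebuilds
    # the dataframe for the selected metric; here we just look up a toy table.
    return SCORES[metric]

with gr.Blocks() as demo:
    metric_dropdown = gr.Dropdown(choices=list(SCORES), value="ndcg_at_3", label="Select the metric")
    table = gr.Dataframe(value=SCORES["ndcg_at_3"])
    # Dropdown value in, refreshed dataframe out: same shape as selected_metric.change(...) above.
    metric_dropdown.change(refresh_table, [metric_dropdown], table, queue=True)

if __name__ == "__main__":
    demo.launch()

In the commit itself, the callback additionally reapplies the current domain, language, reranker, and search-bar filters before returning the table.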
 
src/populate.py CHANGED

@@ -5,31 +5,30 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
-from typing import Tuple
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, FullEvalResult
+from typing import Tuple, List
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
+def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_cols: list, task: str, metric: str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    print(f"raw_data loaded: {len(raw_data)}")
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
-
-    print(f'records loaded: {len(all_data_json)}')
     df = pd.DataFrame.from_records(all_data_json)
     print(f'dataframe created: {df.shape}')
+
+    # calculate the average score for selected benchmarks
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
-    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
+    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df.reset_index(inplace=True)
+
     _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
     df = df[_cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, _benchmark_cols)]
-    return raw_data, df
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
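
For context, the reworked get_leaderboard_df averages only the benchmark columns actually present in the dataframe, rounds the average, sorts by it, and drops rows with missing benchmark results. A small standalone illustration of those steps with toy column names (placeholders, not the benchmark's real schema):

# Toy illustration of the average/sort/filter steps; column names are placeholders.
import pandas as pd

df = pd.DataFrame({
    "Retrieval Model": ["model-a", "model-b"],
    "wiki_en": [0.512, 0.473],
    "wiki_zh": [0.430, None],          # model-b is missing this benchmark
})
benchmark_cols = ["wiki_en", "wiki_zh", "news_en"]  # "news_en" is not in df

# Only average over benchmark columns that exist in the dataframe.
present = list(frozenset(benchmark_cols).intersection(df.columns))
df["Average ⬆️"] = df[present].mean(axis=1).round(decimals=2)
df = df.sort_values(by="Average ⬆️", ascending=False)

# Drop rows missing any selected benchmark (cf. has_no_nan_values).
df = df[df[present].notna().all(axis=1)]
print(df)
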
tests/src/test_populate.py CHANGED

@@ -1,4 +1,5 @@
 from src.populate import get_leaderboard_df
+from src.leaderboard.read_evals import get_raw_eval_results
 from pathlib import Path
 
 cur_fp = Path(__file__)
@@ -9,7 +10,8 @@ def test_get_leaderboard_df():
     results_path = cur_fp.parents[1] / "toydata" / "test_results"
     cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
     benchmark_cols = ['wiki_en', 'wiki_zh',]
-    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols, 'qa', 'ndcg_at_1')
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')
     assert df.shape[0] == 2
     # the results contain only one embedding model
     for i in range(2):

utils.py CHANGED

@@ -2,6 +2,10 @@ import pandas as pd
 
 from src.display.utils import AutoEvalColumnQA, COLS
 from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
+from src.leaderboard.read_evals import FullEvalResult
+from typing import List
+from src.populate import get_leaderboard_df
+from src.display.utils import COLS, QA_BENCHMARK_COLS
 
 
 def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
@@ -68,4 +72,23 @@ def update_table(
     filtered_df = filter_models(hidden_df, reranking_query)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, domains, langs)
-    return df
+    return df
+
+
+def update_metric(
+    raw_data: List[FullEvalResult],
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+) -> pd.DataFrame:
+    leaderboard_df = get_leaderboard_df(raw_data, COLS, QA_BENCHMARK_COLS, task='qa', metric=metric)
+    hidden_df = leaderboard_df
+    return update_table(
+        hidden_df,
+        domains,
+        langs,
+        reranking_model,
+        query
+    )
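
The new update_metric simply rebuilds the per-metric dataframe and then reuses update_table's existing filter pipeline. A rough, self-contained sketch of that filter-then-select flow on toy data follows; the helper bodies, model names, and column names here are simplified stand-ins, not the module's actual implementations.

# Simplified stand-ins for filter_models / filter_queries / select_columns; toy data only.
import pandas as pd

df = pd.DataFrame({
    "Retrieval Model": ["bge-m3", "e5-large"],
    "Reranking Model": ["bge-reranker", "NoReranker"],
    "wiki_en": [0.51, 0.47],
    "wiki_zh": [0.43, 0.40],
    "news_en": [0.39, 0.36],
})

def filter_models(df: pd.DataFrame, rerankers: list) -> pd.DataFrame:
    # Keep only rows whose reranker was selected in the UI.
    return df[df["Reranking Model"].isin(rerankers)]

def filter_queries(query: str, df: pd.DataFrame) -> pd.DataFrame:
    # Keep only rows whose retrieval model matches the search box text.
    return df[df["Retrieval Model"].str.contains(query, case=False)] if query else df

def select_columns(df: pd.DataFrame, domains: list, langs: list) -> pd.DataFrame:
    # Keep the model columns plus the selected domain/language benchmarks.
    benchmarks = [f"{d}_{l}" for d in domains for l in langs if f"{d}_{l}" in df.columns]
    return df[["Retrieval Model", "Reranking Model"] + benchmarks]

filtered = filter_models(df, ["bge-reranker", "NoReranker"])
filtered = filter_queries("bge", filtered)
print(select_columns(filtered, ["wiki", "news"], ["en"]))

In the actual module, the dataframe handed to update_table is the one get_leaderboard_df just rebuilt for the newly selected metric, so every metric switch re-runs the same filters the other selectors use.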