Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
93420d3
1 Parent(s): b80bda9

feat: use recall as the default metric

Browse files
src/benchmarks.py CHANGED
@@ -148,4 +148,4 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
148
  DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
149
  LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
150
 
151
- DEFAULT_METRIC = "ndcg_at_10"
 
148
  DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
149
  LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
150
 
151
+ DEFAULT_METRIC = "recall_at_10"
src/display/gradio_formatting.py CHANGED
@@ -14,7 +14,6 @@ def get_search_bar():
14
  return gr.Textbox(
15
  placeholder=" 🔍 Search for retrieval methods (separate multiple queries with `;`) and press ENTER...",
16
  show_label=False,
17
- # elem_id="search-bar",
18
  info="Search the retrieval methods"
19
  )
20
 
@@ -23,19 +22,21 @@ def get_reranking_dropdown(model_list):
23
  return gr.Dropdown(
24
  choices=model_list,
25
  label="Select the reranking models",
26
- # elem_id="reranking-select",
27
  interactive=True,
28
  multiselect=True
29
  )
30
 
 
31
  def get_noreranking_dropdown():
32
  return gr.Dropdown(
33
- choices=["NoReranker",],
34
- value=["NoReranker",],
35
  interactive=False,
36
  multiselect=True,
37
  visible=False
38
  )
 
 
39
  def get_noreranker_button():
40
  return gr.Button(
41
  value="Only show results without ranking models",
@@ -48,7 +49,7 @@ def get_metric_dropdown(metric_list, default_metrics):
48
  value=default_metrics,
49
  label="Select the metric",
50
  interactive=True,
51
- # elem_id="metric-select-long-doc",
52
  )
53
 
54
 
@@ -57,7 +58,6 @@ def get_domain_dropdown(domain_list, default_domains):
57
  choices=domain_list,
58
  value=default_domains,
59
  label="Select the domains",
60
- # elem_id="domain-column-select",
61
  interactive=True,
62
  )
63
 
@@ -67,7 +67,6 @@ def get_language_dropdown(language_list, default_languages):
67
  choices=language_list,
68
  value=language_list,
69
  label="Select the languages",
70
- # elem_id="language-column-select",
71
  multiselect=True,
72
  interactive=True
73
  )
@@ -91,9 +90,9 @@ def get_revision_and_ts_checkbox():
91
 
92
  def get_leaderboard_table(df, datatype, visible=True):
93
  return gr.components.Dataframe(
94
- value=df,
95
- datatype=datatype,
96
- elem_id="leaderboard-table",
97
- interactive=False,
98
- visible=visible,
99
- )
 
14
  return gr.Textbox(
15
  placeholder=" 🔍 Search for retrieval methods (separate multiple queries with `;`) and press ENTER...",
16
  show_label=False,
 
17
  info="Search the retrieval methods"
18
  )
19
 
 
22
  return gr.Dropdown(
23
  choices=model_list,
24
  label="Select the reranking models",
 
25
  interactive=True,
26
  multiselect=True
27
  )
28
 
29
+
30
  def get_noreranking_dropdown():
31
  return gr.Dropdown(
32
+ choices=["NoReranker", ],
33
+ value=["NoReranker", ],
34
  interactive=False,
35
  multiselect=True,
36
  visible=False
37
  )
38
+
39
+
40
  def get_noreranker_button():
41
  return gr.Button(
42
  value="Only show results without ranking models",
 
49
  value=default_metrics,
50
  label="Select the metric",
51
  interactive=True,
52
+ info="Assuming that LLMs could generate correct answers when the correct context is retrieved, we recommend to use recall_at_k."
53
  )
54
 
55
 
 
58
  choices=domain_list,
59
  value=default_domains,
60
  label="Select the domains",
 
61
  interactive=True,
62
  )
63
 
 
67
  choices=language_list,
68
  value=language_list,
69
  label="Select the languages",
 
70
  multiselect=True,
71
  interactive=True
72
  )
 
90
 
91
  def get_leaderboard_table(df, datatype, visible=True):
92
  return gr.components.Dataframe(
93
+ value=df,
94
+ datatype=datatype,
95
+ elem_id="leaderboard-table",
96
+ interactive=False,
97
+ visible=visible,
98
+ )