Orion Weller committed on
Commit
bf8e6b0
1 Parent(s): 68f913d
.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ local_datasets/ filter=lfs diff=lfs merge=lfs -text
+ local_datasets/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ datasets/
+ __pycache__/
+ env/
README.md CHANGED
File without changes
analysis.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import numpy as np
+ import plotly.express as px
+ import plotly.figure_factory as ff
+
+
+ def results_to_df(results: dict, metric_name: str):
+     metric_scores = []
+     for topic, results_dict in results.items():
+         for metric_name_cur, metric_value in results_dict.items():
+             if metric_name == metric_name_cur:
+                 metric_scores.append(metric_value)
+     return pd.DataFrame({metric_name: metric_scores})
+
+
+ def create_boxplot_1df(results: dict, metric_name: str):
+     df = results_to_df(results, metric_name)
+     fig = px.box(df, y=metric_name)
+     return fig
+
+
+ def create_boxplot_2df(results1, results2, metric_name):
+     df1 = results_to_df(results1, metric_name)
+     df2 = results_to_df(results2, metric_name)
+     df2["Run"] = "Run 2"
+     df1["Run"] = "Run 1"
+     df = pd.concat([df1, df2])
+
+     # Create distplot with custom bin_size
+     fig = px.histogram(df, x=metric_name, color="Run", marginal="box", hover_data=df.columns)
+     return fig
+
+
+ def create_boxplot_diff(results1, results2, metric_name):
+     df1 = results_to_df(results1, metric_name)
+     df2 = results_to_df(results2, metric_name)
+     diff = df1[metric_name] - df2[metric_name]
+
+     x_axis = f"Difference in {metric_name} from 1 to 2"
+     fig = px.histogram(pd.DataFrame({x_axis: diff}), x=x_axis, marginal="box")
+     return fig
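The helpers above expect pytrec_eval-style per-query results, i.e. a dict mapping query id to a dict of metric name to score. A minimal usage sketch, with invented query ids and scores used purely for illustration:

from analysis import create_boxplot_1df, create_boxplot_2df, create_boxplot_diff

# Toy per-query results in the shape pytrec_eval's RelevanceEvaluator.evaluate() returns
results1 = {"q1": {"recall_10": 1.0}, "q2": {"recall_10": 0.0}}
results2 = {"q1": {"recall_10": 0.5}, "q2": {"recall_10": 1.0}}

fig1 = create_boxplot_1df(results1, "recall_10")             # one run: box plot of the metric
fig2 = create_boxplot_2df(results1, results2, "recall_10")   # two runs: overlaid histograms with marginal boxes
fig3 = create_boxplot_diff(results1, results2, "recall_10")  # per-query difference, run 1 minus run 2
fig1.show()  # or pass the figures to st.plotly_chart() inside the app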
app.py CHANGED
@@ -9,238 +9,507 @@ import pandas as pd
  from collections import defaultdict
  import json
  import copy

- def load_jsonl(f):
-     did2text = defaultdict(list)
-     sub_did2text = {}
-
-     for idx, line in enumerate(f):
-         inst = json.loads(line)
-         if "question" in inst:
-             docid = inst["metadata"][0]["passage_id"] if "doc_id" not in inst else inst["doc_id"]
-             did2text[docid].append(inst["question"])
-         elif "text" in inst:
-             docid = inst["doc_id"] if "doc_id" in inst else inst["did"]
-             did2text[docid].append(inst["text"])
-             sub_did2text[inst["did"]] = inst["text"]
-         elif "query" in inst:
-             docid = inst["doc_id"] if "doc_id" in inst else inst["did"]
-             did2text[docid].append(inst["query"])
-         else:
-             breakpoint()
-             raise NotImplementedError("Need to handle this case")
-
-     return did2text, sub_did2text


- def get_beir(dataset_name: str):
-     dataset = "scifact"
-     url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
-     out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
-     data_path = util.download_and_unzip(url, out_dir)
-     return GenericDataLoader(data_folder=data_path).load(split="test")

- def load_run(f_run):
-     run = pytrec_eval.parse_run(copy.deepcopy(f_run))
-     # convert bytes to strings for keys
-     new_run = defaultdict(dict)
-     for key, sub_dict in run.items():
-         new_run[key.decode("utf-8")] = {k.decode("utf-8"): v for k, v in sub_dict.items()}

-     run_pandas = pd.read_csv(f_run, header=None, index_col=None, sep="\t")
-     run_pandas.columns = ["qid", "generic", "doc_id", "rank", "score", "model"]
-     run_pandas.doc_id = run_pandas.doc_id.astype(str)
-     run_pandas.qid = run_pandas.qid.astype(str)
-     run_pandas["rank"] = run_pandas["rank"].astype(int)
-     run_pandas.score = run_pandas.score.astype(float)
-     # if run_1_alt is not None:
-     #     run_1_alt, run_1_alt_sub = load_jsonl(run_1_alt)
-     return new_run, run_pandas


  with st.sidebar:
-     dataset_name = st.selectbox("Select a dataset in BEIR", ("scifact", "trec-covid", "fever"))
-     metric_name = st.selectbox("Select a metric", ("recall_5", "recall_10"))
      # sliderbar of how many Top N to choose
      top_n = st.slider("Top N", 1, 100, 3)
      x = st.header('Upload a run file')
      run1_file = st.file_uploader("Choose a file", key="run1")
      y = st.header("Upload a second run file")
      run2_file = st.file_uploader("Choose a file", key="run2")
      incorrect_only = st.checkbox("Show only incorrect instances", value=False)
      one_better_than_two = st.checkbox("Show only instances where run 1 is better than run 2", value=False)
      two_better_than_one = st.checkbox("Show only instances where run 2 is better than run 1", value=False)


- corpus, queries, qrels = get_beir(dataset_name)

- evaluator = pytrec_eval.RelevanceEvaluator(
-     qrels, pytrec_eval.supported_measures)

- if run1_file is not None:
-     run1, run1_pandas = load_run(run1_file)
-     results1 = evaluator.evaluate(run1)  # dict of instance then metrics then values

- if run2_file is not None:
-     run2, run2_pandas = load_run(run2_file)
-     results2 = evaluator.evaluate(run2)
-
-
- col1, col2 = st.columns([1, 2], gap="medium")
-
- incorrect = 0
- is_better_run1_count = 0
- is_better_run2_count = 0
- with col1:
-     st.title("Instances")
-     if run1_file is not None:
-         name_of_columns = ["Overview"] + sorted([int(item) for item in set(run1_pandas.qid.tolist())])
-         checkboxes = [("Overview", st.checkbox("Overview", key=f"0overview"))]
-         st.divider()
-         for idx in range(len(name_of_columns)):
-             if not idx:
-                 continue
-             is_incorrect = False
-             is_better_run1 = False
-             is_better_run2 = False
-
-             run1_score = results1[str(name_of_columns[idx])][metric_name] if idx else 1
-             if run2_file is not None:
-                 run2_score = results2[str(name_of_columns[idx])][metric_name] if idx else 1
-
-                 if idx and run1_score == 0 or run2_score == 0:
-                     incorrect += 1
-                     is_incorrect = True
-
-                 if idx and run1_score > run2_score:
-                     is_better_run1_count += 1
-                     is_better_run1 = True
-                 elif idx and run2_score > run1_score:
-                     is_better_run2_count += 1
-                     is_better_run2 = True
-
-                 if not incorrect_only or is_incorrect:
-                     if not one_better_than_two or is_better_run1:
-                         if not two_better_than_one or is_better_run2:
-                             check = st.checkbox(str(name_of_columns[idx]), key=f"{idx}check")
-                             st.divider()
-                             checkboxes.append((name_of_columns[idx], check))
-             else:
-                 if idx and run1_score == 0:
-                     incorrect += 1
-                     is_incorrect = True

-                 if not incorrect_only or is_incorrect:
-                     check = st.checkbox(str(name_of_columns[idx]), key=f"{idx}check")
-                     st.divider()
-                     checkboxes.append((name_of_columns[idx], check))
-
-
- with col2:
-     st.title(f"Information ({len(checkboxes) - 1}/{len(name_of_columns) - 1})")
-     ### Only one run file
-     if run1_file is not None and run2_file is None:
-         for check_idx, (inst_num, checkbox) in enumerate(checkboxes):
-             if checkbox:
-                 if inst_num == "Overview":
-                     st.header("Overview")
-                     st.markdown("TODO: Add overview")
                  else:
-                     st.header(f"Instance Number: {inst_num}")

-                     st.subheader(f"Query")
-                     query_text = queries[str(inst_num)]
-                     st.markdown(query_text)
-                     st.divider()

-                     ## Documents
-                     # relevant
-                     relevant_docs = list(qrels[str(inst_num)].keys())
-                     doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]
-                     st.subheader("Relevant Documents")
-                     for (docid, title, text) in doc_texts:
-                         st.text_area(f"{docid}: {title}", text)

-                     # top ranked
-                     pred_doc = run1_pandas[run1_pandas.doc_id.isin(relevant_docs)]
-                     rank_pred = pred_doc[pred_doc.qid == str(inst_num)]["rank"].tolist()
-                     st.subheader("Ranked of Documents")
-                     st.markdown(f"Rank: {rank_pred}")

-                     st.divider()

-                     if st.checkbox('Show top ranked documents'):
-                         st.subheader("Top N Ranked Documents")
-                         run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)][:top_n]
-                         run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
                          for d_idx, doc in enumerate(run1_top_n_docs):
-                             st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: {doc['title']}", doc["text"])
-                         st.divider()


-                     st.subheader("Score")
-                     st.markdown(f"{results1[str(inst_num)][metric_name]}")
-                     break

-     ## Both run files available
-     elif run1_file is not None and run2_file is not None:
-         for check_idx, (inst_num, checkbox) in enumerate(checkboxes):
-             if checkbox:
-                 if inst_num == "Overview":
-                     st.header("Overview")
-                     st.markdown("TODO: Add overview")
-                 else:
-                     st.header(f"Instance Number: {inst_num}")

-                     st.subheader(f"Query")
-                     query_text = queries[str(inst_num)]
-                     st.markdown(query_text)
-                     st.divider()

-                     ## Documents
-                     # relevant
-                     relevant_docs = list(qrels[str(inst_num)].keys())
-                     doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]
-                     st.subheader("Relevant Documents")
-                     for (docid, title, text) in doc_texts:
-                         st.text_area(f"{docid}: {title}", text)

-                     # top ranked
-                     pred_doc1 = run1_pandas[run1_pandas.doc_id.isin(relevant_docs)]
-                     rank_pred1 = pred_doc1[pred_doc1.qid == str(inst_num)]["rank"].tolist()
-                     pred_doc2 = run2_pandas[run2_pandas.doc_id.isin(relevant_docs)]
-                     rank_pred2 = pred_doc2[pred_doc2.qid == str(inst_num)]["rank"].tolist()
-                     st.subheader("Ranked of Documents")
-                     st.markdown(f"Run 1 Rank: {rank_pred1}")
-                     st.markdown(f"Run 2 Rank: {rank_pred2}")


-                     st.divider()

-                     if st.checkbox('Show top ranked documents for Run 1'):
-                         st.subheader("Top N Ranked Documents")
-                         run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)][:top_n]
-                         run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
                          for d_idx, doc in enumerate(run1_top_n_docs):
-                             st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: {doc['title']}", doc["text"])
-
-                     if st.checkbox('Show top ranked documents for Run 2'):
-                         st.subheader("Top N Ranked Documents")
-                         run2_top_n = run2_pandas[run2_pandas.qid == str(inst_num)][:top_n]
-                         run2_top_n_docs = [corpus[str(doc_id)] for doc_id in run2_top_n.doc_id.tolist()]
                          for d_idx, doc in enumerate(run2_top_n_docs):
-                             st.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: {doc['title']}", doc["text"])

-                     st.divider()


-                     st.subheader("Scores")
-                     st.markdown(f"Run 1: {results1[str(inst_num)][metric_name]}")
-                     st.markdown(f"Run 2: {results2[str(inst_num)][metric_name]}")

-                     break
  from collections import defaultdict
  import json
  import copy
+ import plotly.express as px
+
+ from constants import ALL_DATASETS, ALL_METRICS
+ from dataset_loading import get_dataset, load_run, load_local_qrels, load_local_corpus, load_local_queries
+ from analysis import create_boxplot_1df, create_boxplot_2df, create_boxplot_diff
+
+
+ st.set_page_config(layout="wide")
+
+
+ if 'cur_instance_num' not in st.session_state:
+     st.session_state.cur_instance_num = -1


+ def update_details(run_details, run_score):
+     if run_score == 0:
+         run_details["none"] += 1
+     elif run_score == 1:
+         run_details["perfect"] += 1
+     else:
+         run_details["inbetween"] += 1
+     return run_details


+ def check_valid_args(run1_file, run2_file, dataset_name, qrels, queries, corpus):
+     if run1_file is not None and dataset_name not in ["", None, "custom"]:
+         return True
+     elif run1_file is not None and dataset_name == "custom":
+         if qrels is not None and queries is not None and corpus is not None:
+             return True
+     return False

+ def validate(config_option, file_loaded):
+     if config_option != "None" and file_loaded is None:
+         st.error("Please upload a file for " + config_option)
+         st.stop()


+ def combine(text_og, text_new, combine_type):
+     if combine_type == "None":
+         return text_og
+     elif combine_type == "Append":
+         return text_og + " <APPEND> " + text_new
+     elif combine_type == "Prepend":
+         return text_new + " <PREPEND> " + text_og
+     elif combine_type == "Replace":
+         return text_new
+     else:
+         raise ValueError("Invalid combine type")
+
  with st.sidebar:
+     st.title("Options")
+     dataset_name = st.selectbox("Select a preloaded dataset or upload your own", tuple(ALL_DATASETS))
+     metric_name = st.selectbox("Select a metric", tuple(ALL_METRICS))
+
+     if dataset_name == "custom":
+         st.header("Upload corpus")
+         corpus_file = st.file_uploader("Choose a file", key="corpus")
+         corpus = load_local_corpus(corpus_file)
+         st.header("Upload queries")
+         queries_file = st.file_uploader("Choose a file", key="queries")
+         queries = load_local_queries(queries_file)
+         st.header("Upload qrels")
+         qrels_file = st.file_uploader("Choose a file", key="qrels")
+         qrels = load_local_qrels(qrels_file)
+     else:
+         qrels = None
+         queries = None
+         corpus = None
+
      # sliderbar of how many Top N to choose
      top_n = st.slider("Top N", 1, 100, 3)
      x = st.header('Upload a run file')
      run1_file = st.file_uploader("Choose a file", key="run1")
      y = st.header("Upload a second run file")
      run2_file = st.file_uploader("Choose a file", key="run2")
+
+     z = st.header("Analysis Options")
      incorrect_only = st.checkbox("Show only incorrect instances", value=False)
      one_better_than_two = st.checkbox("Show only instances where run 1 is better than run 2", value=False)
      two_better_than_one = st.checkbox("Show only instances where run 2 is better than run 1", value=False)
+     advanced_options1 = st.checkbox("Show advanced options for Run 1", value=False)
+     doc_expansion1 = doc_expansion2 = None
+     query_expansion1 = query_expansion2 = None
+     run1_uses_query_expansion = "None"
+     run1_uses_doc_expansion = "None"
+     run2_uses_query_expansion = "None"
+     run2_uses_doc_expansion = "None"
+     if advanced_options1:
+         doc_header = st.header("Upload a Document Expansion file")
+         doc_expansion_file = st.file_uploader("Choose a file", key="doc_expansion")
+         if doc_expansion_file is not None:
+             doc_expansion1 = load_local_corpus(doc_expansion_file)
+         query_header = st.header("Upload a Query Expansion file")
+         query_expansion_file = st.file_uploader("Choose a file", key="query_expansion")
+         if query_expansion_file is not None:
+             query_expansion1 = load_local_queries(query_expansion_file)
+
+         run1_uses_query_expansion = st.selectbox("Type of query expansion used in run 1", ("None", "Append", "Prepend", "Replace"))
+         run1_uses_doc_expansion = st.selectbox("Type of document expansion used in run 1", ("None", "Append", "Prepend", "Replace"))
+         validate(run1_uses_query_expansion, query_expansion_file)
+         validate(run1_uses_doc_expansion, doc_expansion_file)
+
+     advanced_options2 = st.checkbox("Show advanced options for Run 2", value=False)
+     if advanced_options2:
+         doc_header = st.header("Upload a Document Expansion file")
+         doc_expansion_file = st.file_uploader("Choose a file", key="doc_expansion2")
+         if doc_expansion_file is not None:
+             doc_expansion2 = load_local_corpus(doc_expansion_file)
+         query_header = st.header("Upload a Query Expansion file")
+         query_expansion_file = st.file_uploader("Choose a file", key="query_expansion2")
+         if query_expansion_file is not None:
+             query_expansion2 = load_local_queries(query_expansion_file)
+
+         run2_uses_query_expansion = st.selectbox("Type of query expansion used in run 2", ("None", "Append", "Prepend", "Replace"))
+         run2_uses_doc_expansion = st.selectbox("Type of document expansion used in run 2", ("None", "Append", "Prepend", "Replace"))
+         validate(run2_uses_query_expansion, query_expansion_file)
+         validate(run2_uses_doc_expansion, doc_expansion_file)
+
+
+ # everything hinges on the run being uploaded, so do that first
+ # init_title = st.title("Upload Run and Choose Details")
+
+ if run1_file is not None:
+     run1, run1_pandas = load_run(run1_file)

+ # do everything, now that we have the run file
+ if check_valid_args(run1_file, run2_file, dataset_name, qrels, queries, corpus):
+     # init_title = st.title("Analysis")
+     # don't load these til a run is given
+     if dataset_name != "custom":
+         corpus, queries, qrels = get_dataset(dataset_name)

+     evaluator = pytrec_eval.RelevanceEvaluator(
+         copy.deepcopy(qrels), pytrec_eval.supported_measures)
+     results1 = evaluator.evaluate(run1)  # dict of instance then metrics then values
+     if len(results1) == 0:
+         # alert and stop
+         st.error("Run file is empty")
+         st.stop()
+
+     if run2_file is not None:
+         run2, run2_pandas = load_run(run2_file)
+         # NOTE: will fail if run1 is not uploaded
+         evaluator2 = pytrec_eval.RelevanceEvaluator(
+             copy.deepcopy(qrels), pytrec_eval.supported_measures)
+         results2 = evaluator2.evaluate(run2)
+
+     col1, col2 = st.columns([1, 3], gap="large")
+
+     # incorrect = 0
+     is_better_run1_count = 0
+     is_better_run2_count = 0
+     is_same_count = 0
+     run1_details = {"none": 0, "perfect": 0, "inbetween": 0}
+     run2_details = {"none": 0, "perfect": 0, "inbetween": 0}
+     with col1:
+         st.title("Instances")
+         if run1_file is not None:
+             set_of_cols = set(run1_pandas.qid.tolist())
+             container_for_nav = st.container()
+             name_of_columns = sorted([item for item in set_of_cols])
+             instances_to_use = []
+             # st.divider()
+             for idx in range(len(name_of_columns)):
+                 is_incorrect = False
+                 is_better_run1 = False
+                 is_better_run2 = False
+
+                 run1_score = results1[str(name_of_columns[idx])][metric_name] if idx else 1
+                 run1_details = update_details(run1_details, run1_score)
+                 if run2_file is not None:
+                     run2_score = results2[str(name_of_columns[idx])][metric_name] if idx else 1
+                     run2_details = update_details(run2_details, run2_score)
+
+                     if run1_score == 0 or run2_score == 0:
+                         is_incorrect = True
+
+                     if run1_score > run2_score:
+                         is_better_run1_count += 1
+                         is_better_run1 = True
+                     elif run2_score > run1_score:
+                         is_better_run2_count += 1
+                         is_better_run2 = True
+                     else:
+                         is_same_count += 1
+
+
+                     if not incorrect_only or is_incorrect:
+                         if not one_better_than_two or is_better_run1:
+                             if not two_better_than_one or is_better_run2:
+                                 # check = st.checkbox(f"{idx}. " + str(name_of_columns[idx]), key=f"{idx}check")
+                                 # st.divider()
+                                 instances_to_use.append(name_of_columns[idx])
+                 else:
+                     if run1_score == 0:
+                         is_incorrect = True
+
+                     if not incorrect_only or is_incorrect:
+                         # check = st.checkbox(f"{idx}. " + str(name_of_columns[idx]), key=f"{idx}check")
+                         # st.divider()
+                         instances_to_use.append(name_of_columns[idx])


+             def sync_from_drop():
+                 if st.session_state.selectbox_instance == "Overview":
+                     st.session_state.number_of_col = -1
+                     st.session_state.cur_instance_num = -1
+                 else:
+                     index_of_obj = name_of_columns.index(st.session_state.selectbox_instance)
+                     # print("Index of obj: ", index_of_obj, type(index_of_obj))
+                     st.session_state.number_of_col = index_of_obj
+                     st.session_state.cur_instance_num = index_of_obj
+
+             def sync_from_number():
+                 st.session_state.cur_instance_num = st.session_state.number_of_col
+                 # print("Session state number of col: ", st.session_state.number_of_col, type(st.session_state.number_of_col))
+                 if st.session_state.number_of_col == -1:
+                     st.session_state.selectbox_instance = "Overview"
+                 else:
+                     st.session_state.selectbox_instance = name_of_columns[st.session_state.number_of_col]
+
+
+             number_of_col = container_for_nav.number_input(min_value=-1, step=1, max_value=len(instances_to_use), on_change=sync_from_number, label=f"Select instance by index (out of **{len(instances_to_use)}**)", key="number_of_col")
+             selectbox_instance = container_for_nav.selectbox("Select instance by ID", ["Overview"] + name_of_columns, on_change=sync_from_drop, key="selectbox_instance")
+             st.divider()
+             # make pie plot showing incorrect vs correct
+             st.header("Breakdown")
+             if run2_file is None:
+                 plotly_pie_chart = px.pie(names=["Perfect", "Inbetween", "None"], values=[run1_details["perfect"], run1_details["inbetween"], run1_details["none"]])
+                 st.write("Run 1 Scores")
+                 plotly_pie_chart.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
+                 st.plotly_chart(plotly_pie_chart, use_container_width=True)
+             else:
+                 if st.checkbox("Show Run 1 vs Run 2", value=True):
+                     plotly_pie_chart = px.pie(names=["Run 1 Better", "Run 2 Better", "Tied"], values=[is_better_run1_count, is_better_run2_count, is_same_count])
+                     plotly_pie_chart.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
+                     st.plotly_chart(plotly_pie_chart, use_container_width=True)
+
+                 if st.checkbox("Show Run 1 Breakdown"):
+                     plotly_pie_chart_run1 = px.pie(names=["Perfect", "Inbetween", "None"], values=[run1_details["perfect"], run1_details["inbetween"], run1_details["none"]])
+                     plotly_pie_chart_run1.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
+                     st.plotly_chart(plotly_pie_chart_run1, use_container_width=True)
+                 if st.checkbox("Show Run 2 Breakdown"):
+                     plotly_pie_chart_run2 = px.pie(names=["Perfect", "Inbetween", "None"], values=[run2_details["perfect"], run2_details["inbetween"], run2_details["none"]])
+                     plotly_pie_chart_run2.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
+                     st.plotly_chart(plotly_pie_chart_run2, use_container_width=True)



+     with col2:
+         # st.title(f"Information ({len(checkboxes) - 1}/{len(name_of_columns) - 1})")
+         ### Only one run file
+         if run1_file is not None and run2_file is None:

+             # get instance number
+             inst_index = number_of_col

+             if inst_index >= 0:
+                 inst_num = instances_to_use[inst_index - 1]
+
+                 st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 1</h1>", unsafe_allow_html=True)
+
+                 container = st.container()
+
+                 rank_col, score_col, id_col = container.columns([2,1,3])
+                 id_col.metric("ID", inst_num)
+                 score_col.metric(metric_name, results1[str(inst_num)][metric_name])
+
+                 # st.subheader(f"ID")
+                 # st.markdown(inst_num)
+                 st.divider()
+
+                 st.subheader(f"Query")
+                 if run1_uses_query_expansion != "None":
+                     show_orig_rel = st.checkbox("Show Original Query", key=f"{inst_index}reloriguery", value=False)
+
+                 query_text_og = queries[str(inst_num)]
+                 if query_expansion1 is not None and run1_uses_query_expansion != "None" and not show_orig_rel:
+                     alt_text = query_expansion1[str(inst_num)]
+                     query_text = combine(query_text_og, alt_text, run1_uses_query_expansion)
                  else:
+                     query_text = query_text_og
+                 st.markdown(query_text)
+                 st.divider()
+
+                 ## Documents
+                 # relevant
+                 relevant_docs = list(qrels[str(inst_num)].keys())
+                 doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]
+                 st.subheader("Relevant Documents")
+                 if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
+                     show_orig_rel = st.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig", value=False)
+
+                 for (docid, title, text) in doc_texts:
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel:
+                         alt_text = doc_expansion1[docid]["text"]
+                         text = combine(text, alt_text, run1_uses_doc_expansion)
+                     st.text_area(f"{docid}:", text)

+

+                 pred_doc = run1_pandas[run1_pandas.doc_id.isin(relevant_docs)]
+                 rank_pred = pred_doc[pred_doc.qid == str(inst_num)]["rank"].tolist()
+                 # st.subheader("Ranked of Documents")
+                 # st.markdown(f"Rank: {rank_pred}")
+                 ranking_str = ",".join([str(item) for item in rank_pred]) if type(rank_pred) == list else str(rank_pred)
+                 if ranking_str == "":
+                     ranking_str = "--"
+                 rank_col.metric(f"Rank of Relevant Doc(s)", ranking_str)

+                 st.divider()

+                 # top ranked

+                 if st.checkbox('Show top ranked documents', key=f"{inst_index}top-1run"):
+                     st.subheader("Top N Ranked Documents")
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
+                         show_orig_rel_ranked = st.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs", value=False)
+
+                     run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)][:top_n]
+                     run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel_ranked:
+                         run1_top_n_docs_alt = [doc_expansion1[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
+                         for d_idx, doc in enumerate(run1_top_n_docs):
+                             alt_text = run1_top_n_docs_alt[d_idx]["text"]
+                             doc_text = combine(doc["text"], alt_text, run1_uses_doc_expansion)
+                             st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}")
+                     else:
                          for d_idx, doc in enumerate(run1_top_n_docs):
+                             st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}")
+                 st.divider()
+
+             # none checked
+             elif inst_index < 0:
+                 st.title("Overview")
+                 st.subheader(f"Scores of {metric_name}")
+                 plotly_chart = create_boxplot_1df(results1, metric_name)
+                 st.plotly_chart(plotly_chart)
+
+         ## Both run files available
+         elif run1_file is not None and run2_file is not None:
+             has_check = False
+             container_top = st.container()
+
+             # get instance number
+             inst_index = number_of_col
+
+             if inst_index >= 0:
+                 inst_num = instances_to_use[inst_index]
+
+                 col_run1, col_run2 = container_top.columns([1,1])
+                 col_run1.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 1</h1>", unsafe_allow_html=True)
+                 col_run2.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 2</h1>", unsafe_allow_html=True)
+
+                 container_overview = st.container()
+                 rank_col1, score_col1, rank_col2, score_col2 = container_overview.columns([2,1,2,1])
+                 # id_col1.metric("", "")
+                 score_col1.metric("Run 1 " + metric_name, results1[str(inst_num)][metric_name])
+                 score_col2.metric("Run 2 " + metric_name, results2[str(inst_num)][metric_name])
+
+                 st.divider()
+
+                 st.subheader(f"Query")
+                 container_two_query = st.container()
+                 col_run1, col_run2 = container_two_query.columns(2, gap="medium")
+
+                 query_text_og = queries[str(inst_num)]
+                 if run1_uses_query_expansion != "None" and run2_uses_query_expansion != "None":
+                     alt_text1 = query_expansion1[str(inst_num)]
+                     alt_text2 = query_expansion2[str(inst_num)]
+                     combined_text1 = combine(query_text_og, alt_text1, run1_uses_query_expansion)
+                     combined_text2 = combine(query_text_og, alt_text2, run2_uses_query_expansion)
+                     col_run1.markdown(combined_text1)
+                     col_run2.markdown(combined_text2)
+                 elif run1_uses_query_expansion != "None":
+                     alt_text = query_expansion1[str(inst_num)]
+                     combined_text1 = combine(query_text_og, alt_text, run1_uses_query_expansion)
+                     col_run1.markdown(combined_text1)
+                     col_run2.markdown(query_text_og)
+                 elif run2_uses_query_expansion != "None":
+                     alt_text = query_expansion2[str(inst_num)]
+                     combined_text2 = combine(query_text_og, alt_text, run2_uses_query_expansion)
+                     col_run1.markdown(query_text_og)
+                     col_run2.markdown(combined_text2)
+                 else:
+                     query_text = query_text_og
+                     col_run1.markdown(query_text)
+                     col_run2.markdown(query_text)
+
+                 st.divider()
+
+
+
+                 ## Documents
+                 # relevant
+                 st.subheader("Relevant Documents")
+                 container_two_docs_rel = st.container()
+                 col_run1, col_run2 = container_two_docs_rel.columns(2, gap="medium")
+                 relevant_docs = list(qrels[str(inst_num)].keys())
+                 doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]
+
+                 if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
+                     show_orig_rel1 = col_run1.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig_run1", value=False)
+                 if doc_expansion2 is not None and run2_uses_doc_expansion != "None":
+                     show_orig_rel2 = col_run2.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig_run2", value=False)
+
+                 for (docid, title, text) in doc_texts:
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel1:
+                         alt_text = doc_expansion1[docid]["text"]
+                         text = combine(text, alt_text, run1_uses_doc_expansion)
+                     col_run1.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}1")
+
+                 for (docid, title, text) in doc_texts:
+                     if doc_expansion2 is not None and run2_uses_doc_expansion != "None" and not show_orig_rel2:
+                         alt_text = doc_expansion2[docid]["text"]
+                         text = combine(text, alt_text, run2_uses_doc_expansion)
+                     col_run2.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}2")
+
+                 # top ranked
+                 # NOTE: BEIR calls trec_eval which ranks by score, then doc_id for ties
+                 # we have to fix that or we don't match the scores
+                 pred_doc1 = run1_pandas[run1_pandas.qid == inst_num].sort_values(["score", "doc_id"], ascending=[False, False])
+                 pred_doc1["rank_real"] = list(range(1, len(pred_doc1) + 1))
+                 rank_pred1 = pred_doc1[pred_doc1.doc_id.isin(relevant_docs)]["rank_real"].tolist()
+
+                 pred_doc2 = run2_pandas[run2_pandas.qid == inst_num].sort_values(["score", "doc_id"], ascending=[False, False])
+                 pred_doc2["rank_real"] = list(range(1, len(pred_doc2) + 1))
+                 rank_pred2 = pred_doc2[pred_doc2.doc_id.isin(relevant_docs)]["rank_real"].tolist()


+                 # st.subheader("Ranked of Documents")
+                 # st.markdown(f"Run 1 Rank: {rank_pred1}")
+                 # st.markdown(f"Run 2 Rank: {rank_pred2}")

+                 ranking_str = ",".join([str(item) for item in rank_pred1]) if type(rank_pred1) == list else str(rank_pred1)
+                 if ranking_str == "":
+                     ranking_str = "--"
+                 rank_col1.metric("Run 1 " + f"Rank of Relevant Doc(s)", ranking_str)

+                 ranking_str2 = ",".join([str(item) for item in rank_pred2]) if type(rank_pred2) == list else str(rank_pred2)
+                 if ranking_str2 == "":
+                     ranking_str2 = "--"
+                 rank_col2.metric("Run 2 " + f"Rank of Relevant Doc(s)", ranking_str2)


+                 st.divider()


+                 container_two_docs_ranked = st.container()
+                 col_run1, col_run2 = container_two_docs_ranked.columns(2, gap="medium")
+
+                 if col_run1.checkbox('Show top ranked documents for Run 1', key=f"{inst_index}top-1run"):
+                     col_run1.subheader("Top N Ranked Documents")
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
+                         show_orig_rel_ranked1 = col_run1.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs1", value=False)

+                     run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)].sort_values(["score", "doc_id"], ascending=[False, False])[:top_n]
+                     run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
+
+                     if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel_ranked1:
+                         run1_top_n_docs_alt = [doc_expansion1[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
+                         for d_idx, doc in enumerate(run1_top_n_docs):
+                             alt_text = run1_top_n_docs_alt[d_idx]["text"]
+                             doc_text = combine(doc["text"], alt_text, run1_uses_doc_expansion)
+                             col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}1")
+                     else:
                          for d_idx, doc in enumerate(run1_top_n_docs):
+                             col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}1")
+
+                 if col_run2.checkbox('Show top ranked documents for Run 2', key=f"{inst_index}top-2run"):
+                     col_run2.subheader("Top N Ranked Documents")
+                     if doc_expansion2 is not None and run2_uses_doc_expansion != "None":
+                         show_orig_rel_ranked2 = col_run2.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs2", value=False)
+                     run2_top_n = run2_pandas[run2_pandas.qid == str(inst_num)].sort_values(["score", "doc_id"], ascending=[False, False])[:top_n]
+                     run2_top_n_docs = [corpus[str(doc_id)] for doc_id in run2_top_n.doc_id.tolist()]
+
+
+                     if doc_expansion2 is not None and run2_uses_doc_expansion != "None" and not show_orig_rel_ranked2:
+                         run2_top_n_docs_alt = [doc_expansion2[str(doc_id)] for doc_id in run2_top_n.doc_id.tolist()]
+                         for d_idx, doc in enumerate(run2_top_n_docs):
+                             alt_text = run2_top_n_docs_alt[d_idx]["text"]
+                             doc_text = combine(doc["text"], alt_text, run2_uses_doc_expansion)
+                             col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}2")
+                     else:
                          for d_idx, doc in enumerate(run2_top_n_docs):
+                             col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}2")
+
+                 st.divider()


+             else:
+                 st.title("Overview")

+                 st.subheader(f"Scores of {metric_name}")
+                 fig = create_boxplot_2df(results1, results2, metric_name)
+                 st.plotly_chart(fig)

+                 st.subheader(f"Score Difference of {metric_name}")
+                 fig_comp = create_boxplot_diff(results1, results2, metric_name)
+                 st.plotly_chart(fig_comp)

+ else:
+     st.warning("Please choose a dataset and upload a run file. If you chose \"custom\" be sure that you uploaded all files (queries, corpus, qrels)")
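The re-ranking step added above (sorting by score and then doc_id before assigning rank_real) mirrors the note in the code that trec_eval breaks score ties by doc_id. A standalone sketch of that step on an invented three-row run table, purely for illustration:

import pandas as pd

# Toy run table in the same shape load_run() produces (one query, two tied scores)
run_pandas = pd.DataFrame({
    "qid": ["q1", "q1", "q1"],
    "doc_id": ["d1", "d9", "d5"],
    "rank": [1, 2, 3],           # rank as written in the run file
    "score": [2.0, 1.0, 1.0],    # d9 and d5 are tied on score
})

# Recompute the effective rank the way the app does: score desc, then doc_id desc
pred_doc = run_pandas[run_pandas.qid == "q1"].sort_values(["score", "doc_id"], ascending=[False, False])
pred_doc["rank_real"] = list(range(1, len(pred_doc) + 1))

relevant_docs = ["d5"]
print(pred_doc[pred_doc.doc_id.isin(relevant_docs)]["rank_real"].tolist())  # [3], the rank reported for the relevant doc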
constants.py ADDED
@@ -0,0 +1,142 @@
+
+ ALL_METRICS = [
+     "ndcg_cut_10",
+     "ndcg_cut_5",
+     "ndcg_cut_15",
+     "ndcg_cut_20",
+     "ndcg_cut_30",
+     "ndcg_cut_100",
+     "ndcg_cut_200",
+     "ndcg_cut_500",
+     "ndcg_cut_1000",
+     "map",
+     "P_5",
+     "P_10",
+     "P_15",
+     "P_20",
+     "P_30",
+     "P_100",
+     "P_200",
+     "P_500",
+     "P_1000",
+     "recall_5",
+     "recall_10",
+     "recall_15",
+     "recall_20",
+     "recall_30",
+     "recall_100",
+     "recall_200",
+     "recall_500",
+     "recall_1000",
+     "recip_rank",
+     "set_recall",
+     "set_P",
+     "set_F",
+     "num_rel_ret",
+     "num_ret",
+     "num_rel",
+     "num_q",
+     "num_rel",
+     "num_rel_ret"
+     "Rprec",
+     "bpref",
+     "iprec_at_recall_0.00",
+     "iprec_at_recall_0.10",
+     "iprec_at_recall_0.20",
+     "iprec_at_recall_0.30",
+     "iprec_at_recall_0.40",
+     "iprec_at_recall_0.50",
+     "iprec_at_recall_0.60",
+     "iprec_at_recall_0.70",
+     "iprec_at_recall_0.80",
+     "iprec_at_recall_0.90",
+     "iprec_at_recall_1.00",
+ ]
+
+
+ BEIR = [
+     "msmarco",
+     "trec-covid",
+     "nf_corpus",
+     "bioasq",
+     "nq",
+     "hotpotqa",
+     "fiqa",
+     "signal1m",
+     "trec-news",
+     "robust04",
+     "arguana",
+     "webis-touche2020",
+     "cqadupstack",
+     "quora",
+     "dbpedia-entity",
+     "scidocs",
+     "fever",
+     "climate-fever",
+     "scifact",
+ ]
+
+
+ IR_DATASETS = [
+     "antique",
+     "aol_ia",
+     "aquaint",
+     "argsme",
+     "c4",
+     "car",
+     "clinicaltrials",
+     "clirmatrix",
+     "clueweb09",
+     "clueweb12",
+     "codec",
+     "cord19",
+     "cranfield",
+     "disks45",
+     "dpr_w100",
+     "codesearchnet",
+     "gov",
+     "gov2",
+     "highwire",
+     "istella22",
+     "kilt",
+     "lotte",
+     "medline",
+     "mmarco",
+     "mr_tydi",
+     "msmarco_document",
+     "msmarco_document_v2",
+     "msmarco_passage",
+     "msmarco_passage_v2",
+     "msmarco_qna",
+     "neumarco",
+     "nfcorpus",
+     "natural_questions",
+     "nyt",
+     "pmc",
+     "touche_image",
+     "touche",
+     "trec_arabic",
+     "trec_mandarin",
+     "trec_spanish",
+     "trec_robust04",
+     "trec_tot",
+     "tripclick",
+     "tweets2013_ia",
+     "vaswani",
+     "wapo",
+     "wikiclir",
+     "wikir",
+     "trec_fair",
+     "trec_cast",
+     "hc4",
+     "neuclir",
+     "sara",
+ ]
+
+ LOCAL_DATASETS = [
+     "gooaq_technical",
+     "codesearch_py",
+ ]
+
+
+ ALL_DATASETS = ["", "custom"] + LOCAL_DATASETS + BEIR + IR_DATASETS
dataset_loading.py ADDED
@@ -0,0 +1,163 @@
+ import streamlit as st
+ import os
+ import pathlib
+ import beir
+ from beir import util
+ from beir.datasets.data_loader import GenericDataLoader
+ import pytrec_eval
+ import pandas as pd
+ from collections import defaultdict
+ import json
+ import copy
+ import ir_datasets
+
+
+ from constants import BEIR, IR_DATASETS, LOCAL_DATASETS
+
+ def load_local_corpus(corpus_file, columns_to_combine=["title", "text"]):
+     if corpus_file is None:
+         return None
+     did2text = {}
+     id_key = "_id"
+     with corpus_file as f:
+         for idx, line in enumerate(f):
+             uses_bytes = not (type(line) == str)
+             if uses_bytes:
+                 if idx == 0 and "doc_id" in line.decode("utf-8"):
+                     continue
+                 inst = json.loads(line.decode("utf-8"))
+             else:
+                 if idx == 0 and "doc_id" in line:
+                     continue
+                 inst = json.loads(line)
+             all_text = " ".join([inst[col] for col in columns_to_combine if col in inst])
+             if id_key not in inst:
+                 id_key = "doc_id"
+             did2text[inst[id_key]] = {
+                 "text": all_text,
+                 "title": inst["title"] if "title" in inst else "",
+             }
+     return did2text
+
+ def load_local_queries(queries_file):
+     if queries_file is None:
+         return None
+     qid2text = {}
+     id_key = "_id"
+     with queries_file as f:
+         for idx, line in enumerate(f):
+             uses_bytes = not (type(line) == str)
+             if uses_bytes:
+                 if idx == 0 and "query_id" in line.decode("utf-8"):
+                     continue
+                 inst = json.loads(line.decode("utf-8"))
+             else:
+                 if idx == 0 and "query_id" in line:
+                     continue
+                 inst = json.loads(line)
+             if id_key not in inst:
+                 id_key = "query_id"
+             qid2text[inst[id_key]] = inst["text"]
+     return qid2text
+
+ def load_local_qrels(qrels_file):
+     if qrels_file is None:
+         return None
+     qid2did2label = defaultdict(dict)
+     with qrels_file as f:
+         for idx, line in enumerate(f):
+             uses_bytes = not (type(line) == str)
+             if uses_bytes:
+                 if idx == 0 and "qid" in line.decode("utf-8") or "query-id" in line.decode("utf-8"):
+                     continue
+                 cur_line = line.decode("utf-8")
+             else:
+                 if idx == 0 and "qid" in line or "query-id" in line:
+                     continue
+                 cur_line = line
+             try:
+                 qid, _, doc_id, label = cur_line.split()
+             except:
+                 qid, doc_id, label = cur_line.split()
+             qid2did2label[str(qid)][str(doc_id)] = int(label)
+
+     return qid2did2label
+
+
+ def load_run(f_run):
+     run = pytrec_eval.parse_run(copy.deepcopy(f_run))
+     # convert bytes to strings for keys
+     new_run = defaultdict(dict)
+     for key, sub_dict in run.items():
+         new_run[key.decode("utf-8")] = {k.decode("utf-8"): v for k, v in sub_dict.items()}
+
+     run_pandas = pd.read_csv(f_run, header=None, index_col=None, sep="\t")
+     run_pandas.columns = ["qid", "generic", "doc_id", "rank", "score", "model"]
+     run_pandas.doc_id = run_pandas.doc_id.astype(str)
+     run_pandas.qid = run_pandas.qid.astype(str)
+     run_pandas["rank"] = run_pandas["rank"].astype(int)
+     run_pandas.score = run_pandas.score.astype(float)
+     # if run_1_alt is not None:
+     #     run_1_alt, run_1_alt_sub = load_jsonl(run_1_alt)
+     return new_run, run_pandas
+
+
+
+ def load_jsonl(f):
+     did2text = defaultdict(list)
+     sub_did2text = {}
+
+     for idx, line in enumerate(f):
+         inst = json.loads(line)
+         if "question" in inst:
+             docid = inst["metadata"][0]["passage_id"] if "doc_id" not in inst else inst["doc_id"]
+             did2text[docid].append(inst["question"])
+         elif "text" in inst:
+             docid = inst["doc_id"] if "doc_id" in inst else inst["did"]
+             did2text[docid].append(inst["text"])
+             sub_did2text[inst["did"]] = inst["text"]
+         elif "query" in inst:
+             docid = inst["doc_id"] if "doc_id" in inst else inst["did"]
+             did2text[docid].append(inst["query"])
+         else:
+             breakpoint()
+             raise NotImplementedError("Need to handle this case")
+
+     return did2text, sub_did2text
+
+
+
+ def get_beir(dataset: str):
+     url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
+     out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
+     data_path = util.download_and_unzip(url, out_dir)
+     return GenericDataLoader(data_folder=data_path).load(split="test")
+
+
+ def get_ir_datasets(dataset_name: str):
+     dataset = ir_datasets.load(dataset_name)
+     queries = {}
+     for qid, query in dataset.queries_iter():
+         queries[qid] = query
+     # corpus = {}
+     # for doc in dataset.docs_iter():
+     #     return corpus, queries, qrels
+     return dataset.doc_store(), queries, dataset.qrels_dict()
+
+
+ def get_dataset(dataset_name: str):
+     if dataset_name == "":
+         return {}, {}, {}
+
+     if dataset_name in BEIR:
+         return get_beir(dataset_name)
+     elif dataset_name in IR_DATASETS:
+         return get_ir_datasets(dataset_name)
+     elif dataset_name in LOCAL_DATASETS:
+         base_path = f"local_datasets/{dataset_name}"
+         corpus_file = open(f"{base_path}/corpus.jsonl", "r")
+         queries_file = open(f"{base_path}/queries.jsonl", "r")
+         qrels_file = open(f"{base_path}/qrels/test.tsv", "r")
+         return load_local_corpus(corpus_file), load_local_queries(queries_file), load_local_qrels(qrels_file)
+     else:
+         raise NotImplementedError("Dataset not implemented")
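A sketch of the inputs these loaders expect, based on the parsing logic above; the file names and records below are invented for illustration. Run files themselves are tab-separated TREC format (qid, a generic column, doc_id, rank, score, model tag), matching the column names load_run assigns.

import json
from dataset_loading import load_local_corpus, load_local_queries, load_local_qrels

# corpus.jsonl: one JSON object per line with "_id" (or "doc_id"), "title", "text"
with open("corpus.jsonl", "w") as f:
    f.write(json.dumps({"_id": "d1", "title": "Example", "text": "A toy document."}) + "\n")
# queries.jsonl: one JSON object per line with "_id" (or "query_id") and "text"
with open("queries.jsonl", "w") as f:
    f.write(json.dumps({"_id": "q1", "text": "a toy query"}) + "\n")
# qrels: whitespace-separated qid, optional iteration column, doc_id, relevance label
with open("qrels.tsv", "w") as f:
    f.write("q1\t0\td1\t1\n")

corpus = load_local_corpus(open("corpus.jsonl"))     # {"d1": {"text": "Example A toy document.", "title": "Example"}}
queries = load_local_queries(open("queries.jsonl"))  # {"q1": "a toy query"}
qrels = load_local_qrels(open("qrels.tsv"))          # {"q1": {"d1": 1}}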
local_datasets/codesearch_py/corpus.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87ef61791e9aa9a9833e59e81756d41beaca8e4cd3efad2bb8940e5876f69008
+ size 384365716
local_datasets/codesearch_py/qrels/test.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d172966a5e2dcc39491d446ca75ed730f7309d09701c131add14eb62b45c2114
+ size 79309
local_datasets/codesearch_py/qrels/test.tsv.tmp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef54b582e47e48fdd094a3da00644bcf4af684b709be3f4f72f4de23c783ea50
+ size 79283
local_datasets/codesearch_py/qrels/test.tsv.tmp.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efda84b6d2b19a4bbd33ecd89616c88b63f4d585f7cb5ea10cc12372592306a3
+ size 81283
local_datasets/codesearch_py/qrels/test.tsv.tmp.2.filtered ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea798baa1ab23010a7769e60ba06e388d2b421cc2a9987b13900743df122a7c2
+ size 24193
local_datasets/codesearch_py/queries.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41e3f41fad388f4f612630bdb8ccb23b319b24a0b859db226a381b6f68b1771c
+ size 199567
local_datasets/gooaq_technical/corpus.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31282e5019461a6cd9d88a9e47fe6743d6962b3aeb81f5f5f78fa72eb52ff46b
+ size 1399723
local_datasets/gooaq_technical/qrels/test.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b56de4bfec42225780cda2fc28fd7e0ee433f313208ab210de5bcf6281757ee
+ size 49675
local_datasets/gooaq_technical/qrels/test.tsv.tmp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49982dbf8d1d182a75935718cb183b91d29e3ad4db1892723371c7d762955cbc
+ size 49649
local_datasets/gooaq_technical/qrels/test.tsv.tmp.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f0c2a26846e0456ddd24cd6d315ae516af28504e6b2961d00e0da0ff821f648
+ size 51649
local_datasets/gooaq_technical/qrels/test.tsv.tmp.2.filtered ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e681ddae1619d30ce425fdb01ca4ceb10f493b079369ac0e555b1338cd3914e1
+ size 15158
local_datasets/gooaq_technical/queries.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25df519a5e39f0c91f6f7c5bdb74601cbfffbadd3cd1a998a8a5a48740b885be
+ size 110860
packages.txt ADDED
@@ -0,0 +1 @@
+ sudo apt install default-jre
requirements.txt CHANGED
@@ -1,4 +1,8 @@
- beir
- pandas
- pytrec_eval
- streamlit
+ beir==1.0.1
+ pandas==2.0.3
+ pytrec_eval==0.5
+ streamlit==1.24.1
+ ir_datasets==0.5.5
+ pyserini==0.21.1
+ torch==2.0.1
+ plotly==5.5.15
test.tst ADDED
@@ -0,0 +1,55 @@
+ "base",
+ "antique",
+ "aol_ia",
+ "aquaint",
+ "argsme",
+ "beir",
+ "c4
+ "car",
+ "clinicaltrials",
+ "clirmatrix",
+ "clueweb09",
+ "clueweb12",
+ "codec",
+ "cord19",
+ "cranfield",
+ "disks45",
+ "dpr_w100",
+ "codesearchnet",
+ "gov",
+ "gov2",
+ "highwire",
+ "istella22",
+ "kilt",
+ "lotte",
+ "medline",
+ "mmarco",
+ "mr_tydi",
+ "msmarco_document",
+ "msmarco_document_v2",
+ "msmarco_passage",
+ "msmarco_passage_v2",
+ "msmarco_qna",
+ "neumarco",
+ "nfcorpus",
+ "natural_questions",
+ "nyt",
+ "pmc",
+ "touche_image",
+ "touche",
+ "trec_arabic",
+ "trec_mandarin",
+ "trec_spanish",
+ "trec_robust04",
+ "trec_tot",
+ "tripclick",
+ "tweets2013_ia",
+ "vaswani",
+ "wapo",
+ "wikiclir",
+ "wikir",
+ "trec_fair",
+ "trec_cast",
+ "hc4",
+ "neuclir",
+ "sara",