Spaces:
Sleeping
Sleeping
elibrowne
committed on
Commit
·
e65ac7c
1
Parent(s):
1991ca2
Question data for E5 and ColBERT online and formatted
Browse files
- create_json_data.py +37 -0
- question_data.csv +0 -0
- question_data.json +0 -0
create_json_data.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import csv
|
3 |
+
|
4 |
+
# Convert question_data.csv into question_data.json: one JSON object keyed by
# qid, each value holding the question text, answer choices, retrievals and
# generations for that question.
#
# Column layout of question_data.csv (0-indexed):
# qid,prompt,question,a,b,c,d,answer,gold_passage,top10_colbert,,,,,,,,,,generation_colbert,top10_e5,,,,,,,,,,generation_e5,gold_passage_generation
# 0   1      2        3 4 5 6 7      8            9-18                    19                 20-29              30            31
# See example.json for how these files will be ported

# newline="" hands line-ending handling to the csv module, which its docs
# require so that quoted fields containing embedded newlines are parsed as a
# single field. The explicit encoding keeps parsing independent of the locale.
with open("question_data.csv", "r", newline="", encoding="utf-8") as f:
    questions = list(csv.reader(f))

questions = questions[2:]  # cut off top two (labels, passage #s)

# Letter label -> index into the "answers" list. Built once, outside the loop.
answer_map = {"A": 0, "B": 1, "C": 2, "D": 3}

full_question_dict = {}  # stores all "id":q_data pairs
for entry in questions:
    if not entry:
        # csv.reader yields [] for a completely blank line; there is nothing
        # to convert, and indexing it would raise IndexError.
        continue

    # Create individual question data
    q_data = {}

    # A non-empty prompt is prepended to the question text.
    if entry[1] != "":
        entry[2] = entry[1] + " " + entry[2]
    q_data["question"] = entry[2]
    q_data["answers"] = entry[3:7]  # inclusive of (3, 6) -> A, B, C, D

    # Tolerate stray whitespace / lowercase in the answer column, and fail
    # with the offending qid instead of a bare KeyError on the label.
    answer_label = entry[7].strip().upper()
    if answer_label not in answer_map:
        raise KeyError(
            f"question {entry[0]!r}: unexpected answer label {entry[7]!r}"
        )
    q_data["correct_answer_index"] = answer_map[answer_label]  # "A" -> 0

    q_data["top10_colbert"] = entry[9:19]  # inclusive of (9-18) -> 10 retrievals
    q_data["generation_colbert"] = entry[19]
    q_data["top10_e5"] = entry[20:30]  # inclusive of (20-29) -> 10 retrievals
    q_data["generation_e5"] = entry[30]
    # Hard-coded: always False because of how this script is written; per the
    # original author's note it does not reflect reality.
    q_data["top10_contains_gold_passage"] = False
    q_data["gold_passage"] = entry[8]
    q_data["gold_passage_generation"] = entry[31]

    # Add to full question dictionary
    full_question_dict[entry[0]] = q_data  # entry[0] is qid

with open("question_data.json", "w", encoding="utf-8") as f:
    json.dump(full_question_dict, f)
|
question_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
question_data.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|