Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -19,10 +19,13 @@ import textract
|
|
19 |
from scipy.special import softmax
|
20 |
import pandas as pd
|
21 |
from datetime import datetime
|
|
|
|
|
22 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
23 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
|
24 |
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
|
25 |
model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
|
|
|
26 |
if device == 'cuda:0':
|
27 |
pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
|
28 |
else:
|
@@ -90,7 +93,8 @@ def predict(query,data):
|
|
90 |
hist = st + " " + st_hashed
|
91 |
now = datetime.now()
|
92 |
current_time = now.strftime("%H:%M:%S")
|
93 |
-
|
|
|
94 |
df = pd.read_csv("{}.csv".format(hash(st)))
|
95 |
list_outputs = []
|
96 |
for i in range(k):
|
@@ -105,7 +109,7 @@ def predict(query,data):
|
|
105 |
print(e)
|
106 |
print(st)
|
107 |
|
108 |
-
if name_to_save+".txt" in os.listdir():
|
109 |
doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
110 |
doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
111 |
file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
@@ -125,6 +129,8 @@ def predict(query,data):
|
|
125 |
doc_emb = doc_emb.reshape(-1, 768)
|
126 |
with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
|
127 |
f.write(text)
|
|
|
|
|
128 |
start = time.time()
|
129 |
query_emb = encode_query(query)
|
130 |
|
@@ -136,6 +142,8 @@ def predict(query,data):
|
|
136 |
probs = softmax(sorted(scores,reverse = True)[:k])
|
137 |
table = {"Passage":[],"Answer":[],"Probabilities":[]}
|
138 |
|
|
|
|
|
139 |
for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
|
140 |
passage = passage.replace("\n","")
|
141 |
#passage = passage.replace(" . "," ")
|
@@ -155,7 +163,7 @@ def predict(query,data):
|
|
155 |
table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
|
156 |
|
157 |
|
158 |
-
|
159 |
df = pd.DataFrame(table)
|
160 |
print(df)
|
161 |
print("time: "+ str(time.time()-start))
|
|
|
19 |
from scipy.special import softmax
|
20 |
import pandas as pd
|
21 |
from datetime import datetime
|
22 |
+
|
23 |
+
|
24 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
25 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
|
26 |
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
|
27 |
model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
|
28 |
+
|
29 |
if device == 'cuda:0':
|
30 |
pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
|
31 |
else:
|
|
|
93 |
hist = st + " " + st_hashed
|
94 |
now = datetime.now()
|
95 |
current_time = now.strftime("%H:%M:%S")
|
96 |
+
|
97 |
+
try: #if the same question was already asked for this document, load the saved question and answers
|
98 |
df = pd.read_csv("{}.csv".format(hash(st)))
|
99 |
list_outputs = []
|
100 |
for i in range(k):
|
|
|
109 |
print(e)
|
110 |
print(st)
|
111 |
|
112 |
+
if name_to_save+".txt" in os.listdir(): #if the document was already used, load its embeddings
|
113 |
doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
114 |
doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
115 |
file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
|
|
|
129 |
doc_emb = doc_emb.reshape(-1, 768)
|
130 |
with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
|
131 |
f.write(text)
|
132 |
+
|
133 |
+
#once embeddings are calculated, run MIPS
|
134 |
start = time.time()
|
135 |
query_emb = encode_query(query)
|
136 |
|
|
|
142 |
probs = softmax(sorted(scores,reverse = True)[:k])
|
143 |
table = {"Passage":[],"Answer":[],"Probabilities":[]}
|
144 |
|
145 |
+
|
146 |
+
#get an answer for each pair of (user question, top-k passage)
|
147 |
for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
|
148 |
passage = passage.replace("\n","")
|
149 |
#passage = passage.replace(" . "," ")
|
|
|
163 |
table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
|
164 |
|
165 |
|
166 |
+
#format the answers for readable output and save them for future use (in case the same question is asked again on the same PDF)
|
167 |
df = pd.DataFrame(table)
|
168 |
print(df)
|
169 |
print("time: "+ str(time.time()-start))
|