ThePixOne committed on
Commit bb00b3a
1 Parent(s): df909ae

Update app.py

Files changed (1)
  1. app.py +11 -3
app.py CHANGED
@@ -19,10 +19,13 @@ import textract
 from scipy.special import softmax
 import pandas as pd
 from datetime import datetime
+
+
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
 tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
 model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
+
 if device == 'cuda:0':
     pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
 else:
@@ -90,7 +93,8 @@ def predict(query,data):
     hist = st + " " + st_hashed
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
-    try:
+
+    try: #if the same question was already asked for this document, upload question and answer
         df = pd.read_csv("{}.csv".format(hash(st)))
         list_outputs = []
         for i in range(k):
@@ -105,7 +109,7 @@ def predict(query,data):
         print(e)
         print(st)
 
-    if name_to_save+".txt" in os.listdir():
+    if name_to_save+".txt" in os.listdir(): #if the document was already used, load its embeddings
         doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
@@ -125,6 +129,8 @@ def predict(query,data):
         doc_emb = doc_emb.reshape(-1, 768)
         with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
             f.write(text)
+
+    #once embeddings are calculated, run MIPS
     start = time.time()
     query_emb = encode_query(query)
 
@@ -136,6 +142,8 @@ def predict(query,data):
     probs = softmax(sorted(scores,reverse = True)[:k])
     table = {"Passage":[],"Answer":[],"Probabilities":[]}
 
+
+    #get answers for each pair of question (from user) and top best passages
    for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
         passage = passage.replace("\n","")
         #passage = passage.replace(" . "," ")
@@ -155,7 +163,7 @@ def predict(query,data):
         table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
 
 
-
+    #format answers for ~nice output and save it for future (if the same question is asked again using same pdf)
     df = pd.DataFrame(table)
     print(df)
     print("time: "+ str(time.time()-start))
 
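The "#once embeddings are calculated, run MIPS" comment refers to plain dot-product (maximum inner product) scoring of the multi-qa-mpnet query embedding against the 768-d passage embeddings. A simplified sketch, assuming passages is a list of text spans aligned with the rows of doc_emb; the real predict() also carries file names alongside each passage:

import numpy as np
from scipy.special import softmax

def top_k_passages(query_emb, doc_emb, passages, k=5):
    # Inner-product score of the query against every passage embedding (MIPS).
    doc_emb = doc_emb.reshape(-1, 768)                 # multi-qa-mpnet embeddings are 768-d
    scores = doc_emb @ query_emb.reshape(768)          # one dot product per passage
    doc_score_pairs = sorted(zip(passages, scores.tolist()),
                             key=lambda p: p[1], reverse=True)
    probs = softmax([s for _, s in doc_score_pairs[:k]])  # same softmax over top-k as the diff
    return doc_score_pairs[:k], probs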