Spaces:

ThePixOne
/

open_domain_qa

Running

App Files Files Community

ThePixOne committed on May 26, 2022

Commit

bb00b3a

•

1 Parent(s): df909ae

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -3

app.py CHANGED Viewed

@@ -19,10 +19,13 @@ import textract
 from scipy.special import softmax
 import pandas as pd
 from datetime import datetime
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
 tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
 model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
 if device == 'cuda:0':
     pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
 else:
@@ -90,7 +93,8 @@ def predict(query,data):
     hist = st + " " + st_hashed
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
-    try:
         df = pd.read_csv("{}.csv".format(hash(st)))
         list_outputs = []
         for i in range(k):
@@ -105,7 +109,7 @@ def predict(query,data):
         print(e)
         print(st)
-    if name_to_save+".txt" in os.listdir():
         doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
@@ -125,6 +129,8 @@ def predict(query,data):
         doc_emb = doc_emb.reshape(-1, 768)
         with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
             f.write(text)
     start = time.time()
     query_emb = encode_query(query)
@@ -136,6 +142,8 @@ def predict(query,data):
     probs = softmax(sorted(scores,reverse = True)[:k])
     table = {"Passage":[],"Answer":[],"Probabilities":[]}
     for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
         passage = passage.replace("\n","")
         #passage = passage.replace(" . "," ")
@@ -155,7 +163,7 @@ def predict(query,data):
             table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
     df = pd.DataFrame(table)
     print(df)
     print("time: "+ str(time.time()-start))

 from scipy.special import softmax
 import pandas as pd
 from datetime import datetime
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
 tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
 model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
 if device == 'cuda:0':
     pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
 else:
     hist = st + " " + st_hashed
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
+    try: #if the same question was already asked for this document, load the saved question and answer
         df = pd.read_csv("{}.csv".format(hash(st)))
         list_outputs = []
         for i in range(k):
         print(e)
         print(st)
+    if name_to_save+".txt" in os.listdir(): #if the document was already used, load its embeddings
         doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         doc_emb = doc_emb.reshape(-1, 768)
         with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
             f.write(text)
+    #once embeddings are calculated, run MIPS
     start = time.time()
     query_emb = encode_query(query)
     probs = softmax(sorted(scores,reverse = True)[:k])
     table = {"Passage":[],"Answer":[],"Probabilities":[]}
+    #get an answer for each pair of the user's question and one of the top-k passages
     for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
         passage = passage.replace("\n","")
         #passage = passage.replace(" . "," ")
             table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
+    #format the answers for readable output and save them for future use (if the same question is asked again using the same pdf)
     df = pd.DataFrame(table)
     print(df)
     print("time: "+ str(time.time()-start))