hitz02 committed on
Commit 07ab211
1 Parent(s): 556f46d

Upload 3 files

Files changed (3)
  1. app.py +109 -0
  2. requirements.txt +9 -0
  3. utils.py +123 -0
app.py ADDED
@@ -0,0 +1,109 @@
+
+ import pandas as pd
+ import numpy as np
+ import pickle
+ import glob
+ import json
+ from pandas.io.json import json_normalize
+ from nltk.tokenize import sent_tokenize
+ import nltk
+ import scipy.spatial
+ from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForQuestionAnswering
+ from sentence_transformers import models, SentenceTransformer
+ import torch
+ import spacy
+ import streamlit as st
+ from utils import *
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_prep_data():
+     # Load the pickled articles and split each abstract into sentences.
+     with open('listfile_3.data', 'rb') as filehandle:
+         articles = pickle.load(filehandle)
+
+     for article in range(len(articles)):
+         if articles[article][1] != []:
+             articles[article][1] = sent_tokenize(articles[article][1])
+
+     return articles
+
+
+ @st.cache(allow_output_mutation=True)
+ def build_sent_trans_model():
+     word_embedding_model = models.BERT('covidbert_nli')
+
+     # Add the pooling strategy of Mean
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
+                                    pooling_mode_mean_tokens=True,
+                                    pooling_mode_cls_token=False,
+                                    pooling_mode_max_tokens=False)
+
+     model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+     return model
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_embedded_articles():
+     with open('list_of_articles.pkl', 'rb') as f:
+         list_of_articles = pickle.load(f)
+
+     return list_of_articles
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_comprehension_model():
+     # device=-1 runs the pipeline on CPU; pass a GPU index (e.g. device=0) to use a GPU.
+     comprehension_model = pipeline("question-answering",
+                                    model=AutoModelForQuestionAnswering.from_pretrained("graviraja/covidbert_squad"),
+                                    tokenizer=AutoTokenizer.from_pretrained("graviraja/covidbert_squad"),
+                                    device=-1)
+
+     return comprehension_model
+
+
+ def main():
+     nltk.download('punkt')
+     spacy_nlp = spacy.load('en_core_web_sm')
+
+     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+     embeddings = load_prep_data()
+
+     model = build_sent_trans_model()
+     model.to(device)
+
+     list_of_articles = load_embedded_articles()
+
+     comprehension_model = load_comprehension_model()
+
+     query = st.text_input("Enter Query", 'example query ', key="query")
+
+     # Stage 1: rank articles by how well their abstract sentences match the query.
+     query_embedding, results1 = fetch_stage1(query, model, list_of_articles)
+
+     # Stage 2: within the best articles, rank body-text paragraphs against the query.
+     results2 = fetch_stage2(results1, model, embeddings, query_embedding)
+
+     # Stage 3: run the question-answering model over the best paragraphs.
+     results3 = fetch_stage3(results2, query, embeddings, comprehension_model, spacy_nlp)
+
+     if results3:
+         count = 1
+
+         for res in results3:
+             st.write('{}> {}'.format(count, res[2]))
+             st.write('Score: %.4f' % (res[1]))
+             st.write("From the article with title: {}".format(embeddings[res[0]][0]))
+             st.write("\n")
+             if count > 3:
+                 break
+             count += 1
+     else:
+         st.info("No answer was found.")
+
+
+ if __name__ == '__main__':
+     main()
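
Neither pickled input is documented in this commit, so the layout below is inferred from how app.py and utils.py index them; treat it as an assumption rather than a spec. A minimal sketch:

# Inferred layout (assumption, based on the indexing in app.py / utils.py):
#   listfile_3.data       -> articles[i] == [title, abstract, body_texts]
#                            where body_texts[j][0] is the j-th paragraph's text
#   list_of_articles.pkl  -> list_of_articles[i] == sentence embeddings of article i's abstract
import pickle

with open('listfile_3.data', 'rb') as fh:
    articles = pickle.load(fh)

print(articles[0][0])        # title of the first article
print(articles[0][2][0][0])  # text of its first body paragraph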
requirements.txt ADDED
@@ -0,0 +1,9 @@
+
+ nltk
+ pandas
+ scipy
+ numpy
+ sentence-transformers==0.2.5.1
+ transformers==2.5.1
+ spacy
+ streamlit
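
One note on this list: app.py also imports torch directly (it is only pulled in indirectly by the pinned libraries), loads spaCy's en_core_web_sm model, and fetches NLTK's punkt tokenizer at startup. A quick pre-flight check for those runtime pieces (a sketch, not part of the commit):

# Sketch: sanity-check the runtime pieces that requirements.txt does not list explicitly.
import nltk
import spacy
import torch  # not pinned here; installed transitively with sentence-transformers

nltk.download('punkt')        # app.py's main() also does this on every run
spacy.load('en_core_web_sm')  # install first with: python -m spacy download en_core_web_sm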
utils.py ADDED
@@ -0,0 +1,123 @@
+
+ import pandas as pd
+ import numpy as np
+ import pickle
+ import glob
+ import json
+ from pandas.io.json import json_normalize
+ from nltk.tokenize import sent_tokenize
+ import nltk
+ import scipy.spatial
+ from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForQuestionAnswering
+ from sentence_transformers import models, SentenceTransformer
+
+
+ def get_full_sentence(spacy_nlp, para_text, start_index, end_index):
+     """
+     Return the sentence(s) of the given paragraph (body text) that contain
+     the character span [start_index, end_index].
+     """
+     sent_start = 0
+     sent_end = len(para_text)
+     for sent in spacy_nlp(para_text).sents:
+         if (sent.start_char <= start_index) and (sent.end_char >= start_index):
+             sent_start = sent.start_char
+         if (sent.start_char <= end_index) and (sent.end_char >= end_index):
+             sent_end = sent.end_char
+     sentence = para_text[sent_start:sent_end + 1]
+     return sentence
+
+
+ def fetch_stage1(query, model, list_of_articles):
+     """
+     Compare the query with every article's abstract sentences and return,
+     per article, the index and cosine similarity of the best-matching sentence.
+     """
+
+     # Encode the query
+     query_embedding = model.encode([query])[0]
+
+     all_abs_distances = []
+
+     for idx_of_article, article in enumerate(list_of_articles):
+         if article:
+             distances = []
+             # Cosine distance between the query and every abstract-sentence embedding
+             cdists = scipy.spatial.distance.cdist([query_embedding], np.vstack(article), "cosine").reshape(-1, 1)
+             for idx, sentence in enumerate(article):
+                 distances.append((idx, 1 - cdists[idx][0]))
+
+             results = sorted(distances, key=lambda x: x[1], reverse=True)
+             if results:
+                 all_abs_distances.append((idx_of_article, results[0][0], results[0][1]))
+
+     results = sorted(all_abs_distances, key=lambda x: x[2], reverse=True)
+
+     return query_embedding, results
+
+
+ def fetch_stage2(results, model, embeddings, query_embedding):
+     """
+     Take the 20 most similar articles (by abstract) and compare all of their
+     body-text paragraphs to the query.
+     """
+
+     all_text_distances = []
+     for top in results[0:20]:
+         article_idx = top[0]
+
+         # Encode only the body texts of the 20 best articles
+         body_texts = [text[0] for text in embeddings[article_idx][2]]
+         body_text_embeddings = model.encode(body_texts, show_progress_bar=False)
+
+         # Cosine distance between the query and every body-text paragraph embedding
+         qbody = scipy.spatial.distance.cdist([query_embedding],
+                                              np.vstack(body_text_embeddings),
+                                              "cosine").reshape(-1, 1)
+
+         body_text_distances = [(idx, 1 - dist[0]) for idx, dist in enumerate(qbody)]
+
+         results = sorted(body_text_distances, key=lambda x: x[1], reverse=True)
+
+         if results:
+             all_text_distances.append((article_idx, results[0][0], results[0][1]))
+
+     results = sorted(all_text_distances, key=lambda x: x[2], reverse=True)
+
+     return results
+
+
+ def fetch_stage3(results, query, embeddings, comprehension_model, spacy_nlp):
+     """
+     Run the question-answering model over the top 20 retrieved paragraphs
+     and return the answers sorted by score, highest first.
+     """
+
+     answers = []
+
+     for top_text in results[0:20]:
+         article_idx = top_text[0]
+         body_text_idx = top_text[1]
+
+         query_ = {"context": embeddings[article_idx][2][body_text_idx][0], "question": query}
+         pred = comprehension_model(query_, topk=1)
+
+         # If there is any answer
+         if pred["answer"] and round(pred["score"], 4) > 0:
+             # Take the full sentence containing the answer span from the paragraph
+             sent = get_full_sentence(spacy_nlp, query_['context'], pred["start"], pred["end"])
+             answers.append((article_idx, round(pred["score"], 4), sent))
+
+     results = sorted(answers, key=lambda x: x[1], reverse=True)
+
+     return results
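
Both retrieval stages rank candidates by the same score: scipy's cdist returns a cosine distance, and 1 - distance is used as the similarity. A tiny self-contained illustration of that pattern with toy vectors (not the real embeddings):

import numpy as np
import scipy.spatial

query_embedding = np.array([1.0, 0.0, 0.0])
sentence_embeddings = np.vstack([[1.0, 0.0, 0.0],    # same direction -> similarity ~1.0
                                 [0.0, 1.0, 0.0]])   # orthogonal     -> similarity ~0.0

cdists = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine").reshape(-1, 1)
similarities = [(idx, 1 - d[0]) for idx, d in enumerate(cdists)]
best = sorted(similarities, key=lambda x: x[1], reverse=True)[0]
print(best)   # roughly (0, 1.0): the best-matching sentence and its score, as in fetch_stage1/fetch_stage2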