ivyblossom committed on
Commit
0f897d9
1 Parent(s): 7723d60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -3
app.py CHANGED
@@ -2,9 +2,10 @@ import os
2
  import fitz # PyMuPDF for parsing PDF
3
  import streamlit as st
4
  from sentence_transformers import SentenceTransformer, util
 
5
 
6
  # Load a pre-trained SentenceTransformer model
7
- model_name = "paraphrase-MiniLM-L6-v2"
8
  model = SentenceTransformer(model_name)
9
 
10
  # Function to extract text from a PDF file
@@ -17,8 +18,14 @@ def extract_text_from_pdf(pdf_path):
17
  text += page_text
18
  yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
19
 
 
 
 
 
 
 
20
  # Function to perform semantic search
21
- def semantic_search(query, documents, top_k=5):
22
  query_embedding = model.encode(query, convert_to_tensor=True)
23
 
24
  # Convert the list of documents to embeddings
@@ -30,7 +37,9 @@ def semantic_search(query, documents, top_k=5):
30
  # Sort the results in decreasing order
31
  results = []
32
  for idx in range(len(cosine_scores)):
33
- results.append((documents[idx][0], documents[idx][1], cosine_scores[idx].item()))
 
 
34
  results = sorted(results, key=lambda x: x[2], reverse=True)
35
 
36
  return results[:top_k]
 
2
  import fitz # PyMuPDF for parsing PDF
3
  import streamlit as st
4
  from sentence_transformers import SentenceTransformer, util
5
+ import re
6
 
7
  # Load a pre-trained SentenceTransformer model
8
+ model_name = "paraphrase-MiniLM-L6-v2"
9
  model = SentenceTransformer(model_name)
10
 
11
  # Function to extract text from a PDF file
 
18
  text += page_text
19
  yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
20
 
21
# Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=500):
    """Truncate *text* to at most *max_words* words, cutting on word boundaries.

    Splits on whitespace so punctuation stays attached to its word
    (the previous ``re.findall(r'\w+', ...)`` approach stripped all
    punctuation, e.g. "don't" became "don t").

    Args:
        text: The input string to truncate.
        max_words: Maximum number of words to keep (default 500).

    Returns:
        A string containing the first ``max_words`` whitespace-delimited
        words of ``text``, joined by single spaces. Runs of whitespace in
        the input are collapsed; an empty input yields "".
    """
    words = text.split()
    return ' '.join(words[:max_words])
26
+
27
  # Function to perform semantic search
28
+ def semantic_search(query, documents, top_k=5, max_words=500):
29
  query_embedding = model.encode(query, convert_to_tensor=True)
30
 
31
  # Convert the list of documents to embeddings
 
37
  # Sort the results in decreasing order
38
  results = []
39
  for idx in range(len(cosine_scores)):
40
+ page_num, text = documents[idx]
41
+ truncated_text = truncate_to_word_boundary(text, max_words)
42
+ results.append((page_num, truncated_text, cosine_scores[idx].item()))
43
  results = sorted(results, key=lambda x: x[2], reverse=True)
44
 
45
  return results[:top_k]