# app.py -- "Update app.py" (commit b06f609) by ivyblossom, 2.46 kB.
# (Header reconstructed from file-viewer page text: raw / history / blame links.)
import os
import fitz # PyMuPDF for parsing PDF
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# Name of the pre-trained SentenceTransformer checkpoint used for all embeddings.
model_name = "paraphrase-MiniLM-L6-v2"

# Streamlit re-executes this script on every widget interaction, so cache the
# loaded model instead of re-instantiating it on each run. ``st.cache_resource``
# is missing on older Streamlit releases; fall back to an uncached load there.
_cache_model = getattr(st, "cache_resource", lambda func: func)


@_cache_model
def _load_model(name):
    """Return a SentenceTransformer instance for the checkpoint called *name*."""
    return SentenceTransformer(name)


model = _load_model(model_name)
def extract_text_from_pdf(pdf_path):
    """Yield ``(page_number, page_text)`` for every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Yields:
        Tuples of the 1-based page number and the text returned by
        ``page.get_text()`` for that page, in document order.
    """
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(pdf_document.page_count):
            page = pdf_document.load_page(page_index)
            # Pages are loaded by 0-based index; callers receive 1-based numbers.
            yield page_index + 1, page.get_text()
def semantic_search(query, documents, top_k=5):
    """Rank documents by cosine similarity to a query.

    Args:
        query: Query string to embed with the module-level ``model``.
        documents: Iterable of ``(page_number, text)`` pairs.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` ``(page_number, text, score)`` tuples,
        sorted by descending score. Empty when ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; also avoids encoding an empty batch.
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)
    document_embeddings = model.encode(
        [text for _, text in documents], convert_to_tensor=True
    )

    # pytorch_cos_sim returns a (num_queries, num_documents) matrix. With a
    # single query, row 0 holds one score per document; iterating over the
    # matrix itself would only ever visit that single row.
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

    results = [
        (page_num, text, score)
        for (page_num, text), score in zip(documents, cosine_scores.tolist())
    ]
    results.sort(key=lambda item: item[2], reverse=True)
    return results[:top_k]
def main():
    """Run the Streamlit UI: upload a PDF, enter a query, show the best pages.

    On "Search", the uploaded PDF is written to a temporary file, its pages
    are extracted and ranked against the query, and each result is rendered
    with its page number and similarity score.
    """
    import tempfile

    st.title("Semantic Search on PDF Documents")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    query = st.text_input("Enter your query:")

    if not st.button("Search"):
        return
    if not pdf_file:
        st.warning("Please upload a PDF file first.")
        return
    if not query.strip():
        st.warning("Please enter a query.")
        return

    # Write to a private temp file rather than cwd/<uploaded name>: the upload
    # name is user-controlled and could collide with or overwrite local files.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        # getvalue() returns the whole upload regardless of stream position.
        tmp.write(pdf_file.getvalue())
        pdf_path = tmp.name

    try:
        # Extract text from the PDF along with page numbers
        pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
        search_results = semantic_search(query, pdf_text_with_pages)
    finally:
        # Always delete the temporary copy, even if parsing or search fails.
        os.remove(pdf_path)

    st.write(f"Search results for query: '{query}'")
    for i, (page_num, result_text, score) in enumerate(search_results, start=1):
        with st.container():
            st.write(f"Result {i} - Page {page_num}")
            st.write(f"Score: {score:.2f}")
            st.write(result_text)
# Script entry point: build the UI only when this file is executed directly.
if __name__ == "__main__":
    main()