import os
import tempfile

import fitz  # PyMuPDF for parsing PDF
import streamlit as st
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SentenceTransformer model
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)


# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Yield ``(page_number, page_text)`` for every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Yields:
        Tuples of the 1-based page number and the text extracted from
        that page.
    """
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(pdf_document.page_count):
            page = pdf_document.load_page(page_index)
            # Page numbers are reported 1-based for display purposes.
            yield page_index + 1, page.get_text()


# Function to perform semantic search
def semantic_search(query, documents, top_k=5):
    """Rank documents by cosine similarity to the query.

    Args:
        query: The search string.
        documents: Sequence of ``(page_number, text)`` pairs.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` tuples ``(page_number, text, score)``
        sorted by descending similarity score. Empty when ``documents``
        is empty.
    """
    if not documents:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)

    # Convert the list of documents to embeddings
    document_embeddings = model.encode(
        [text for _, text in documents], convert_to_tensor=True
    )

    # Compute cosine similarity scores of query with documents.
    # The result is a (1, len(documents)) matrix, so row 0 holds one
    # score per document; iterating the matrix itself would only ever
    # visit that single row.
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

    results = [
        (page_num, text, score)
        for (page_num, text), score in zip(documents, cosine_scores.tolist())
    ]

    # Sort the results in decreasing order
    results.sort(key=lambda item: item[2], reverse=True)
    return results[:top_k]


def main():
    """Run the Streamlit UI: upload a PDF, enter a query, show top pages."""
    st.title("Semantic Search on PDF Documents")

    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    query = st.text_input("Enter your query:")

    if not st.button("Search"):
        return
    if not pdf_file:
        st.warning("Please upload a PDF file before searching.")
        return
    if not query.strip():
        st.warning("Please enter a query before searching.")
        return

    # Write the upload to a private temporary file rather than to the
    # working directory under a client-supplied name, which could collide
    # with or overwrite existing files.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name

    try:
        # Extract text from the PDF along with page numbers
        pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
        search_results = semantic_search(query, pdf_text_with_pages)
    finally:
        # Delete the uploaded file after processing, even on failure
        os.remove(pdf_path)

    st.write(f"Search results for query: '{query}'")
    if not search_results:
        st.info("No pages were found in the uploaded PDF.")
        return

    for i, (page_num, result_text, score) in enumerate(search_results, start=1):
        with st.container():
            st.write(f"Result {i} - Page {page_num}")
            st.write(f"Score: {score:.2f}")
            st.write(result_text)


if __name__ == "__main__":
    main()