# Hugging Face Space: semantic PDF search app (scraped status text removed).
import os
import tempfile

import fitz  # PyMuPDF for parsing PDF
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model shared by all similarity scoring in this app.
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
def extract_text_from_pdf(pdf_path):
    """Yield ``(page_number, page_text)`` for each page of the PDF at *pdf_path*.

    Page numbers are 1-based. Pages are yielded lazily, so large documents
    are never held fully in memory.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Yields:
        Tuples of (1-based page number, extracted text of that page).
    """
    # NOTE: the original also accumulated every page into an unused `text`
    # string — quadratic-time dead code for a generator; removed.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            yield page_num + 1, page.get_text()
def semantic_search(query, documents, top_k=5):
    """Rank *documents* by semantic similarity to *query*.

    Args:
        query: Free-text search string.
        documents: Iterable of ``(page_number, text)`` pairs.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most *top_k* ``(page_number, text, score)`` tuples,
        sorted by descending cosine-similarity score. Empty input -> ``[]``.
    """
    documents = list(documents)
    if not documents:
        return []
    query_embedding = model.encode(query, convert_to_tensor=True)
    document_embeddings = model.encode(
        [text for _, text in documents], convert_to_tensor=True
    )
    # pytorch_cos_sim returns a (1, n_docs) matrix; take row 0 to get the
    # per-document scores. (Iterating the matrix itself yields a single ROW,
    # so the original only ever produced one "result" and .item() raised for
    # more than one document.)
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    results = [
        (page_num, text, score.item())
        for (page_num, text), score in zip(documents, cosine_scores)
    ]
    results.sort(key=lambda r: r[2], reverse=True)
    return results[:top_k]
def main():
    """Streamlit entry point: upload a PDF and run a semantic search over it."""
    st.title("Semantic Search on PDF Documents")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    query = st.text_input("Enter your query:")
    if st.button("Search"):
        # Guard clauses: tell the user what is missing instead of silently
        # doing nothing (original ignored a missing file and an empty query).
        if not pdf_file:
            st.warning("Please upload a PDF file first.")
            return
        if not query.strip():
            st.warning("Please enter a query.")
            return
        # Persist the upload to a private temp file so fitz can open it by
        # path. The original wrote the user-controlled filename into the
        # working directory (collision/traversal risk) and leaked the file
        # if extraction or search raised.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            f.write(pdf_file.read())
            pdf_path = f.name
        try:
            pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
            search_results = semantic_search(query, pdf_text_with_pages)
        finally:
            os.remove(pdf_path)  # always clean up the temp file
        st.write(f"Search results for query: '{query}'")
        for i, (page_num, result_text, score) in enumerate(search_results, start=1):
            with st.container():
                st.write(f"Result {i} - Page {page_num}")
                st.write(f"Score: {score:.2f}")
                st.write(result_text)


if __name__ == "__main__":
    main()