# Hugging Face Space: semantic PDF search app (scraped status text removed).
import os
import tempfile

import fitz  # PyMuPDF for parsing PDF
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model shared by all similarity scoring in this app.
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
def extract_text_from_pdf(pdf_path):
    """Yield ``(page_number, page_text)`` for each page of the PDF at *pdf_path*.

    Page numbers are 1-based. Pages are yielded lazily, so large documents
    are never held fully in memory.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Yields:
        Tuples of (1-based page number, extracted text of that page).
    """
    # NOTE: the original also accumulated every page into an unused `text`
    # string — quadratic-time dead code for a generator; removed.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            yield page_num + 1, page.get_text()
def semantic_search(query, documents, top_k=5):
    """Rank *documents* by semantic similarity to *query*.

    Args:
        query: Free-text search string.
        documents: Iterable of ``(page_number, text)`` pairs.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most *top_k* ``(page_number, text, score)`` tuples,
        sorted by descending cosine-similarity score. Empty input -> ``[]``.
    """
    documents = list(documents)
    if not documents:
        return []
    query_embedding = model.encode(query, convert_to_tensor=True)
    document_embeddings = model.encode(
        [text for _, text in documents], convert_to_tensor=True
    )
    # pytorch_cos_sim returns a (1, n_docs) matrix; take row 0 to get the
    # per-document scores. (Iterating the matrix itself yields a single ROW,
    # so the original only ever produced one "result" and .item() raised for
    # more than one document.)
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    results = [
        (page_num, text, score.item())
        for (page_num, text), score in zip(documents, cosine_scores)
    ]
    results.sort(key=lambda r: r[2], reverse=True)
    return results[:top_k]
def main():
    """Streamlit entry point: upload a PDF and run a semantic search over it."""
    st.title("Semantic Search on PDF Documents")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    query = st.text_input("Enter your query:")
    if st.button("Search"):
        # Guard clauses: tell the user what is missing instead of silently
        # doing nothing (original ignored a missing file and an empty query).
        if not pdf_file:
            st.warning("Please upload a PDF file first.")
            return
        if not query.strip():
            st.warning("Please enter a query.")
            return
        # Persist the upload to a private temp file so fitz can open it by
        # path. The original wrote the user-controlled filename into the
        # working directory (collision/traversal risk) and leaked the file
        # if extraction or search raised.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            f.write(pdf_file.read())
            pdf_path = f.name
        try:
            pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
            search_results = semantic_search(query, pdf_text_with_pages)
        finally:
            os.remove(pdf_path)  # always clean up the temp file
        st.write(f"Search results for query: '{query}'")
        for i, (page_num, result_text, score) in enumerate(search_results, start=1):
            with st.container():
                st.write(f"Result {i} - Page {page_num}")
                st.write(f"Score: {score:.2f}")
                st.write(result_text)


if __name__ == "__main__":
    main()