ivyblossom committed on
Commit
e4a1f31
1 Parent(s): 40e834e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -0
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF for parsing PDF
3
+ import streamlit as st
4
+ from sentence_transformers import SentenceTransformer, util
5
+
6
# Name of the pre-trained SentenceTransformer checkpoint to use.
# You can change this to a different model if needed.
model_name = "paraphrase-MiniLM-L6-v2"


@st.cache_resource
def _load_model(name):
    """Load the SentenceTransformer called ``name``, cached across reruns."""
    return SentenceTransformer(name)


# Streamlit re-executes this script on every widget interaction; going through
# the cached loader keeps the model from being re-instantiated on each rerun.
model = _load_model(model_name)
9
+
10
+ # Function to extract text from a PDF file
11
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are read in order with PyMuPDF (``fitz``) and their text is joined
    without any separator. The document is closed when reading finishes.
    """
    with fitz.open(pdf_path) as doc:
        page_texts = (
            doc.load_page(index).get_text()
            for index in range(doc.page_count)
        )
        # Consume the generator while the document is still open.
        return "".join(page_texts)
18
+
19
+ # Function to perform semantic search
20
def semantic_search(query, documents, top_k=5):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: Query text, embedded with the module-level ``model``.
        documents: List of document strings to rank.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` ``(document, score)`` tuples ordered from
        the highest to the lowest score. Empty when ``documents`` is empty.
    """
    if not documents:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)
    document_embeddings = model.encode(documents, convert_to_tensor=True)

    # The similarity matrix has one row per query and one column per document.
    # There is a single query, so row 0 holds the per-document scores;
    # iterating over the matrix itself would only ever visit that one row.
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)
    scores = cosine_scores[0].tolist()

    ranked = sorted(
        zip(documents, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_k]
36
+
37
def main():
    """Render the Streamlit page: take a query and a PDF, show ranked results.

    When "Search" is pressed, the uploaded PDF is written to a temporary file,
    its text is extracted with ``extract_text_from_pdf``, and the text is
    scored against the query with ``semantic_search``. The temporary file is
    always removed, even if text extraction fails.
    """
    # Local import so this block does not depend on the file's import header.
    import tempfile

    st.title("Semantic Search on PDF Documents")

    query = st.text_input("Enter your query:")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])

    if not st.button("Search"):
        return
    if not pdf_file:
        st.warning("Please upload a PDF file before searching.")
        return
    if not query.strip():
        st.warning("Please enter a query before searching.")
        return

    # A temporary file avoids relying on an "uploads" directory existing and
    # keeps the user-supplied file name out of the filesystem path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_file.read())
        pdf_path = tmp.name

    try:
        pdf_text = extract_text_from_pdf(pdf_path)
    finally:
        os.remove(pdf_path)  # Delete the uploaded file after processing

    search_results = semantic_search(query, [pdf_text])

    st.write(f"Search results for query: '{query}'")
    for i, (result, score) in enumerate(search_results, start=1):
        st.write(f"{i}. Score: {score:.2f}")
        st.write(result)
57
+
58
# Build the page only when this file is executed as the entry-point script.
if __name__ == "__main__":
    main()