Nirav-Khanpara commited on
Commit
9ed0ab0
β€’
1 Parent(s): e2674bd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit "Query PDF" tool.

Flow: upload a PDF -> extract text (OCR fallback for scanned PDFs) ->
chunk -> embed with Instructor -> FAISS index cached as a pickle on disk ->
answer free-text questions via a RetrievalQA chain over Google PaLM.
"""

from dotenv import load_dotenv
# Load .env first so GOOGLE_API_KEY etc. are in the environment before any
# LangChain client is constructed below.
load_dotenv()

import os
import pickle
import streamlit as st
from scan_pdf_parser import get_text_from_scanned_pdf
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import GooglePalm
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

llm = GooglePalm(temperature=0.9)

st.title("Query PDF Tool")

uploaded_file = st.file_uploader("Choose a PDF file")
main_placeholder = st.empty()
second_placeholder = st.empty()


if uploaded_file:
    # The cached FAISS index is the real "already processed" marker.
    # BUGFIX: the original gated the build on the *PDF* existing on disk, so
    # if the PDF was saved but the index dump never happened (crash, partial
    # run), the app silently skipped indexing and queries could never answer.
    vector_store_path = f'vector_store_{uploaded_file.name}.pkl'

    if not os.path.exists(vector_store_path):
        main_placeholder.text("Data Loading...Started...βŒ›βŒ›βŒ›")
        # Persist the upload so file-path-based loaders (PyPDFLoader / OCR)
        # can read it.
        with open(f'{uploaded_file.name}', 'wb') as f:
            f.write(uploaded_file.getbuffer())

        pdf_loader = PyPDFLoader(uploaded_file.name)
        documents = pdf_loader.load()

        # Concatenate page texts in one pass (avoids the quadratic += loop).
        raw_text = ''.join(doc.page_content for doc in documents)

        # Almost no extractable text -> very likely a scanned/image-only PDF;
        # fall back to OCR.
        if len(raw_text) < 10:
            main_placeholder.text("It looks like Scanned PDF, No worries converting it...βŒ›βŒ›βŒ›")
            raw_text = get_text_from_scanned_pdf(uploaded_file.name)

        main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
        text_splitter = RecursiveCharacterTextSplitter(
            separators=['\n\n', '\n', '.', ','],
            chunk_size=2000
        )

        texts = text_splitter.split_text(raw_text)
        docs = [Document(page_content=t) for t in texts]

        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
        main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
        vectorstore = FAISS.from_documents(docs, embeddings)

        # Cache the FAISS index so reruns of the Streamlit script skip the
        # expensive embedding step. NOTE(review): pickle is acceptable here
        # only because we load files this app itself wrote — never unpickle
        # user-supplied data.
        with open(vector_store_path, "wb") as f:
            pickle.dump(vectorstore, f)

        main_placeholder.text("Data Loading...Completed...βœ…βœ…βœ…")

    query = second_placeholder.text_input("Question:")
    if query:
        if os.path.exists(vector_store_path):
            with open(vector_store_path, "rb") as f:
                vector_store = pickle.load(f)

            # Minimal stuffing prompt: retrieved chunks go in <context>,
            # the user question follows.
            prompt_template = """
<context>
{context}
</context>
Question: {question}
Assistant:"""
            prompt = PromptTemplate(
                template=prompt_template, input_variables=["context", "question"]
            )

            # k=1: only the single most similar chunk is stuffed into the
            # prompt, keeping it well inside the model's context window.
            chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt}
            )

            with st.spinner("Searching for the answer..."):
                result = chain({"query": query})
            st.header("Answer")
            st.write(result["result"])