testcolab2 committed on
Commit
89c2788
·
verified ·
1 Parent(s): a60407d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -205
app.py CHANGED
@@ -1,214 +1,74 @@
1
- from langchain_community.document_loaders import DirectoryLoader
2
- from langchain_community.document_loaders import PyPDFLoader
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
7
- from langchain.embeddings import HuggingFaceInstructEmbeddings
8
- from langchain.chains import ConversationChain
9
- from langchain.memory import ConversationBufferMemory
10
- from langchain.chains import (
11
- StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
12
- )
13
- from langchain_core.prompts import PromptTemplate
14
  import streamlit as st
 
 
 
 
15
  from PyPDF2 import PdfReader
16
 
17
- css = '''
18
- <style>
19
- .chat-message {
20
- padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
21
- }
22
- .chat-message.user {
23
- background-color: #2b313e
24
- }
25
- .chat-message.bot {
26
- background-color: #475063
27
- }
28
- .chat-message .avatar {
29
- width: 20%;
30
- }
31
- .chat-message .avatar img {
32
- max-width: 78px;
33
- max-height: 78px;
34
- border-radius: 50%;
35
- object-fit: cover;
36
- }
37
- .chat-message .message {
38
- width: 80%;
39
- padding: 0 1.5rem;
40
- color: #fff;
41
- }
42
- '''
43
-
44
- bot_template = '''
45
- <div class="chat-message bot">
46
- <div class="avatar">
47
- <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png">
48
- </div>
49
- <div class="message">{{MSG}}</div>
50
- </div>
51
- '''
52
-
53
- user_template = '''
54
- <div class="chat-message user">
55
- <div class="avatar">
56
- <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
57
- </div>
58
- <div class="message">{{MSG}}</div>
59
- </div>
60
- '''
61
-
62
- def get_pdf_text(pdf_docs):
63
- text = ""
64
- for pdf in pdf_docs:
65
- pdf_reader = PdfReader(pdf)
66
- for page in pdf_reader.pages:
67
- text += page.extract_text()
68
- return text
69
-
70
-
71
- def get_text_chunks(text):
72
- text_splitter = CharacterTextSplitter(
73
- separator="\n",
74
- chunk_size=1000,
75
- chunk_overlap=200,
76
- length_function=len
77
- )
78
- chunks = text_splitter.split_text(text)
79
- return chunks
80
-
81
-
82
- def get_vectorstore(text_chunks):
83
- # embeddings = OpenAIEmbeddings()
84
- embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
85
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
86
- return vectorstore
87
-
88
-
89
- def get_conversation_chain(vectorstore):
90
- # llm = ChatOpenAI()
91
- llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
92
-
93
- memory = ConversationBufferMemory(
94
- memory_key='chat_history', return_messages=True)
95
- conversation_chain = ConversationalRetrievalChain.from_llm(
96
- llm=llm,
97
- retriever=vectorstore.as_retriever(),
98
- memory=memory
99
- )
100
- return conversation_chain
101
-
102
-
103
-
104
-
105
- def handle_userinput(user_question):
106
- response = st.session_state.conversation({'question': user_question})
107
- st.session_state.chat_history = response['chat_history']
108
-
109
- for i, message in enumerate(st.session_state.chat_history):
110
- if i % 2 == 0:
111
- st.write(user_template.replace(
112
- "{{MSG}}", message.content), unsafe_allow_html=True)
113
  else:
114
- st.write(bot_template.replace(
115
- "{{MSG}}", message.content), unsafe_allow_html=True)
116
-
117
-
118
- def initialize_conversation_chain(text_chunks):
119
- vectorstore = get_vectorstore(text_chunks)
120
- return get_conversation_chain(vectorstore)
121
-
122
- def main():
123
- st.set_page_config(page_title="Chat with multiple PDFs", page_icon="logo1.png")
124
- st.write(css, unsafe_allow_html=True)
125
-
126
- if "conversation" not in st.session_state:
127
- st.session_state.conversation = None
128
-
129
- if "chat_history" not in st.session_state:
130
- st.session_state.chat_history = None
131
-
132
- st.header("Chat with multiple PDFs :books:")
133
- user_question = st.text_input("Ask a question about your documents:")
134
-
135
- if user_question:
136
- handle_userinput(user_question)
137
-
138
- with st.sidebar:
139
- st.subheader("Your documents")
140
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
141
-
142
- if st.button("Process"):
143
- with st.spinner("Processing"):
144
- # Get PDF text
145
- raw_text = get_pdf_text(pdf_docs)
146
-
147
- # Get the text chunks
148
- text_chunks = get_text_chunks(raw_text)
149
-
150
- vectorstore = get_vectorstore(text_chunks)
151
-
152
- # Initialize conversation chain
153
- st.session_state.conversation = get_conversation_chain(vectorstore)
154
-
155
-
156
-
157
- if __name__ == '__main__':
158
- main()
159
-
160
-
161
-
162
- # def main():
163
-
164
- # st.set_page_config(page_title="Chat with multiple PDFs",
165
- # page_icon="logo1.png" )
166
- # st.write(css, unsafe_allow_html=True)
167
-
168
- # if "conversation" not in st.session_state:
169
- # st.session_state.conversation = None
170
- # if "chat_history" not in st.session_state:
171
- # st.session_state.chat_history = None
172
-
173
- # st.header("Chat with multiple PDFs :books:")
174
- # user_question = st.text_input("Ask a question about your documents:")
175
- # if user_question:
176
- # handle_userinput(user_question)
177
-
178
- # with st.sidebar:
179
- # st.subheader("Your documents")
180
- # pdf_docs = st.file_uploader(
181
- # "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
182
-
183
-
184
- # if st.button("Process"):
185
- # with st.spinner("Processing"):
186
- # # get pdf text
187
- # raw_text = get_pdf_text(pdf_docs)
188
-
189
- # # get the text chunks
190
- # text_chunks = get_text_chunks(raw_text)
191
-
192
- # # create vector store
193
- # vectorstore = get_vectorstore(text_chunks)
194
-
195
- # # create conversation chain
196
- # st.session_state.conversation = get_conversation_chain(
197
- # vectorstore)
198
-
199
 
 
 
 
 
 
 
 
200
 
201
- # # Clear chat history
202
- # st.session_state.chat_history = None
203
-
204
- # if st.session_state.conversation is not None:
205
- # if st.session_state.chat_history is None:
206
- # # Greet the user
207
- # greeting = "Hello! How can I assist you with your documents?"
208
- # st.write(bot_template.replace("{{MSG}}", greeting), unsafe_allow_html=True)
209
 
210
-
 
211
 
 
 
212
 
213
- # if __name__ == '__main__':
214
- # main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from llama_index import VectorStoreIndex, ServiceContext
3
+ from llama_index.embeddings import HuggingFaceEmbedding
4
+ from llama_index.llms import HuggingFaceInferenceAPI
5
+ from llama_index.schema import Document
6
  from PyPDF2 import PdfReader
7
 
8
class DocumentLoader:
    """Reads an uploaded PDF and wraps its text in llama_index Documents."""

    @staticmethod
    def read_pdf(uploaded_file):
        """Extract and concatenate the text of every page of *uploaded_file*.

        Args:
            uploaded_file: A file-like object (e.g. a Streamlit UploadedFile)
                accepted by PyPDF2's ``PdfReader``.

        Returns:
            str: All page texts concatenated in page order. Pages with no
            extractable text (e.g. scanned images) contribute nothing.
        """
        pdf_reader = PdfReader(uploaded_file)
        # Iterate the pages directly instead of indexing via range(len(...)).
        # extract_text() may return None for image-only pages; the original
        # "text += page.extract_text()" would raise TypeError there, so we
        # substitute an empty string.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def load_documents(uploaded_pdf):
        """Return the PDF's full text as a single-element Document list.

        Returns:
            list[Document]: One Document holding the whole PDF text, the
            shape expected by ``VectorStoreIndex.from_documents``.
        """
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]
21
+
22
class IndexCreator:
    """Builds a persisted vector index and exposes it as a query engine."""

    @staticmethod
    def create_index(documents, hf_token):
        """Index *documents* and return a query engine over the result.

        Args:
            documents: Iterable of llama_index ``Document`` objects.
            hf_token: Hugging Face API token used by the remote inference LLM.

        Returns:
            A query engine bound to the freshly built (and persisted) index.
        """
        # Remote zephyr-7b-alpha through the HF Inference API, paired with a
        # locally loaded UAE-Large-V1 embedding model.
        inference_llm = HuggingFaceInferenceAPI(
            model_name="HuggingFaceH4/zephyr-7b-alpha", token=hf_token
        )
        embedding_model = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")

        context = ServiceContext.from_defaults(
            llm=inference_llm,
            chunk_size=800,
            chunk_overlap=20,
            embed_model=embedding_model,
        )

        vector_index = VectorStoreIndex.from_documents(
            documents, service_context=context, show_progress=True
        )
        # Persist to llama_index's default storage directory before handing
        # back the query interface.
        vector_index.storage_context.persist()
        return vector_index.as_query_engine()
34
+
35
class PDFQueryApp:
    """Streamlit app: upload a PDF, build a vector index over it, and
    answer free-form questions against that index."""

    def __init__(self):
        """Render the static header UI and collect the HF token and PDF."""
        st.title("Private LLM")
        st.write("Base Model : **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggingFace)**")
        st.write("Embedding Model : **WhereIsAI/UAE-Large-V1 (open-source from HuggingFace)**")
        st.write("Ask anything from the data that you upload")
        # BUG FIX: the original called st.wrte(), which raised AttributeError
        # on startup; also corrected the garbled wording of the notice.
        st.write("Note !! As it is running on a CPU it takes 5 to 8 mins for each response")

        self.hf_token = st.text_input("Enter your Hugging Face token [Free]:")
        self.uploaded_pdf = st.file_uploader("Upload your data[PDF for now]", type=['pdf'])
        # Populated by load_and_create_index(); None until a PDF is indexed.
        self.query_engine = None

    def load_and_create_index(self):
        """Build the query engine from the uploaded PDF, or warn if none."""
        if self.uploaded_pdf:
            st.success("Dataset has been loaded into the model successfully")
            documents = DocumentLoader.load_documents(self.uploaded_pdf)
            self.query_engine = IndexCreator.create_index(documents, self.hf_token)
            st.success("Vector embeddings have been successfully created and initiated")
        else:
            st.warning("You have to upload a PDF file first.")

    def run_query(self, user_query):
        """Run *user_query* against the index and render the model response.

        Warns instead of querying when no index has been built yet or the
        query string is empty.
        """
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model Please wait !!!!...'):
                response = self.query_engine.query(user_query)
                st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")
63
 
64
if __name__ == "__main__":
    # Build the UI and wire the whole pipeline end-to-end.
    app = PDFQueryApp()

    # Index the uploaded PDF (emits a warning when nothing is uploaded yet).
    app.load_and_create_index()

    # Collect the user's question and answer it against the index.
    user_query = st.text_input("Enter your query from the dataset:")
    app.run_query(user_query)