Darpan07 committed on
Commit 4cf62a0
1 Parent(s): e556297

Update app.py

Files changed (1)
  1. app.py +140 -58
app.py CHANGED
@@ -4,62 +4,125 @@ import time
  import streamlit as st
  from dotenv import load_dotenv
  from PyPDF2 import PdfReader
- from langchain_openai import OpenAI, OpenAIEmbeddings
  from langchain.prompts import PromptTemplate
  from langchain.chains import LLMChain
  from langchain.memory import ConversationBufferWindowMemory
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import FAISS
-

  # load the environment variables into the python script
- load_dotenv()
  # fetching the openai_api_key environment variable
- openai_api_key = os.getenv('OPENAI_API_KEY')


  # Initialize session states
- if 'vectorDB' not in st.session_state:
      st.session_state.vectorDB = None
  if "messages" not in st.session_state:
      st.session_state.messages = []
- if 'bot_name' not in st.session_state:
-     st.session_state.bot_name = ''
- if 'chain' not in st.session_state:
      st.session_state.chain = None


- def get_pdf_text(pdf) -> str:
-     """ This function extracts the text from the PDF file """
      text = ""
-     pdf_reader = PdfReader(pdf)
-     for page in pdf_reader.pages:
-         text += page.extract_text()

      return text


  def get_vectorstore(text_chunks):
-     """ This function will create a vector database as well as create and store the embedding of the text chunks into the VectorDB """
      embeddings = OpenAIEmbeddings()
      vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
      return vectorstore

  def get_text_chunks(text: str):
-     """ This function will split the text into the smaller chunks"""
      text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1000,
-         chunk_overlap=100,
-         length_function=len,
-         is_separator_regex=False,
      )
      chunks = text_splitter.split_text(text)
      return chunks

- def processing(pdf):
-     """This function divides the PDF into smaller chunks and saves these segmented chunks in a vector database. And return the Vector Database"""
-
-     # getting all the raw text from the PDF
-     raw_text = get_pdf_text(pdf)

      # dividing the raw text into smaller chunks
      text_chunks = get_text_chunks(raw_text)
@@ -69,81 +132,100 @@ def processing(pdf):

      return vectorDB

  def get_response(query: str):
-     """This function will return the output of the user query! """
-
      # getting the context from the database that is similar to the user query
-     query_context = st.session_state.vectorDB.similarity_search(query=query,k=4)
      # calling the chain to get the output from the LLM
-     response = st.session_state.chain.invoke({'human_input':query,'context':query_context,'name':st.session_state.bot_name})['text']
      # Iterate through each word in the 'response' string after splitting it based on whitespace
      for word in response.split():
          # Yield the current word followed by a space, effectively creating a generator
          yield word + " "
-
          # Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
          time.sleep(0.05)

  def get_conversation_chain(vectorDB):
-     """ This function will create and return a LLM-Chain"""
-
-     # using OPENAI LLM
-     llm = OpenAI(temperature=0.4)

      # creating a template to pass into LLM
      template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
-
-     Answer the question as detailed as possible and to the point from the context: {context}\n , if the answer is not in the provided context just say, "answer is not available in the context", do not provide the wrong answer\n\n
-
      {chat_history}
      Human: {human_input}
      AI: """

      # creating a prompt that is used to format the input of the user
-     prompt = PromptTemplate(template = template,input_variables=['chat_history','human_input','name','context'])

      # creating a memory that will store the chat history between chatbot and user
-     memory = ConversationBufferWindowMemory(memory_key='chat_history',input_key="human_input",k=5)

-     chain = LLMChain(llm=llm,prompt=prompt,memory=memory,verbose=True)

      return chain


- if __name__ =='__main__':
-     #setting the config of WebPage
-     st.set_page_config(page_title="Personalized ChatBot",page_icon="🤖")
-     st.header('Personalized Customer Support Chatbot 🤖',divider='rainbow')
-
      # taking input (bot name and pdf file) from the user
      with st.sidebar:
-         st.caption('Please enter the **Bot Name** and Upload **PDF** File!')

-         bot_name = st.text_input(label='Bot Name',placeholder='Enter the bot name here....',key="bot_name")
-         file = st.file_uploader("Upload a PDF file!",type='pdf')

      # moving forward only when both the inputs are given by the user
-     if file and bot_name:
          # the Process File button will process the pdf file and save the chunks into the vector database
-         if st.button('Process File'):
-             # if there is existing chat history we will delete it
              if st.session_state.messages != []:
                  st.session_state.messages = []
-
-             with st.spinner('Processing.....'):
-                 st.session_state['vectorDB'] = processing(file)
-                 st.session_state['chain'] = get_conversation_chain(st.session_state['vectorDB'])
-                 st.success('File Processed',icon="")
-

      # if the vector database is ready to use then only show the chatbot interface
-     if st.session_state.vectorDB:
          # Display chat messages from history on app rerun
          for message in st.session_state.messages:
              with st.chat_message(message["role"]):
                  st.write(message["content"])
-
          # taking the input i.e. query from the user (walrus operator)
          if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
              # Add user message to chat history
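The prompt/memory/chain wiring above is carried over unchanged into the new version; only the LLM changes. A minimal offline sketch of that wiring, assuming the legacy LangChain 0.1-style LLMChain API used in this file (FakeListLLM and the abbreviated template are illustrative stand-ins, so no OpenAI key is needed):

from langchain.chains import LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_community.llms import FakeListLLM

# abbreviated stand-in for the app's support-bot template
template = """You are a support bot named {name}. Context: {context}
{chat_history}
Human: {human_input}
AI: """
prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "human_input", "name", "context"],
)
# k=5 keeps only the last five exchanges in {chat_history}
memory = ConversationBufferWindowMemory(
    memory_key="chat_history", input_key="human_input", k=5
)
# FakeListLLM replays canned responses instead of calling OpenAI
chain = LLMChain(llm=FakeListLLM(responses=["Hello!"]), prompt=prompt, memory=memory)
print(chain.invoke({"human_input": "Hi", "context": "", "name": "Bot"})["text"])

The new version of the file follows.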
 
@@ -4,62 +4,125 @@ import time
  import streamlit as st
  from dotenv import load_dotenv
  from PyPDF2 import PdfReader
+ from docx import Document
+ from docx.text.paragraph import Paragraph
+ from docx.table import Table
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
  from langchain.prompts import PromptTemplate
  from langchain.chains import LLMChain
  from langchain.memory import ConversationBufferWindowMemory
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import FAISS
+

  # load the environment variables into the python script
+ load_dotenv()
  # fetching the openai_api_key environment variable
+ openai_api_key = os.getenv("OPENAI_API_KEY")


  # Initialize session states
+ if "vectorDB" not in st.session_state:
      st.session_state.vectorDB = None
  if "messages" not in st.session_state:
      st.session_state.messages = []
+ if "bot_name" not in st.session_state:
+     st.session_state.bot_name = ""
+ if "chain" not in st.session_state:
      st.session_state.chain = None


+ def process_paragraph(paragraph):
+     """This function returns the content of the paragraph present inside the DOCX file"""
+     return paragraph.text
+
+
+ def process_table(table):
+     """This function extracts the content from the table present inside the DOCX file"""
      text = ""
+     for row in table.rows:
+         for cell in row.cells:
+             text += cell.text

      return text

+
+ def read_docx(file_path):
+     """This function extracts the text from the DOCX file"""
+     doc = Document(file_path)
+     text = []
+
+     for element in doc.iter_inner_content():
+         if isinstance(element, Paragraph):
+             text.append(process_paragraph(element))
+         elif isinstance(element, Table):
+             text.append(process_table(element))
+
+     return " ".join(text)
+
+
+ def read_text_file(text_file):
+     """This function extracts the text from the TEXT file"""
+     try:
+         text = text_file.read().decode("utf-8")
+         return text
+
+     except Exception as e:
+         st.error(f"Error while reading {text_file.name} file: **{e}**")
+         return None
+
+
+ def get_pdf_text(pdf):
+     """This function extracts the text from the PDF file"""
+     try:
+         text = []
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text.append(page.extract_text())
+
+         return " ".join(text)
+
+     except Exception as e:
+         st.error(f"Error while reading {pdf.name} file: **{e}**")
+         return None
+
+
  def get_vectorstore(text_chunks):
+     """This function will create a vector database as well as create & store the embedding of the text chunks into the VectorDB"""
      embeddings = OpenAIEmbeddings()
      vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
      return vectorstore

+
  def get_text_chunks(text: str):
+     """This function will split the text into smaller chunks"""
      text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=50,
+         length_function=len,
+         is_separator_regex=False,
      )
      chunks = text_splitter.split_text(text)
      return chunks

+
+ def processing(files):
+     """This function extracts the raw text from the uploaded files, splits it into chunks, stores the chunks in a vector database, and returns that database"""
+
+     data = []
+     for file in files:
+         if file.name.endswith(".docx"):
+             text = read_docx(file)
+
+         elif file.name.endswith(".pdf"):
+             text = get_pdf_text(file)
+
+         else:
+             text = read_text_file(file)
+
+         data.append(text)
+
+     raw_text = " ".join(data)

      # dividing the raw text into smaller chunks
      text_chunks = get_text_chunks(raw_text)
 
@@ -69,81 +132,100 @@ def processing(pdf):

      return vectorDB

+
  def get_response(query: str):
+     """This function will return the output of the user query!"""
+
      # getting the context from the database that is similar to the user query
+     query_context = st.session_state.vectorDB.similarity_search(query=query)
      # calling the chain to get the output from the LLM
+     response = st.session_state.chain.invoke(
+         {
+             "human_input": query,
+             "context": query_context[0].page_content,
+             "name": st.session_state.bot_name,
+         }
+     )["text"]
      # Iterate through each word in the 'response' string after splitting it based on whitespace
      for word in response.split():
          # Yield the current word followed by a space, effectively creating a generator
          yield word + " "
+
          # Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
          time.sleep(0.05)

+
  def get_conversation_chain(vectorDB):
+     """This function will create and return an LLM chain"""
+
+     # using the OpenAI chat model
+     llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo-16k")

      # creating a template to pass into LLM
      template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
+     Answer the question as detailed as possible and to the point from the context: {context}\n\n.
+     If the answer is not in the provided context then only just say, "answer is not available in the context", do not provide the wrong answer\n\n
      {chat_history}
      Human: {human_input}
      AI: """

      # creating a prompt that is used to format the input of the user
+     prompt = PromptTemplate(
+         template=template,
+         input_variables=["chat_history", "human_input", "name", "context"],
+     )

      # creating a memory that will store the chat history between chatbot and user
+     memory = ConversationBufferWindowMemory(
+         memory_key="chat_history", input_key="human_input", k=5
+     )

+     chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=True)

      return chain


+ if __name__ == "__main__":
+     # setting the config of WebPage
+     st.set_page_config(page_title="Personalized ChatBot", page_icon="🤖")
+     st.header("Personalized Customer Support Chatbot 🤖", divider="rainbow")

      # taking input (bot name and pdf file) from the user
      with st.sidebar:
+         st.caption("Please enter the **Bot Name** and Upload **Files**!")
+
+         bot_name = st.text_input(
+             label="Bot Name", placeholder="Enter the bot name here....", key="bot_name"
+         )

+         files = st.file_uploader(
+             label="Upload Files!",
+             type=["pdf", "txt", "docx"],
+             accept_multiple_files=True,
+         )

      # moving forward only when both the inputs are given by the user
+     if files and bot_name:
          # the Process File button will process the pdf file and save the chunks into the vector database
+         if st.button("Process File"):
+             # if there is existing chat history we will delete it
              if st.session_state.messages != []:
                  st.session_state.messages = []
+
+             with st.spinner("Processing....."):
+                 st.session_state["vectorDB"] = processing(files)
+                 st.session_state["chain"] = get_conversation_chain(
+                     st.session_state["vectorDB"]
+                 )
+                 st.success("File Processed", icon="✅")

      # if the vector database is ready to use then only show the chatbot interface
+     if st.session_state.vectorDB:
          # Display chat messages from history on app rerun
          for message in st.session_state.messages:
              with st.chat_message(message["role"]):
                  st.write(message["content"])
+
          # taking the input i.e. query from the user (walrus operator)
          if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
              # Add user message to chat history
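The main functional addition in this commit is multi-format ingestion, including DOCX paragraphs and tables read in document order. A minimal standalone sketch of that extraction path, assuming python-docx >= 1.1.0 (where Document.iter_inner_content() is available); "sample.docx" is a placeholder path:

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph


def extract_docx_text(path):
    """Walk paragraphs and tables in document order and join their text."""
    parts = []
    for element in Document(path).iter_inner_content():
        if isinstance(element, Paragraph):
            parts.append(element.text)
        elif isinstance(element, Table):
            # flatten every cell of every row, as process_table() does above
            parts.append(" ".join(cell.text for row in element.rows for cell in row.cells))
    return " ".join(parts)


print(extract_docx_text("sample.docx"))  # placeholder file name

One retrieval detail worth noting: get_response() now calls similarity_search() without k (which defaults to 4 in LangChain's FAISS wrapper) but passes only query_context[0].page_content, the single best-matching chunk, into the prompt.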