Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,62 +4,125 @@ import time
|
|
4 |
import streamlit as st
|
5 |
from dotenv import load_dotenv
|
6 |
from PyPDF2 import PdfReader
|
7 |
-
from
|
|
|
|
|
|
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain.chains import LLMChain
|
10 |
from langchain.memory import ConversationBufferWindowMemory
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
from langchain_community.vectorstores import FAISS
|
13 |
-
|
14 |
|
15 |
# load the environment variables into the python script
|
16 |
-
load_dotenv()
|
17 |
# fetching the openai_api_key environment variable
|
18 |
-
openai_api_key = os.getenv(
|
19 |
|
20 |
|
21 |
# Initialize session states
|
22 |
-
if
|
23 |
st.session_state.vectorDB = None
|
24 |
if "messages" not in st.session_state:
|
25 |
st.session_state.messages = []
|
26 |
-
if
|
27 |
-
st.session_state.bot_name =
|
28 |
-
if
|
29 |
st.session_state.chain = None
|
30 |
|
31 |
|
32 |
-
def
|
33 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
34 |
text = ""
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
|
39 |
return text
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
def get_vectorstore(text_chunks):
|
42 |
-
"""
|
43 |
embeddings = OpenAIEmbeddings()
|
44 |
vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
|
45 |
return vectorstore
|
46 |
|
|
|
47 |
def get_text_chunks(text: str):
|
48 |
-
"""
|
49 |
text_splitter = RecursiveCharacterTextSplitter(
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
)
|
55 |
chunks = text_splitter.split_text(text)
|
56 |
return chunks
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# dividing the raw text into smaller chunks
|
65 |
text_chunks = get_text_chunks(raw_text)
|
@@ -69,81 +132,100 @@ def processing(pdf):
|
|
69 |
|
70 |
return vectorDB
|
71 |
|
|
|
72 |
def get_response(query: str):
|
73 |
-
"""This function will return the output of the user query!
|
74 |
-
|
75 |
# getting the context from the database that is similar to the user query
|
76 |
-
query_context = st.session_state.vectorDB.similarity_search(query=query
|
77 |
# calling the chain to get the output from the LLM
|
78 |
-
response = st.session_state.chain.invoke(
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
# Iterate through each word in the 'response' string after splitting it based on whitespace
|
80 |
for word in response.split():
|
81 |
# Yield the current word followed by a space, effectively creating a generator
|
82 |
yield word + " "
|
83 |
-
|
84 |
# Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
|
85 |
time.sleep(0.05)
|
86 |
|
|
|
87 |
def get_conversation_chain(vectorDB):
|
88 |
-
"""
|
89 |
-
|
90 |
-
# using OPENAI
|
91 |
-
llm =
|
92 |
|
93 |
# creating a template to pass into LLM
|
94 |
template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
{chat_history}
|
99 |
Human: {human_input}
|
100 |
AI: """
|
101 |
|
102 |
# creating a prompt that is used to format the input of the user
|
103 |
-
prompt = PromptTemplate(
|
|
|
|
|
|
|
104 |
|
105 |
# creating a memory that will store the chat history between chatbot and user
|
106 |
-
memory = ConversationBufferWindowMemory(
|
|
|
|
|
107 |
|
108 |
-
chain = LLMChain(llm=llm,prompt=prompt,memory=memory,verbose=True)
|
109 |
|
110 |
return chain
|
111 |
|
112 |
|
|
|
|
|
|
|
|
|
113 |
|
114 |
-
if __name__ =='__main__':
|
115 |
-
#setting the config of WebPage
|
116 |
-
st.set_page_config(page_title="Personalized ChatBot",page_icon="🤖")
|
117 |
-
st.header('Personalized Customer Support Chatbot 🤖',divider='rainbow')
|
118 |
-
|
119 |
# taking input( bot name and pdf file) from the user
|
120 |
with st.sidebar:
|
121 |
-
st.caption(
|
|
|
|
|
|
|
|
|
122 |
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
|
126 |
# moving forward only when both the inputs are given by the user
|
127 |
-
if
|
128 |
# the Process File button will process the pdf file and save the chunks into the vector database
|
129 |
-
if st.button(
|
130 |
-
# if there is existing chat history we will delete it
|
131 |
if st.session_state.messages != []:
|
132 |
st.session_state.messages = []
|
133 |
-
|
134 |
-
with st.spinner(
|
135 |
-
st.session_state[
|
136 |
-
st.session_state[
|
137 |
-
|
138 |
-
|
|
|
139 |
|
140 |
# if the vector database is ready to use then only show the chatbot interface
|
141 |
-
if st.session_state.vectorDB:
|
142 |
# Display chat messages from history on app rerun
|
143 |
for message in st.session_state.messages:
|
144 |
with st.chat_message(message["role"]):
|
145 |
st.write(message["content"])
|
146 |
-
|
147 |
# taking the input i.e. query from the user (walrus operator)
|
148 |
if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
|
149 |
# Add user message to chat history
|
|
|
4 |
import streamlit as st
|
5 |
from dotenv import load_dotenv
|
6 |
from PyPDF2 import PdfReader
|
7 |
+
from docx import Document
|
8 |
+
from docx.text.paragraph import Paragraph
|
9 |
+
from docx.table import Table
|
10 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
11 |
from langchain.prompts import PromptTemplate
|
12 |
from langchain.chains import LLMChain
|
13 |
from langchain.memory import ConversationBufferWindowMemory
|
14 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
15 |
from langchain_community.vectorstores import FAISS
|
16 |
+
|
17 |
|
18 |
# Load variables from a local .env file into the process environment.
load_dotenv()
# Fetch the OpenAI API key from the environment (loaded above).
openai_api_key = os.getenv("OPENAI_API_KEY")


# Initialize the session-state slots this app relies on, so every rerun
# can read them without first checking for existence.
if "vectorDB" not in st.session_state:
    st.session_state["vectorDB"] = None  # FAISS store built from the uploaded files
if "messages" not in st.session_state:
    st.session_state["messages"] = []  # chat history rendered in the UI
if "bot_name" not in st.session_state:
    st.session_state["bot_name"] = ""  # display name chosen for the bot
if "chain" not in st.session_state:
    st.session_state["chain"] = None  # LLMChain used to answer user queries
|
33 |
|
34 |
|
35 |
+
def process_paragraph(paragraph):
    """Return the plain text of one paragraph from the DOC file."""
    # python-docx exposes the paragraph's concatenated run text as `.text`.
    return paragraph.text
|
38 |
+
|
39 |
+
|
40 |
+
def process_table(table):
    """Concatenate the text of every cell of a DOC-file table (no separators)."""
    # Walk the table row by row, cell by cell, joining the cell texts directly;
    # this matches a plain `+=` accumulation but builds the string in one pass.
    return "".join(cell.text for row in table.rows for cell in row.cells)
|
48 |
|
49 |
+
|
50 |
+
def read_docx(file_path):
    """Extract the text of a DOCX file, paragraphs and tables in document order.

    Returns a single string with one space between extracted pieces.
    """
    document = Document(file_path)
    pieces = []

    # iter_inner_content() yields Paragraph and Table objects interleaved
    # exactly as they appear in the document body.
    for item in document.iter_inner_content():
        if isinstance(item, Paragraph):
            pieces.append(process_paragraph(item))
        elif isinstance(item, Table):
            pieces.append(process_table(item))

    return " ".join(pieces)
|
62 |
+
|
63 |
+
|
64 |
+
def read_text_file(text_file):
    """Read an uploaded plain-text file and decode it as UTF-8.

    Returns the decoded string, or None (after showing a Streamlit error)
    when reading or decoding fails.
    """
    try:
        return text_file.read().decode("utf-8")
    except Exception as e:
        # Best-effort: surface the problem in the UI instead of crashing.
        st.error(f"Error while reading {text_file.name} file : **{e}**")
        return None
|
73 |
+
|
74 |
+
|
75 |
+
def get_pdf_text(pdf):
    """Extract the text of every page of an uploaded PDF.

    Returns the page texts joined with single spaces, or None (after showing
    a Streamlit error) when the PDF cannot be read.
    """
    try:
        reader = PdfReader(pdf)
        # One extracted string per page, assembled in a single join.
        return " ".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        # Best-effort: surface the problem in the UI instead of crashing.
        st.error(f"Error while reading {pdf.name} file : **{e}**")
        return None
|
88 |
+
|
89 |
+
|
90 |
def get_vectorstore(text_chunks):
    """Embed the text chunks with OpenAI embeddings and index them in a FAISS vector DB."""
    # FAISS.from_texts computes one embedding per chunk and builds the index
    # in a single call, so no intermediate locals are needed.
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())
|
95 |
|
96 |
+
|
97 |
def get_text_chunks(text: str):
    """Split raw text into ~1000-character chunks for embedding."""
    # The 50-character overlap preserves context that straddles a boundary;
    # plain `len` measures chunk size in characters.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.split_text(text)
|
107 |
|
108 |
+
|
109 |
+
def processing(files):
|
110 |
+
"""This function"""
|
111 |
+
|
112 |
+
data = []
|
113 |
+
for file in files:
|
114 |
+
if file.name.endswith(".docx"):
|
115 |
+
text = read_docx(file)
|
116 |
+
|
117 |
+
elif file.name.endswith(".pdf"):
|
118 |
+
text = get_pdf_text(file)
|
119 |
+
|
120 |
+
else:
|
121 |
+
text = read_text_file(file)
|
122 |
+
|
123 |
+
data.append(text)
|
124 |
+
|
125 |
+
raw_text = " ".join(data)
|
126 |
|
127 |
# dividing the raw text into smaller chunks
|
128 |
text_chunks = get_text_chunks(raw_text)
|
|
|
132 |
|
133 |
return vectorDB
|
134 |
|
135 |
+
|
136 |
def get_response(query: str):
    """Stream the chatbot's answer to *query* one word at a time.

    Fetches context from the session vector DB, runs the LLM chain, then
    yields the response word by word with a short pause (typing effect),
    suitable for st.write_stream.
    """

    # getting the context from the database that is similar to the user query
    query_context = st.session_state.vectorDB.similarity_search(query=query)
    # calling the chain to get the output from the LLM
    response = st.session_state.chain.invoke(
        {
            "human_input": query,
            # only the top-ranked chunk is passed as context
            "context": query_context[0].page_content,
            "name": st.session_state.bot_name,
        }
    )["text"]
    # Iterate through each word in the 'response' string after splitting it based on whitespace
    for word in response.split():
        # Yield the current word followed by a space, effectively creating a generator
        yield word + " "

        # Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
        time.sleep(0.05)
|
156 |
|
157 |
+
|
158 |
def get_conversation_chain(vectorDB):
    """Create and return the LLMChain used to answer user queries.

    Combines an OpenAI chat model, a customer-support prompt template, and a
    sliding-window memory of recent turns. NOTE(review): `vectorDB` is not
    used inside this function — retrieval happens in get_response; confirm
    the parameter is kept only for interface symmetry.
    """

    # using OPENAI ChatModel; low temperature keeps answers focused
    llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo-16k")

    # creating a template to pass into LLM
    template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
Answer the question as detailed as possible and to the point from the context: {context}\n\n.
If the answer is not in the provided context then only just say, "answer is not available in the context", do not provide the wrong answer\n\n
{chat_history}
Human: {human_input}
AI: """

    # creating a prompt that is used to format the input of the user
    prompt = PromptTemplate(
        template=template,
        input_variables=["chat_history", "human_input", "name", "context"],
    )

    # creating a memory that will store the chat history between chatbot and user;
    # memory_key must match {chat_history} in the template, and k=5 keeps
    # only the last 5 exchanges in the window
    memory = ConversationBufferWindowMemory(
        memory_key="chat_history", input_key="human_input", k=5
    )

    # verbose=True logs the formatted prompt on each call (useful while debugging)
    chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=True)

    return chain
|
186 |
|
187 |
|
188 |
+
if __name__ == "__main__":
|
189 |
+
# setting the config of WebPage
|
190 |
+
st.set_page_config(page_title="Personalized ChatBot", page_icon="🤖")
|
191 |
+
st.header("Personalized Customer Support Chatbot 🤖", divider="rainbow")
|
192 |
|
|
|
|
|
|
|
|
|
|
|
193 |
# taking input( bot name and pdf file) from the user
|
194 |
with st.sidebar:
|
195 |
+
st.caption("Please enter the **Bot Name** and Upload **PDF** File!")
|
196 |
+
|
197 |
+
bot_name = st.text_input(
|
198 |
+
label="Bot Name", placeholder="Enter the bot name here....", key="bot_name"
|
199 |
+
)
|
200 |
|
201 |
+
files = st.file_uploader(
|
202 |
+
label="Upload Files!",
|
203 |
+
type=["pdf", "txt", "docx"],
|
204 |
+
accept_multiple_files=True,
|
205 |
+
)
|
206 |
|
207 |
# moving forward only when both the inputs are given by the user
|
208 |
+
if files and bot_name:
|
209 |
# the Process File button will process the pdf file and save the chunks into the vector database
|
210 |
+
if st.button("Process File"):
|
211 |
+
# if there is existing chat history we will delete it
|
212 |
if st.session_state.messages != []:
|
213 |
st.session_state.messages = []
|
214 |
+
|
215 |
+
with st.spinner("Processing....."):
|
216 |
+
st.session_state["vectorDB"] = processing(files)
|
217 |
+
st.session_state["chain"] = get_conversation_chain(
|
218 |
+
st.session_state["vectorDB"]
|
219 |
+
)
|
220 |
+
st.success("File Processed", icon="✅")
|
221 |
|
222 |
# if the vector database is ready to use then only show the chatbot interface
|
223 |
+
if st.session_state.vectorDB:
|
224 |
# Display chat messages from history on app rerun
|
225 |
for message in st.session_state.messages:
|
226 |
with st.chat_message(message["role"]):
|
227 |
st.write(message["content"])
|
228 |
+
|
229 |
# taking the input i.e. query from the user (walrus operator)
|
230 |
if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
|
231 |
# Add user message to chat history
|