Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +108 -0
- htmlTemplates.py +45 -0
- requirements.txt +13 -0
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# importing dependencies
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
import streamlit as st
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
+
from langchain.text_splitter import CharacterTextSplitter
|
6 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
7 |
+
from langchain.vectorstores import faiss
|
8 |
+
from langchain.prompts import PromptTemplate
|
9 |
+
from langchain.memory import ConversationBufferMemory
|
10 |
+
from langchain.chains import ConversationalRetrievalChain
|
11 |
+
from langchain.chat_models import ChatOpenAI
|
12 |
+
from htmlTemplates import css, bot_template, user_template
|
13 |
+
from langchain.embeddings import openai
|
14 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
15 |
+
|
16 |
+
|
17 |
+
# creating custom template to guide llm model
# This is the "condense question" prompt: given the running chat history and a
# follow-up question, the LLM rewrites the follow-up as a standalone question
# (in the question's original language) so the retriever can search without
# needing conversational context.
custom_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

# from_template infers the input variables ({chat_history}, {question}) from
# the template text; used as condense_question_prompt in get_conversationchain.
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
|
25 |
+
|
26 |
+
# extracting text from pdf
def get_pdf_text(docs):
    """Return the concatenated text of every page of every uploaded PDF.

    Args:
        docs: iterable of uploaded PDF file-like objects (Streamlit uploads).

    Returns:
        A single string with all extracted page text joined together.
    """
    text = ""
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages with no extractable text
            # (e.g. scanned images) — fall back to "" so concatenation
            # never raises TypeError.
            text += page.extract_text() or ""
    return text
|
34 |
+
|
35 |
+
# converting text to chunks
def get_chunks(raw_text):
    """Split raw document text into overlapping chunks suitable for embedding.

    Chunks are ~1000 characters, split on newlines, with a 200-character
    overlap so sentences straddling a boundary appear in both chunks.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(raw_text)
|
43 |
+
|
44 |
+
# using OpenAI embeddings and FAISS to get vectorstore
# NOTE(review): an earlier comment claimed all-MiniLM embeddings, but the code
# actually uses OpenAIEmbeddings (requires OPENAI_API_KEY from the environment).
def get_vectorstore(chunks):
    """Embed the text chunks with OpenAIEmbeddings and index them in FAISS.

    Args:
        chunks: list of text chunks produced by get_chunks().

    Returns:
        A FAISS vector store ready to be used as a retriever.
    """
    embeddings=OpenAIEmbeddings()
    # faiss here is the langchain.vectorstores.faiss module; FAISS is the class.
    vectorstore=faiss.FAISS.from_texts(texts=chunks,embedding=embeddings)
    return vectorstore
|
49 |
+
|
50 |
+
# generating conversation chain
def get_conversationchain(vectorstore):
    """Assemble the retrieval-augmented conversation chain over *vectorstore*.

    Uses a low temperature for focused answers, buffer memory to carry the
    chat history across turns, and the custom condense-question prompt so
    follow-ups are rewritten into standalone queries before retrieval.
    """
    chat_llm = ChatOpenAI(temperature=0.2, model_name='gpt-4-turbo')
    # output_key='answer' tells the memory which chain output to record.
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=vectorstore.as_retriever(),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=chat_memory,
    )
|
62 |
+
|
63 |
+
# generating response from user queries and displaying them accordingly
def handle_question(question):
    """Run *question* through the conversation chain and render the chat.

    The memory returns alternating turns: even indices are user messages,
    odd indices are bot messages; each is rendered with its HTML template.
    """
    # Guard: the chain only exists after documents have been processed.
    # Without this, typing a question first crashes with "'NoneType' object
    # is not callable".
    if st.session_state.conversation is None:
        st.warning("Please upload and process your documents first.")
        return
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response["chat_history"]
    for i, msg in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
|
72 |
+
|
73 |
+
|
74 |
+
def main():
    """Streamlit entry point: set up the page, process uploads, answer questions."""
    load_dotenv()  # load OPENAI_API_KEY etc. from a local .env file
    st.set_page_config(page_title="Chat with multiple PDFs",page_icon=":books:")
    st.write(css,unsafe_allow_html=True)

    # Session-state slots survive Streamlit's script reruns between interactions.
    if "conversation" not in st.session_state:
        st.session_state.conversation=None

    if "chat_history" not in st.session_state:
        st.session_state.chat_history=None

    st.header("Chat with multiple PDFs :books:")
    question=st.text_input("Ask question from your document:")
    if question:
        handle_question(question)

    with st.sidebar:
        st.subheader("Your documents")
        docs=st.file_uploader("Upload your PDF here and click on 'Process'",accept_multiple_files=True)
        if st.button("Process"):
            # Guard: without uploads, the pipeline would try to build a
            # vector store from an empty corpus and fail downstream.
            if not docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    #get the pdf
                    raw_text=get_pdf_text(docs)

                    #get the text chunks
                    text_chunks=get_chunks(raw_text)

                    #create vectorstore
                    vectorstore=get_vectorstore(text_chunks)

                    #create conversation chain
                    st.session_state.conversation=get_conversationchain(vectorstore)


if __name__ == '__main__':
    main()
|
htmlTemplates.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Page-level stylesheet injected once by main(); styles the chat bubbles
# rendered from bot_template / user_template.
# Fix: the user rule previously read "align: right," — `align` is not a CSS
# property and the trailing comma (instead of `;`) invalidated the whole
# declaration, so the user bubble's background color was silently dropped.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    justify-content: flex-end;
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''
|
28 |
+
bot_template = '''
|
29 |
+
<div class="chat-message bot">
|
30 |
+
<div class="avatar">
|
31 |
+
<img src="https://cdn-icons-png.flaticon.com/512/6134/6134346.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
|
32 |
+
</div>
|
33 |
+
<div class="message">{{MSG}}</div>
|
34 |
+
</div>
|
35 |
+
'''
|
36 |
+
|
37 |
+
user_template = '''
|
38 |
+
<div class="chat-message user">
|
39 |
+
<div class="message" style="text-align:right">{{MSG}}</div>
|
40 |
+
<div class="avatar">
|
41 |
+
<img src="https://png.pngtree.com/png-vector/20190321/ourmid/pngtree-vector-users-icon-png-image_856952.jpg">
|
42 |
+
</div>
|
43 |
+
|
44 |
+
</div>
|
45 |
+
'''
|
requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
pypdf4
|
3 |
+
chromadb
|
4 |
+
streamlit
|
5 |
+
einops
|
6 |
+
langchain-together
|
7 |
+
faiss-gpu
|
8 |
+
sentence-transformers
|
9 |
+
# htmlTemplates is a local module (htmlTemplates.py), not a pip package
|
10 |
+
PyPDF2==3.0.1
|
11 |
+
openai
|
12 |
+
tiktoken
|
13 |
+
# (duplicate 'einops' entry removed — already listed above)
|