Spaces:
Sleeping
Sleeping
Commit
·
1c798e7
1
Parent(s):
6243f9c
hooha
Browse files- .streamlit/config.toml +2 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- pages/1_π_Credentials.py +75 -0
- pages/2_π_Upload_Documents.py +58 -0
- pages/3_π€_Chatbot.py +105 -0
- requirements.txt +12 -0
- utils.py +83 -0
- π_Introduction.py +57 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
base="dark"
|
__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.96 kB). View file
|
|
pages/1_π_Credentials.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

# Page configuration is issued before any other Streamlit call in this script.
# NOTE(review): the icon/emoji glyphs in this file look mis-encoded — confirm
# against the repository copy.
st.set_page_config(page_title="Credentials", page_icon="π")

from langchain.chains.conversation.memory import ConversationBufferWindowMemory

# Session-state keys used by the app, each paired with a factory for its
# initial value. Factories keep the mutable defaults (list, memory object)
# from being built unless the key is actually missing.
_SESSION_DEFAULTS = (
    ('openai_api_key', lambda: None),
    ('pinecone_api_key', lambda: None),
    ('pinecone_env', lambda: None),
    ('pinecone_index_namespace', lambda: None),
    ('requests', list),
    ('responses', lambda: ["How can I assist you?"]),
    ('buffer_memory', lambda: ConversationBufferWindowMemory(k=3, return_messages=True)),
)

for state_key, make_default in _SESSION_DEFAULTS:
    if state_key not in st.session_state:
        st.session_state[state_key] = make_default()

# Page content
st.write("# Enter your Credentials! π")

st.markdown(
    """
    Enter the following credentials to start uploading and querying documents.
    """
)

# One masked text box per credential; a non-empty entry overwrites the value
# held in session state, an empty box leaves the stored value untouched.
_CREDENTIAL_PROMPTS = (
    ('openai_api_key', "Enter your OpenAI API Key"),
    ('pinecone_api_key', "Enter your Pinecone API Key"),
    ('pinecone_env', "Enter your Pinecone Environment"),
    ('pinecone_index_namespace', "Enter your Pinecone Index Namespace"),
)

for state_key, prompt in _CREDENTIAL_PROMPTS:
    entered_value = st.text_input(prompt, type='password')
    if entered_value:
        st.session_state[state_key] = entered_value

# Confirm to the user once every credential has a truthy value.
_REQUIRED_KEYS = (
    'pinecone_api_key',
    'openai_api_key',
    'pinecone_env',
    'pinecone_index_namespace',
)
all_fields_filled = all(st.session_state[name] for name in _REQUIRED_KEYS)

if all_fields_filled:
    st.success('Credentials Stored')
|
75 |
+
|
pages/2_π_Upload_Documents.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    page_title="Upload Documents",
    page_icon="π",
)

from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from utils import ingest

# Initialise every session-state key the app relies on, so this page also
# works when it is the first one opened in a session.
if 'openai_api_key' not in st.session_state:
    st.session_state['openai_api_key'] = None

if 'pinecone_api_key' not in st.session_state:
    st.session_state['pinecone_api_key'] = None

if 'pinecone_env' not in st.session_state:
    st.session_state['pinecone_env'] = None

if 'pinecone_index_namespace' not in st.session_state:
    st.session_state['pinecone_index_namespace'] = None

if 'requests' not in st.session_state:
    st.session_state['requests'] = []

if 'responses' not in st.session_state:
    st.session_state['responses'] = ["How can I assist you?"]

if 'buffer_memory' not in st.session_state:
    st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)

# Page content
st.write("# Upload your Documents! π")

# The uploader is only enabled once every credential has been entered.
all_fields_filled = all([st.session_state['pinecone_api_key'],
                         st.session_state['openai_api_key'],
                         st.session_state['pinecone_env'],
                         st.session_state['pinecone_index_namespace']])

if not all_fields_filled:
    st.error('Credentials π not found. Enter Credentials π to activate uploader')

# Only PDFs are offered: `ingest` is given the upload as-is and this page does
# no format conversion, so other document types (previously 'docx') are not
# advertised here.
uploaded_file = st.file_uploader(
    "Upload Document",
    type=['pdf'],
    disabled=not all_fields_filled,
)

if all_fields_filled and uploaded_file:
    ingest(uploaded_file,
           pinecone_api_key=st.session_state['pinecone_api_key'],
           pinecone_env=st.session_state['pinecone_env'],
           pinecone_index_namespace=st.session_state['pinecone_index_namespace']
           )
|
pages/3_π€_Chatbot.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# π€
import streamlit as st

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    page_title="Chatbot",
    page_icon="π",
)

from streamlit_chat import message
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder
)
# Explicit names instead of a star import, so it is clear which helpers this
# page depends on.
from utils import find_match, get_conversation_string, query_refiner

# Initialise every session-state key the app relies on, so this page also
# works when it is the first one opened in a session.
if 'openai_api_key' not in st.session_state:
    st.session_state['openai_api_key'] = None

if 'pinecone_api_key' not in st.session_state:
    st.session_state['pinecone_api_key'] = None

if 'pinecone_env' not in st.session_state:
    st.session_state['pinecone_env'] = None

if 'pinecone_index_namespace' not in st.session_state:
    st.session_state['pinecone_index_namespace'] = None

if 'requests' not in st.session_state:
    st.session_state['requests'] = []

if 'responses' not in st.session_state:
    st.session_state['responses'] = ["How can I assist you?"]

if 'buffer_memory' not in st.session_state:
    st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)

# Prompt layout: system instruction, then the remembered history, then the
# current human turn.
system_msg_template = SystemMessagePromptTemplate.from_template(template="""Answer the question as truthfully as possible using the provided context,
and if the answer is not contained within the text below, say 'I don't know'""")

human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")

prompt_template = ChatPromptTemplate.from_messages(
    [system_msg_template, MessagesPlaceholder(variable_name="history"), human_msg_template]
)

# Building the chain fails when no usable OpenAI key is available; remember
# that so the page can show a hint instead of the chat UI.
empty_openai_api_key = False

try:
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=st.session_state['openai_api_key'])
    conversation = ConversationChain(
        memory=st.session_state.buffer_memory,
        prompt=prompt_template,
        llm=llm,
        verbose=True,
    )
except Exception:
    # `Exception` rather than a bare `except`, so interpreter-level exits
    # (KeyboardInterrupt, SystemExit) are not swallowed.
    empty_openai_api_key = True

# Page content
st.write("# Chat with your Documents! π€")

if empty_openai_api_key:
    st.error('Enter OpenAI API key in credentials tab')

else:
    try:
        # container for chat history
        response_container = st.container()
        # container for text box
        textcontainer = st.container()

        with textcontainer:
            query = st.text_input("Query: ", key="input")
            if query:
                with st.spinner("typing..."):
                    # Rewrite the query using the conversation so far, fetch
                    # matching passages, then ask the chat model.
                    conversation_string = get_conversation_string()
                    refined_query = query_refiner(conversation_string, query)
                    context = find_match(
                        refined_query,
                        pinecone_api_key=st.session_state['pinecone_api_key'],
                        pinecone_env=st.session_state['pinecone_env'],
                        pinecone_index_namespace=st.session_state['pinecone_index_namespace'],
                    )
                    response = conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")
                st.session_state.requests.append(query)
                st.session_state.responses.append(response)

        with response_container:
            if st.session_state['responses']:
                # responses[0] is the greeting, so response i is followed by
                # request i whenever that request exists.
                for i, bot_reply in enumerate(st.session_state['responses']):
                    message(bot_reply, key=str(i))
                    if i < len(st.session_state['requests']):
                        message(st.session_state["requests"][i], is_user=True, key=str(i) + '_user')
    except Exception as error_message:
        # Include the underlying error so the failure can be diagnosed; the
        # previous message discarded it.
        st.error(f"Error occurred. Check your API key. Details: {error_message}")
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair<5
|
2 |
+
streamlit==1.17.0
|
3 |
+
streamlit-chat
|
4 |
+
langchain
|
5 |
+
openai
|
6 |
+
sentence-transformers
|
7 |
+
unstructured
|
8 |
+
unstructured[local-inference]
|
9 |
+
pinecone-client
|
10 |
+
tabulate
|
11 |
+
chromadb
|
12 |
+
pypdf
|
utils.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.document_loaders import DirectoryLoader
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
import pinecone
|
6 |
+
from langchain.vectorstores import Pinecone
|
7 |
+
from langchain.document_loaders import PyPDFLoader
|
8 |
+
import tempfile
|
9 |
+
import streamlit as st
|
10 |
+
import openai
|
11 |
+
|
12 |
+
# Checkpoint name shared by both embedding objects below.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache(allow_output_mutation=True)
def get_embeddings_model():
    """Create the sentence-embedding objects used by this module.

    Returns:
        A ``(model, embeddings)`` tuple: a ``SentenceTransformer`` and a
        ``SentenceTransformerEmbeddings`` wrapper, both constructed for the
        ``all-MiniLM-L6-v2`` checkpoint.
    """
    return (
        SentenceTransformer(_EMBEDDING_MODEL_NAME),
        SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME),
    )


# Module-level handles: `model` is used by `find_match`, `embeddings` by `ingest`.
model, embeddings = get_embeddings_model()
|
19 |
+
|
20 |
+
def ingest(
    uploaded_document,
    pinecone_api_key,
    pinecone_env,
    pinecone_index_namespace,
    chunk_size=500,
    chunk_overlap=20
):
    """Split an uploaded PDF into chunks and store their embeddings in Pinecone.

    The outcome of the upload is reported through ``st.success`` /
    ``st.error``; nothing is returned. Errors raised while reading the PDF or
    while initialising Pinecone are not caught here and propagate to the
    caller.

    Args:
        uploaded_document: Uploaded file object; only ``getbuffer()`` is
            called on it (presumably a Streamlit ``UploadedFile`` — confirm
            against callers).
        pinecone_api_key: API key passed to ``pinecone.init``.
        pinecone_env: Environment passed to ``pinecone.init``.
        pinecone_index_namespace: Passed to ``Pinecone.from_documents`` as
            ``index_name``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.
    """
    import os  # local import: needed only to delete the temporary copy

    # PyPDFLoader is given a file path, so the upload is written to disk first.
    with tempfile.NamedTemporaryFile(delete=False) as tf:
        tf.write(uploaded_document.getbuffer())
        file_path = tf.name

    try:
        documents = PyPDFLoader(file_path).load()
    finally:
        # delete=False leaves the file behind; remove it once it has been
        # read so repeated uploads do not accumulate temporary files.
        os.remove(file_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = text_splitter.split_documents(documents)

    pinecone.init(
        api_key=pinecone_api_key,
        environment=pinecone_env
    )

    try:
        Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_namespace)
        st.success('Document uploaded to Pinecone database successfully')
    except Exception as error_message:
        # Show the upload failure in the UI instead of crashing the page.
        st.error(error_message)
|
46 |
+
|
47 |
+
# # To create embeddings on hard disk
|
48 |
+
# # !pip install chromadb
|
49 |
+
# # from langchain.vectorstores import Chroma
|
50 |
+
# # persist_directory = './data/embeddings'
|
51 |
+
# # vStore = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
|
52 |
+
|
53 |
+
|
54 |
+
def query_refiner(conversation, query, openai_api_key=None):
    """Ask an OpenAI completion model to rewrite ``query`` using the chat log.

    Args:
        conversation: Conversation log text inserted into the prompt.
        query: The user's latest query.
        openai_api_key: Optional API key for this request. When omitted, the
            key stored under ``st.session_state['openai_api_key']`` is used;
            if that is also empty, ``None`` is passed and the ``openai``
            library is left to resolve its own default key.

    Returns:
        The ``text`` of the first completion choice, unmodified.
    """
    # The key entered in the UI lives in session state and was never handed
    # to the openai client; pass it per request.
    api_key = openai_api_key or st.session_state.get('openai_api_key')

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:",
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        api_key=api_key,
    )
    return response['choices'][0]['text']
|
65 |
+
|
66 |
+
|
67 |
+
def find_match(input, pinecone_api_key, pinecone_env, pinecone_index_namespace):
    """Return the stored text of the passages closest to ``input``.

    Args:
        input: Query text to embed. (The name shadows the builtin but is kept
            so existing callers are unaffected.)
        pinecone_api_key: API key passed to ``pinecone.init``.
        pinecone_env: Environment passed to ``pinecone.init``.
        pinecone_index_namespace: Name handed to ``pinecone.Index``.

    Returns:
        The ``metadata['text']`` of up to two matches, joined by a newline.
        An empty string is returned when the index yields no matches.
    """
    pinecone.init(
        api_key=pinecone_api_key,
        environment=pinecone_env
    )
    index = pinecone.Index(pinecone_index_namespace)
    input_em = model.encode(input).tolist()
    result = index.query(input_em, top_k=2, includeMetadata=True)

    # Join whatever came back rather than indexing [0] and [1] directly: an
    # index holding fewer than two vectors would otherwise raise IndexError.
    # NOTE(review): assumes every match carries a 'text' metadata field —
    # confirm against how vectors are written at ingest time.
    return "\n".join(match['metadata']['text'] for match in result['matches'])
|
76 |
+
|
77 |
+
|
78 |
+
def get_conversation_string():
    """Render the chat history held in session state as a plain-text log.

    Reads ``st.session_state['requests']`` and
    ``st.session_state['responses']``. The first response is skipped, so
    request ``i`` is paired with response ``i + 1``.

    Returns:
        A string with one ``"Human: ..."`` line and one ``"Bot: ..."`` line
        per exchange, each terminated by a newline; empty when there are no
        exchanges yet.
    """
    requests = st.session_state['requests']
    replies = st.session_state['responses'][1:]
    # zip stops at the shorter list, so a length mismatch between the two
    # histories no longer raises IndexError.
    return "".join(
        f"Human: {request}\nBot: {reply}\n"
        for request, reply in zip(requests, replies)
    )
|
π_Introduction.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    page_title="Introduction",
    page_icon="π",
)

from langchain.chains.conversation.memory import ConversationBufferWindowMemory

# Initialise every session-state key the app relies on. The 'responses' key
# was previously initialised twice; the redundant second check is removed.
if 'openai_api_key' not in st.session_state:
    st.session_state['openai_api_key'] = None

if 'pinecone_api_key' not in st.session_state:
    st.session_state['pinecone_api_key'] = None

if 'pinecone_env' not in st.session_state:
    st.session_state['pinecone_env'] = None

if 'pinecone_index_namespace' not in st.session_state:
    st.session_state['pinecone_index_namespace'] = None

if 'responses' not in st.session_state:
    st.session_state['responses'] = ["How can I assist you?"]

if 'requests' not in st.session_state:
    st.session_state['requests'] = []

if 'buffer_memory' not in st.session_state:
    st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)

# Page content
st.write("# Welcome to ChatPDF! π")

st.markdown(
    """
    ChatPDF is a user-friendly software that allows you to ask questions
    and get answers from your personal and organizational documents.

    This software uses OpenAI ChatGPT to query documents and Pinecone Vector Database to store documents.

    How to use:
    1. Make an account on [OpenAI](https://platform.openai.com/) and [Pinecone](https://www.pinecone.io/).
    2. Enter your credentials. This includes OpenAI API Key, Pinecone API Key, Pinecone Environment and Pinecone Index Name.
    3. Upload Documents. These documents will be uploaded to Pinecone Database.
    4. Chat with your documents. Chatbot is built on top of ChatGPT Engine.

    """
)
|