darthPanda committed on
Commit
1c798e7
·
1 Parent(s): 6243f9c
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ [theme]
+ base="dark"
__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.96 kB).
pages/1_🔐_Credentials.py ADDED
@@ -0,0 +1,75 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Credentials",
+     page_icon="🔐",
+ )
+
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+
+
+ # """
+ # Initialising session states
+ # """
+ if 'openai_api_key' not in st.session_state:
+     st.session_state['openai_api_key'] = None
+
+ if 'pinecone_api_key' not in st.session_state:
+     st.session_state['pinecone_api_key'] = None
+
+ if 'pinecone_env' not in st.session_state:
+     st.session_state['pinecone_env'] = None
+
+ if 'pinecone_index_namespace' not in st.session_state:
+     st.session_state['pinecone_index_namespace'] = None
+
+ if 'requests' not in st.session_state:
+     st.session_state['requests'] = []
+
+ if 'responses' not in st.session_state:
+     st.session_state['responses'] = ["How can I assist you?"]
+
+ if 'buffer_memory' not in st.session_state:
+     st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)
+
+ st.write("# Enter your Credentials! 🔐")
+
+ # """
+ # Page Content
+ # """
+ st.markdown(
+     """
+     Enter the following credentials to start uploading and querying documents.
+     """
+ )
+
+ # OpenAI API Key input
+ openai_api_key = st.text_input("Enter your OpenAI API Key", type='password')
+ if openai_api_key:
+     # Store the key in session state so the other pages can use it
+     st.session_state['openai_api_key'] = openai_api_key
+
+ # Pinecone API Key input
+ pinecone_api_key = st.text_input("Enter your Pinecone API Key", type='password')
+ if pinecone_api_key:
+     st.session_state['pinecone_api_key'] = pinecone_api_key
+
+ # Pinecone Environment input
+ pinecone_env = st.text_input("Enter your Pinecone Environment", type='password')
+ if pinecone_env:
+     st.session_state['pinecone_env'] = pinecone_env
+
+ # Index Namespace input
+ pinecone_index_namespace = st.text_input("Enter your Pinecone Index Namespace", type='password')
+ if pinecone_index_namespace:
+     st.session_state['pinecone_index_namespace'] = pinecone_index_namespace
+
+ # Check if all required fields are filled
+ all_fields_filled = all([st.session_state['pinecone_api_key'],
+                          st.session_state['openai_api_key'],
+                          st.session_state['pinecone_env'],
+                          st.session_state['pinecone_index_namespace']])
+
+ if all_fields_filled:
+     st.success('Credentials Stored')
pages/2_📄_Upload_Documents.py ADDED
@@ -0,0 +1,58 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Upload Documents",
+     page_icon="📄",
+ )
+
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+ from utils import ingest
+
+ # """
+ # Initialising session states
+ # """
+ if 'openai_api_key' not in st.session_state:
+     st.session_state['openai_api_key'] = None
+
+ if 'pinecone_api_key' not in st.session_state:
+     st.session_state['pinecone_api_key'] = None
+
+ if 'pinecone_env' not in st.session_state:
+     st.session_state['pinecone_env'] = None
+
+ if 'pinecone_index_namespace' not in st.session_state:
+     st.session_state['pinecone_index_namespace'] = None
+
+ if 'requests' not in st.session_state:
+     st.session_state['requests'] = []
+
+ if 'responses' not in st.session_state:
+     st.session_state['responses'] = ["How can I assist you?"]
+
+ if 'buffer_memory' not in st.session_state:
+     st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)
+
+
+ # """
+ # Page Content
+ # """
+ st.write("# Upload your Documents! 📄")
+
+ # Check if all required fields are filled
+ all_fields_filled = all([st.session_state['pinecone_api_key'],
+                          st.session_state['openai_api_key'],
+                          st.session_state['pinecone_env'],
+                          st.session_state['pinecone_index_namespace']])
+
+ if not all_fields_filled:
+     st.error('Credentials 🔐 not found. Enter Credentials 🔐 to activate the uploader')
+     uploaded_file = st.file_uploader("Upload Document", type=['pdf'], disabled=True)
+
+ else:
+     # ingest() loads files with PyPDFLoader, so only PDFs are accepted here
+     uploaded_file = st.file_uploader("Upload Document", type=['pdf'], disabled=False)
+     if uploaded_file:
+         ingest(uploaded_file,
+                pinecone_api_key=st.session_state['pinecone_api_key'],
+                pinecone_env=st.session_state['pinecone_env'],
+                pinecone_index_namespace=st.session_state['pinecone_index_namespace']
+                )
pages/3_🤖_Chatbot.py ADDED
@@ -0,0 +1,105 @@
+ # 🤖
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Chatbot",
+     page_icon="🤖",
+ )
+
+ from streamlit_chat import message
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import ConversationChain
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+ from langchain.prompts import (
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+     ChatPromptTemplate,
+     MessagesPlaceholder
+ )
+ from utils import *
+
+ # """
+ # Initialising session states
+ # """
+ if 'openai_api_key' not in st.session_state:
+     st.session_state['openai_api_key'] = None
+
+ if 'pinecone_api_key' not in st.session_state:
+     st.session_state['pinecone_api_key'] = None
+
+ if 'pinecone_env' not in st.session_state:
+     st.session_state['pinecone_env'] = None
+
+ if 'pinecone_index_namespace' not in st.session_state:
+     st.session_state['pinecone_index_namespace'] = None
+
+ if 'requests' not in st.session_state:
+     st.session_state['requests'] = []
+
+ if 'responses' not in st.session_state:
+     st.session_state['responses'] = ["How can I assist you?"]
+
+ if 'buffer_memory' not in st.session_state:
+     st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)
+
+ system_msg_template = SystemMessagePromptTemplate.from_template(template="""Answer the question as truthfully as possible using the provided context,
+ and if the answer is not contained within the text below, say 'I don't know'""")
+
+ human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")
+
+ prompt_template = ChatPromptTemplate.from_messages([system_msg_template, MessagesPlaceholder(variable_name="history"), human_msg_template])
+
+ empty_openai_api_key = False
+
+ try:
+     llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=st.session_state['openai_api_key'])
+     conversation = ConversationChain(memory=st.session_state.buffer_memory, prompt=prompt_template, llm=llm, verbose=True)
+ except Exception:
+     empty_openai_api_key = True
+
+ # """
+ # Page Content
+ # """
+ st.write("# Chat with your Documents! 🤖")
+
+ if empty_openai_api_key:
+     st.error('Enter your OpenAI API key in the Credentials tab')
+
+ else:
+     try:
+         # if st.button('Say hello'):
+         #     st.write('Why hello there')
+
+         # container for chat history
+         response_container = st.container()
+         # container for text box
+         textcontainer = st.container()
+
+
+         with textcontainer:
+             query = st.text_input("Query: ", key="input")
+             if query:
+                 with st.spinner("typing..."):
+                     conversation_string = get_conversation_string()
+                     # st.code(conversation_string)
+                     refined_query = query_refiner(conversation_string, query)
+                     # st.subheader("Refined Query:")
+                     # st.write(refined_query)
+                     context = find_match(refined_query,
+                                          pinecone_api_key=st.session_state['pinecone_api_key'],
+                                          pinecone_env=st.session_state['pinecone_env'],
+                                          pinecone_index_namespace=st.session_state['pinecone_index_namespace']
+                                          )
+                     # print(context)
+                     response = conversation.predict(input=f"Context:\n {context} \n\n Query:\n{query}")
+                 st.session_state.requests.append(query)
+                 st.session_state.responses.append(response)
+         with response_container:
+             if st.session_state['responses']:
+
+                 for i in range(len(st.session_state['responses'])):
+                     message(st.session_state['responses'][i], key=str(i))
+                     if i < len(st.session_state['requests']):
+                         message(st.session_state["requests"][i], is_user=True, key=str(i) + '_user')
+     except Exception as error_message:
+         st.error("Error occurred. Check your API key")
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ altair<5
+ streamlit==1.17.0
+ streamlit-chat
+ langchain
+ openai
+ sentence-transformers
+ unstructured
+ unstructured[local-inference]
+ pinecone-client
+ tabulate
+ chromadb
+ pypdf
utils.py ADDED
@@ -0,0 +1,83 @@
+ from langchain.document_loaders import DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from sentence_transformers import SentenceTransformer
+ import pinecone
+ from langchain.vectorstores import Pinecone
+ from langchain.document_loaders import PyPDFLoader
+ import tempfile
+ import streamlit as st
+ import openai
+
+ @st.cache(allow_output_mutation=True)
+ def get_embeddings_model():
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+     return model, embeddings
+
+ model, embeddings = get_embeddings_model()
+
+ def ingest(
+     uploaded_document,
+     pinecone_api_key,
+     pinecone_env,
+     pinecone_index_namespace,
+     chunk_size=500,
+     chunk_overlap=20
+ ):
+     # Write the uploaded file to a temporary file so PyPDFLoader can read it from disk
+     with tempfile.NamedTemporaryFile(delete=False) as tf:
+         tf.write(uploaded_document.getbuffer())
+         file_path = tf.name
+     loader = PyPDFLoader(file_path)
+     documents = loader.load()
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     docs = text_splitter.split_documents(documents)
+     # embeddings = get_embeddings_model()
+     pinecone.init(
+         api_key=pinecone_api_key,
+         environment=pinecone_env
+     )
+     index_name = pinecone_index_namespace
+     try:
+         index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
+         st.success('Document uploaded to Pinecone database successfully')
+     except Exception as error_message:
+         st.error(error_message)
+
+     # # To create embeddings on hard disk
+     # # !pip install chromadb
+     # # from langchain.vectorstores import Chroma
+     # # persist_directory = './data/embeddings'
+     # # vStore = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
+
+
+ def query_refiner(conversation, query):
+     # Note: this call relies on the OpenAI key being configured globally
+     # (openai.api_key or the OPENAI_API_KEY environment variable)
+     response = openai.Completion.create(
+         model="text-davinci-003",
+         prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:",
+         temperature=0.7,
+         max_tokens=256,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return response['choices'][0]['text']
+
+
+ def find_match(input, pinecone_api_key, pinecone_env, pinecone_index_namespace):
+     pinecone.init(
+         api_key=pinecone_api_key,
+         environment=pinecone_env
+     )
+     index = pinecone.Index(pinecone_index_namespace)
+     input_em = model.encode(input).tolist()
+     result = index.query(input_em, top_k=2, includeMetadata=True)
+     return result['matches'][0]['metadata']['text'] + "\n" + result['matches'][1]['metadata']['text']
+
+
+ def get_conversation_string():
+     conversation_string = ""
+     for i in range(len(st.session_state['responses'])-1):
+         conversation_string += "Human: " + st.session_state['requests'][i] + "\n"
+         conversation_string += "Bot: " + st.session_state['responses'][i+1] + "\n"
+     return conversation_string
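
A minimal usage sketch (not part of this commit) of how the retrieval helpers in utils.py might be exercised from a standalone script, assuming a Pinecone index has already been populated via ingest(). Every credential value below is a placeholder, and query_refiner() additionally needs openai.api_key to be set because it uses the global OpenAI client configuration.

    import openai
    from utils import query_refiner, find_match

    openai.api_key = "YOUR_OPENAI_API_KEY"        # placeholder, required by query_refiner
    PINECONE_API_KEY = "YOUR_PINECONE_API_KEY"    # placeholder
    PINECONE_ENV = "YOUR_PINECONE_ENVIRONMENT"    # placeholder
    PINECONE_INDEX = "your-index-name"            # placeholder index name

    # Refine a raw question (here with an empty conversation log), then fetch
    # the two closest document chunks from Pinecone as context.
    refined = query_refiner(conversation="", query="What is this document about?")
    context = find_match(refined,
                         pinecone_api_key=PINECONE_API_KEY,
                         pinecone_env=PINECONE_ENV,
                         pinecone_index_namespace=PINECONE_INDEX)
    print(context)

Note that importing utils also loads the all-MiniLM-L6-v2 sentence-transformers model at module level.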
👋_Introduction.py ADDED
@@ -0,0 +1,57 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Introduction",
+     page_icon="👋",
+ )
+
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+
+ # """
+ # Initialising session states
+ # """
+ if 'openai_api_key' not in st.session_state:
+     st.session_state['openai_api_key'] = None
+
+ if 'pinecone_api_key' not in st.session_state:
+     st.session_state['pinecone_api_key'] = None
+
+ if 'pinecone_env' not in st.session_state:
+     st.session_state['pinecone_env'] = None
+
+ if 'pinecone_index_namespace' not in st.session_state:
+     st.session_state['pinecone_index_namespace'] = None
+
+ if 'requests' not in st.session_state:
+     st.session_state['requests'] = []
+
+ if 'responses' not in st.session_state:
+     st.session_state['responses'] = ["How can I assist you?"]
+
+ if 'buffer_memory' not in st.session_state:
+     st.session_state.buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)
+
+
+ # """
+ # Page Content
+ # """
+ st.write("# Welcome to ChatPDF! 👋")
+
+ st.markdown(
+     """
+     ChatPDF is a user-friendly app that lets you ask questions
+     and get answers from your personal and organizational documents.
+
+     It uses OpenAI's ChatGPT to query documents and the Pinecone vector database to store them.
+
+     How to use:
+     1. Make an account on [OpenAI](https://platform.openai.com/) and [Pinecone](https://www.pinecone.io/).
+     2. Enter your credentials. These include the OpenAI API Key, Pinecone API Key, Pinecone Environment and Pinecone Index Name.
+     3. Upload documents. They will be uploaded to the Pinecone database.
+     4. Chat with your documents. The chatbot is built on top of the ChatGPT engine.
+
+     """
+ )
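
Note (not part of this commit): the steps above assume a Pinecone index already exists before documents are uploaded. A minimal sketch of creating one with the same pinecone-client API that utils.py uses; the key, environment and index name below are placeholders, and the dimension of 384 matches the all-MiniLM-L6-v2 embeddings used for ingestion.

    import pinecone

    # Placeholder credentials; use the same values entered on the Credentials page
    pinecone.init(api_key="YOUR_PINECONE_API_KEY", environment="YOUR_PINECONE_ENVIRONMENT")

    # all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the index dimension must match
    pinecone.create_index("your-index-name", dimension=384, metric="cosine")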