xuyingliKepler committed
Commit f5114b7 · 1 Parent(s): fd1f55d

Create app.py

Files changed (1): app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
+ import os
+ import uuid
+ import tempfile
+ import streamlit as st
+ import openai
+ from langchain.retrievers.multi_vector import MultiVectorRetriever
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.storage import InMemoryStore
+ from langchain.memory import ConversationBufferMemory
+ from langchain.llms import OpenAI
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.output_parser import StrOutputParser
+ from langchain.schema.document import Document
+ from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
+ from langchain.document_loaders import PyPDFLoader
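+ # Overview: each strategy below builds a MultiVectorRetriever over the same
+ # parent chunks, embedding a different proxy document (smaller child chunks,
+ # LLM summaries, or LLM-generated hypothetical questions) while returning the
+ # full parent chunk at query time via the shared "doc_id" metadata key.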
+ 
+ # Read the OpenAI API key from Streamlit secrets
+ OPENAI_API_KEY = st.secrets.get("OPENAI_API_KEY")
+ if not OPENAI_API_KEY:
+     st.error("OPENAI_API_KEY not set in Streamlit secrets!")
+     st.stop()
+ openai.api_key = OPENAI_API_KEY
+ # LangChain's OpenAI wrappers look the key up in the environment
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+ 
+ 
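+ # Load the uploaded PDF and split it into ~1000-character parent chunks.
+ # These parent chunks are what every strategy ultimately returns to the QA chain.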
+ def process_pdf(uploaded_file):
+     with st.spinner("Processing PDF..."):
+         # Persist the upload to disk, since PyPDFLoader needs a file path
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+             tmp.write(uploaded_file.getvalue())
+             tmp_path = tmp.name
+         loaders = [PyPDFLoader(tmp_path)]
+         docs = []
+         for loader in loaders:
+             docs.extend(loader.load())
+         os.remove(tmp_path)  # clean up the temporary file
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
+         docs = text_splitter.split_documents(docs)
+         return docs
+ 
+ 
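+ # Strategy 1: "smaller chunks" (small-to-big retrieval). Each 1000-character
+ # parent chunk is re-split into 400-character children; the children are
+ # embedded and searched, but the retriever returns the parent chunk whose
+ # doc_id the matching child carries.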
+ def smaller_chunks_strategy(docs):
+     with st.spinner('Processing with smaller_chunks_strategy'):
+         vectorstore = Chroma(
+             collection_name="full_documents",
+             embedding_function=OpenAIEmbeddings()
+         )
+         store = InMemoryStore()
+         id_key = "doc_id"
+         retriever = MultiVectorRetriever(
+             vectorstore=vectorstore,
+             docstore=store,
+             id_key=id_key,
+         )
+         doc_ids = [str(uuid.uuid4()) for _ in docs]
+         child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+         sub_docs = []
+         for i, doc in enumerate(docs):
+             _id = doc_ids[i]
+             _sub_docs = child_text_splitter.split_documents([doc])
+             for _doc in _sub_docs:
+                 _doc.metadata[id_key] = _id
+             sub_docs.extend(_sub_docs)
+ 
+         retriever.vectorstore.add_documents(sub_docs)
+         retriever.docstore.mset(list(zip(doc_ids, docs)))
+         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+         qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
+         prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
+         if prompt:
+             st.info(prompt, icon="🧐")
+             result = qa({"question": prompt})
+             st.success(result['answer'], icon="🤖")
+ 
+ 
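+ # Strategy 2: summaries. An LCEL chain summarizes each parent chunk with
+ # ChatOpenAI; the summaries are embedded in place of the raw text, while
+ # answers are still grounded in the full parent chunk.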
+ def summary_strategy(docs):
+     with st.spinner('Processing with summary_strategy'):
+         chain = (
+             {"doc": lambda x: x.page_content}
+             | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
+             | ChatOpenAI(max_retries=0)
+             | StrOutputParser()
+         )
+         summaries = chain.batch(docs, {"max_concurrency": 5})
+         vectorstore = Chroma(
+             collection_name="summaries",
+             embedding_function=OpenAIEmbeddings()
+         )
+         store = InMemoryStore()
+         id_key = "doc_id"
+         retriever = MultiVectorRetriever(
+             vectorstore=vectorstore,
+             docstore=store,
+             id_key=id_key,
+         )
+         doc_ids = [str(uuid.uuid4()) for _ in docs]
+         summary_docs = [Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)]
+         retriever.vectorstore.add_documents(summary_docs)
+         retriever.docstore.mset(list(zip(doc_ids, docs)))
+         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+         qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
+         prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
+         if prompt:
+             st.info(prompt, icon="🧐")
+             result = qa({"question": prompt})
+             st.success(result['answer'], icon="🤖")
+ 
+ 
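+ # Strategy 3: hypothetical questions. GPT-4 is bound to an OpenAI
+ # function-calling schema so it must return {"questions": [...]}; the three
+ # questions generated per chunk are embedded as retrieval proxies for it.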
+ def hypothetical_questions_strategy(docs):
+     with st.spinner('Processing with hypothetical_questions_strategy'):
+         # Function schema forcing the model to return a JSON array of questions
+         functions = [
+             {
+                 "name": "hypothetical_questions",
+                 "description": "Generate hypothetical questions",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "questions": {
+                             "type": "array",
+                             "items": {"type": "string"},
+                         },
+                     },
+                     "required": ["questions"]
+                 }
+             }
+         ]
+         chain = (
+             {"doc": lambda x: x.page_content}
+             | ChatPromptTemplate.from_template("Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}")
+             | ChatOpenAI(max_retries=0, model="gpt-4").bind(functions=functions, function_call={"name": "hypothetical_questions"})
+             | JsonKeyOutputFunctionsParser(key_name="questions")
+         )
+         hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
+         vectorstore = Chroma(
+             collection_name="hypo-questions",
+             embedding_function=OpenAIEmbeddings()
+         )
+         store = InMemoryStore()
+         id_key = "doc_id"
+         retriever = MultiVectorRetriever(
+             vectorstore=vectorstore,
+             docstore=store,
+             id_key=id_key,
+         )
+         doc_ids = [str(uuid.uuid4()) for _ in docs]
+         question_docs = []
+         for i, question_list in enumerate(hypothetical_questions):
+             question_docs.extend([Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list])
+         retriever.vectorstore.add_documents(question_docs)
+         retriever.docstore.mset(list(zip(doc_ids, docs)))
+         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+         qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
+         prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
+         if prompt:
+             st.info(prompt, icon="🧐")
+             result = qa({"question": prompt})
+             st.success(result['answer'], icon="🤖")
+ 
+ 
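+ # Streamlit UI: upload a PDF, pick one of the three retrieval strategies,
+ # and chat with the resulting ConversationalRetrievalChain.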
+ def app():
+     image_path = "icon.png"
+     st.sidebar.image(image_path, caption="icon", use_column_width=True)
+     st.title("VecDBCompare 0.0.1")
+     st.sidebar.markdown("""
+ # 🚀 **VecDBCompare: Your Vector DB Strategy Tester**
+ ## 📌 **What is it?**
+ VecDBCompare lets you evaluate and compare three vector database retrieval strategies in a snap!
+ ## 📤 **How to Use?**
+ 1. **Upload a PDF** 📄
+ 2. Get **three QA bots** 🤖🤖🤖, each with a different strategy.
+ 3. **Ask questions** ❓ and see how each bot responds differently.
+ 4. **Decide** ✅ which strategy works best for you!
+ ## 🌟 **Why VecDBCompare?**
+ - **Simple & Fast** ⚡: Upload, ask, and compare!
+ - **Real-time Comparison** 🔍: See strategies in action side by side.
+ - **Empower Your Choice** 💡: Pick the best strategy for your needs.
+ 
+ Dive in and discover with VecDBCompare! 🌐
+ """)
+     uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+     if uploaded_file:
+         docs = process_pdf(uploaded_file)
+         option = st.selectbox(
+             "Which retrieval strategy would you like to use?",
+             ("Smaller Chunks", "Summary", "Hypothetical Questions")
+         )
+         if option == 'Smaller Chunks':
+             smaller_chunks_strategy(docs)
+         elif option == 'Summary':
+             summary_strategy(docs)
+         elif option == 'Hypothetical Questions':
+             hypothetical_questions_strategy(docs)
+ 
+ 
+ if __name__ == "__main__":
+     st.set_page_config(layout="wide")
+     app()