# VecDBCompare / app.py
# Page residue from the web file viewer (not part of the program):
#   xuyingliKepler's picture | Update app.py | 613ac12 | raw | history blame | 8.7 kB
import os
import uuid
import tempfile
import streamlit as st
import openai
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import uuid
from langchain.schema.document import Document
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.document_loaders import PyPDFLoader
# Set OpenAI API key.
# The key is read from Streamlit's secrets store. Indexing st.secrets raises
# when the entry (KeyError) or the secrets file itself (FileNotFoundError) is
# missing, so both cases are folded into the same "not configured" path as an
# empty value instead of surfacing as an unhandled traceback.
try:
    OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
except (KeyError, FileNotFoundError):
    OPENAI_API_KEY = None
if not OPENAI_API_KEY:
    st.error("OPENAI_API_KEY not set in Streamlit secrets!")
    raise SystemExit
openai.api_key = OPENAI_API_KEY
# The LangChain wrappers in this file (OpenAIEmbeddings, ChatOpenAI, OpenAI)
# are constructed without an explicit key; export it so they can pick it up
# from the environment. setdefault keeps any value that is already set.
os.environ.setdefault("OPENAI_API_KEY", OPENAI_API_KEY)
def process_pdf(uploaded_file):
    """Load an uploaded PDF and split it into chunked documents.

    The upload is written to a temporary ``.pdf`` file, loaded with
    ``PyPDFLoader``, and split with a ``RecursiveCharacterTextSplitter``
    configured with ``chunk_size=1000``. The temporary file is removed once
    loading has finished, whether or not loading succeeded.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the PDF
            bytes (here, the value returned by ``st.file_uploader``).

    Returns:
        The list of documents produced by ``split_documents``.
    """
    with st.spinner("Processing PDF..."):
        # The loader is given a path, so the bytes are spooled to disk first.
        # delete=False lets the file be reopened by name after this handle is
        # closed; cleanup is therefore done explicitly below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_file.getvalue())
            tmp_path = tmp.name
        try:
            pages = PyPDFLoader(tmp_path).load()
        finally:
            # Best-effort cleanup: a failure to delete the scratch file must
            # not hide a loader error or break an otherwise successful run.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
        return text_splitter.split_documents(pages)
def smaller_chunks_strategy(docs):
    """Answer a question using the "smaller chunks" retrieval strategy.

    Each document in ``docs`` is split again with ``chunk_size=400``. The
    resulting child chunks are tagged with their parent's generated id and
    added to a Chroma collection, while the parent documents are stored in an
    in-memory docstore under the same ids. A ``ConversationalRetrievalChain``
    built on that retriever answers the question typed into the text box.

    Args:
        docs: Parent documents (as returned by ``process_pdf``).

    Returns:
        None. The question and the answer are rendered with Streamlit.
    """
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
    if not prompt:
        return
    if not docs:
        # Nothing to index (e.g. no text was extracted); avoid sending an
        # empty batch to the embedding / vector store layer.
        st.warning("No text could be extracted from the PDF.")
        return
    with st.spinner('Processing with smaller_chunks_strategy'):
        id_key = "doc_id"
        retriever = MultiVectorRetriever(
            vectorstore=Chroma(
                collection_name="full_documents",
                embedding_function=OpenAIEmbeddings(),
            ),
            docstore=InMemoryStore(),
            id_key=id_key,
        )
        doc_ids = [str(uuid.uuid4()) for _ in docs]
        child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
        sub_docs = []
        for parent_id, parent in zip(doc_ids, docs):
            children = child_text_splitter.split_documents([parent])
            # Tag every child with its parent's id so the two stores share a key.
            for child in children:
                child.metadata[id_key] = parent_id
            sub_docs.extend(children)
        retriever.vectorstore.add_documents(sub_docs)
        retriever.docstore.mset(list(zip(doc_ids, docs)))
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
        st.info(prompt, icon="🧐")
        result = qa({"question": prompt})
        # The icon must be a single emoji; the previous value was a
        # mis-decoded byte sequence of the robot emoji.
        st.success(result['answer'], icon="🤖")
def summary_strategy(docs):
    """Answer a question using the "summary" retrieval strategy.

    A chat-model chain summarizes every document in ``docs`` (batched with
    ``max_concurrency`` 5). The summaries are added to a Chroma collection,
    each tagged with a generated id, and the original documents are stored in
    an in-memory docstore under the same ids. A
    ``ConversationalRetrievalChain`` built on that retriever answers the
    question typed into the text box.

    Args:
        docs: Documents to summarize and index (as returned by ``process_pdf``).

    Returns:
        None. The question and the answer are rendered with Streamlit.
    """
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
    if not prompt:
        return
    if not docs:
        # Nothing to summarize or index; avoid sending an empty batch to the
        # embedding / vector store layer.
        st.warning("No text could be extracted from the PDF.")
        return
    with st.spinner('Processing with summary_strategy'):
        summarize = (
            {"doc": lambda x: x.page_content}
            | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
            | ChatOpenAI(max_retries=0)
            | StrOutputParser()
        )
        summaries = summarize.batch(docs, {"max_concurrency": 5})
        id_key = "doc_id"
        retriever = MultiVectorRetriever(
            vectorstore=Chroma(
                collection_name="summaries",
                embedding_function=OpenAIEmbeddings(),
            ),
            docstore=InMemoryStore(),
            id_key=id_key,
        )
        doc_ids = [str(uuid.uuid4()) for _ in docs]
        # One summary document per source document, sharing the source's id.
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, docs)))
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
        st.info(prompt, icon="🧐")
        result = qa({"question": prompt})
        # The icon must be a single emoji; the previous value was a
        # mis-decoded byte sequence of the robot emoji.
        st.success(result['answer'], icon="🤖")
def hypothetical_questions_strategy(docs):
    """Answer a question using the "hypothetical questions" strategy.

    A function-calling chat chain (model ``gpt-4``) is asked to produce a list
    of questions for every document in ``docs`` (batched with
    ``max_concurrency`` 5). Each generated question is added to a Chroma
    collection tagged with the generated id of its source document, and the
    source documents are stored in an in-memory docstore under the same ids.
    A ``ConversationalRetrievalChain`` built on that retriever answers the
    question typed into the text box.

    Args:
        docs: Documents to index (as returned by ``process_pdf``).

    Returns:
        None. The question and the answer are rendered with Streamlit.
    """
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
    if not prompt:
        return
    if not docs:
        # Nothing to generate questions for; avoid sending an empty batch to
        # the embedding / vector store layer.
        st.warning("No text could be extracted from the PDF.")
        return
    with st.spinner('Processing with hypothetical_questions_strategy'):
        # Function schema the model is forced to call: a single required
        # "questions" field holding an array of strings.
        functions = [
            {
                "name": "hypothetical_questions",
                "description": "Generate hypothetical questions",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "questions": {
                            "type": "array",
                            "items": {
                                "type": "string"
                            },
                        },
                    },
                    "required": ["questions"]
                }
            }
        ]
        question_chain = (
            {"doc": lambda x: x.page_content}
            | ChatPromptTemplate.from_template("Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}")
            | ChatOpenAI(max_retries=0, model="gpt-4").bind(functions=functions, function_call={"name": "hypothetical_questions"})
            | JsonKeyOutputFunctionsParser(key_name="questions")
        )
        hypothetical_questions = question_chain.batch(docs, {"max_concurrency": 5})
        id_key = "doc_id"
        retriever = MultiVectorRetriever(
            vectorstore=Chroma(
                collection_name="hypo-questions",
                embedding_function=OpenAIEmbeddings(),
            ),
            docstore=InMemoryStore(),
            id_key=id_key,
        )
        doc_ids = [str(uuid.uuid4()) for _ in docs]
        # Flatten the per-document question lists, tagging each question with
        # the id of the document it was generated from.
        question_docs = [
            Document(page_content=question, metadata={id_key: doc_id})
            for doc_id, question_list in zip(doc_ids, hypothetical_questions)
            for question in question_list
        ]
        retriever.vectorstore.add_documents(question_docs)
        retriever.docstore.mset(list(zip(doc_ids, docs)))
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
        st.info(prompt, icon="🧐")
        result = qa({"question": prompt})
        # The icon must be a single emoji; the previous value was a
        # mis-decoded byte sequence of the robot emoji.
        st.success(result['answer'], icon="🤖")
def app():
    """Render the VecDBCompare page.

    Draws the sidebar (icon and usage notes) and the title, then shows a PDF
    uploader. Once a file is uploaded it is processed with ``process_pdf`` and
    the retrieval strategy chosen in the select box is run on the result.
    """
    image_path = "icon.png"
    st.sidebar.image(image_path, caption="icon", use_column_width=True)
    st.title("VecDBCompare 0.0.1")
    # The markdown body is kept at column 0 so it is not rendered as an
    # indented code block. Emoji that had been mis-decoded are restored, and a
    # blank line separates the closing sentence from the bullet list so it is
    # not absorbed into the last bullet.
    st.sidebar.markdown("""
# 🚀 **VecDBCompare: Your Vector DB Strategy Tester**
## 📌 **What is it?**
VecDBCompare lets you evaluate and compare three vector database retrieval strategies in a snap!
## 📤 **How to Use?**
1. **Upload a PDF** 📄
2. Get **Three QABots** 🤖🤖🤖, each with a different strategy.
3. **Ask questions** ❓ and see how each bot responds differently.
4. **Decide** ✅ which strategy works best for you!
## 🌟 **Why VecDBCompare?**
- **Simple & Fast** ⚡: Upload, ask, and compare!
- **Real-time Comparison** 🔍: See strategies in action side-by-side.
- **Empower Your Choice** 💡: Pick the best strategy for your needs.

Dive in and discover with VecDBCompare! 🌐
""")
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    if not uploaded_file:
        return
    docs = process_pdf(uploaded_file)
    # Select-box label -> handler; insertion order is the display order.
    strategies = {
        "Smaller Chunks": smaller_chunks_strategy,
        "Summary": summary_strategy,
        "Hypothetical Questions": hypothetical_questions_strategy,
    }
    option = st.selectbox(
        "Which retrieval strategy would you like to use?",
        tuple(strategies),
    )
    run_strategy = strategies.get(option)
    if run_strategy is not None:
        run_strategy(docs)
if __name__ == "__main__":
    # Script entry point: select the wide page layout, then build the UI.
    st.set_page_config(layout="wide")
    app()