import os
import tempfile
import uuid

import streamlit as st
import openai
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Set the OpenAI API key (read from Streamlit secrets, not environment variables)
OPENAI_API_KEY = st.secrets.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    st.error("OPENAI_API_KEY not set in Streamlit secrets!")
    st.stop()
openai.api_key = OPENAI_API_KEY
# LangChain's OpenAI wrappers read the key from the environment
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
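
# The app builds three MultiVectorRetriever variants over the same PDF:
# (1) small child chunks, (2) LLM-written summaries, (3) hypothetical
# questions. All three embed a surrogate representation in Chroma but
# return the full parent chunks stored in an InMemoryStore.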
def process_pdf(uploaded_file):
    """Load an uploaded PDF and split it into ~1000-character parent chunks."""
    with st.spinner("Processing PDF..."):
        # PyPDFLoader needs a file path, so write the upload to a temp file first
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_file.getvalue())
            tmp_path = tmp.name
        try:
            docs = PyPDFLoader(tmp_path).load()
        finally:
            os.remove(tmp_path)  # clean up the temp file once the pages are loaded
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
        docs = text_splitter.split_documents(docs)
        return docs
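
# Strategy 1: index small child chunks, answer from full parent documents.
# Each parent chunk gets a UUID; its ~400-character sub-chunks are embedded
# in Chroma carrying that UUID in metadata, while the parents live in an
# InMemoryStore. At query time MultiVectorRetriever matches against the
# small chunks but hands the full parent documents to the QA chain.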
def smaller_chunks_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
    if prompt:
        with st.spinner("Processing with smaller_chunks_strategy"):
            vectorstore = Chroma(
                collection_name="full_documents",
                embedding_function=OpenAIEmbeddings(),
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            # Split each parent chunk into smaller child chunks that carry
            # their parent's ID in metadata
            child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
            sub_docs = []
            for i, doc in enumerate(docs):
                _id = doc_ids[i]
                _sub_docs = child_text_splitter.split_documents([doc])
                for _doc in _sub_docs:
                    _doc.metadata[id_key] = _id
                sub_docs.extend(_sub_docs)
            retriever.vectorstore.add_documents(sub_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result["answer"], icon="🤖")
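
# Strategy 2: index LLM-written summaries, answer from full parent documents.
# An LCEL chain summarizes each parent chunk; the summaries are embedded in
# Chroma (keyed by the parent's UUID) and retrieval swaps the matched
# summary for the original chunk before answering.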
def summary_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
    if prompt:
        with st.spinner("Processing with summary_strategy"):
            # Summarize every parent chunk, up to 5 requests in flight at once
            chain = (
                {"doc": lambda x: x.page_content}
                | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
                | ChatOpenAI(max_retries=0)
                | StrOutputParser()
            )
            summaries = chain.batch(docs, {"max_concurrency": 5})
            vectorstore = Chroma(
                collection_name="summaries",
                embedding_function=OpenAIEmbeddings(),
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            summary_docs = [
                Document(page_content=s, metadata={id_key: doc_ids[i]})
                for i, s in enumerate(summaries)
            ]
            retriever.vectorstore.add_documents(summary_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result["answer"], icon="🤖")
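
# Strategy 3: index hypothetical questions, answer from full parent documents.
# GPT-4 function calling generates three questions each chunk could answer;
# those questions are embedded in Chroma (keyed by the parent's UUID) so a
# user query is matched question-to-question rather than question-to-text.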
def hypothetical_questions_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
    if prompt:
        with st.spinner("Processing with hypothetical_questions_strategy"):
            # OpenAI function schema that forces the model to return
            # {"questions": [...]} so the output is machine-parseable
            functions = [
                {
                    "name": "hypothetical_questions",
                    "description": "Generate hypothetical questions",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "questions": {
                                "type": "array",
                                "items": {"type": "string"},
                            },
                        },
                        "required": ["questions"],
                    },
                }
            ]
            chain = (
                {"doc": lambda x: x.page_content}
                | ChatPromptTemplate.from_template(
                    "Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
                )
                | ChatOpenAI(max_retries=0, model="gpt-4").bind(
                    functions=functions, function_call={"name": "hypothetical_questions"}
                )
                | JsonKeyOutputFunctionsParser(key_name="questions")
            )
            hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
            vectorstore = Chroma(
                collection_name="hypo-questions",
                embedding_function=OpenAIEmbeddings(),
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            question_docs = []
            for i, question_list in enumerate(hypothetical_questions):
                question_docs.extend(
                    Document(page_content=q, metadata={id_key: doc_ids[i]})
                    for q in question_list
                )
            retriever.vectorstore.add_documents(question_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result["answer"], icon="🤖")
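
# Streamlit UI: upload a PDF, pick one of the three strategies, ask questions.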
def app():
    image_path = "icon.png"
    st.sidebar.image(image_path, caption="icon", use_column_width=True)
    st.title("VecDBCompare 0.0.1")
    st.sidebar.markdown("""
# 🚀 **VecDBCompare: Your Vector DB Strategy Tester**
## 🔍 **What is it?**
VecDBCompare lets you evaluate and compare three vector database retrieval strategies in a snap!
## 🤔 **How to Use?**
1. **Upload a PDF** 📄
2. Get **Three QABots** 🤖🤖🤖, each with a different strategy.
3. **Ask questions** ❓ and see how each bot responds differently.
4. **Decide** ✅ which strategy works best for you!
## 🌟 **Why VecDBCompare?**
- **Simple & Fast** ⚡: Upload, ask, and compare!
- **Real-time Comparison** 📊: See strategies in action side-by-side.
- **Empower Your Choice** 💡: Pick the best strategy for your needs.
Dive in and discover with VecDBCompare! 🚀
""")
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    if uploaded_file:
        docs = process_pdf(uploaded_file)
        option = st.selectbox(
            "Which retrieval strategy would you like to use?",
            ("Smaller Chunks", "Summary", "Hypothetical Questions"),
        )
        if option == "Smaller Chunks":
            smaller_chunks_strategy(docs)
        elif option == "Summary":
            summary_strategy(docs)
        elif option == "Hypothetical Questions":
            hypothetical_questions_strategy(docs)

if __name__ == "__main__":
    st.set_page_config(layout="wide")
    app()