"""Streamlit app for asking questions about an uploaded PDF.

The uploaded PDF is read with PyPDF2, split into overlapping chunks,
embedded into a FAISS vector store, and queried through a LangChain
``RetrievalQA`` chain backed by a ``MultiQueryRetriever``.
"""

import streamlit as st
from dotenv import load_dotenv
import sys
from PyPDF2 import PdfReader
from langchain_community.llms import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import MultiQueryRetriever
from langchain.chains import RetrievalQA
# NOTE: this import rebinds ``OpenAI`` (already imported above); it must stay
# last so the name resolves to the same class as before.
from langchain.llms import OpenAI, Cohere


def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_docs: Anything ``PdfReader`` accepts; ``main`` passes the object
            returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.
    """
    pdf_reader = PdfReader(pdf_docs)
    # ``or ""`` is defensive: a page yielding no text must not break the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def get_text_chunks(text):
    """Split ``text`` into overlapping chunks on newline boundaries.

    Args:
        text: The raw document text.

    Returns:
        The list of chunks produced by ``CharacterTextSplitter`` configured
        with a chunk size of 1000 characters and an overlap of 200.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` with OpenAI embeddings into a FAISS store.

    Args:
        text_chunks: The text chunks to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    # Alternative embedding backend that was previously tried here:
    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def ll_retriver(vectorstore):
    """Wrap ``vectorstore`` in an LLM-driven ``MultiQueryRetriever``.

    Args:
        vectorstore: A vector store exposing ``as_retriever()``.

    Returns:
        A ``MultiQueryRetriever`` that uses a temperature-0 ``OpenAI`` LLM
        on top of the store's default retriever.
    """
    llm = OpenAI(temperature=0)
    return MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(),
        llm=llm,
    )


def chain(llm_based_retriever):
    """Build the question-answering chain over ``llm_based_retriever``.

    Args:
        llm_based_retriever: The retriever that supplies context documents.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` that answers with a
        temperature-0 ``Cohere`` LLM.
    """
    llm = Cohere(temperature=0)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=llm_based_retriever,
    )


def _build_qa_chain(pdf_docs):
    """Run the PDF -> chunks -> vector store -> QA chain pipeline.

    Returns:
        The QA chain, or ``None`` when the PDF yields no text chunks (an
        empty chunk list cannot be indexed).
    """
    text_chunks = get_text_chunks(get_pdf_text(pdf_docs))
    if not text_chunks:
        return None
    vectorstore = get_vectorstore(text_chunks)
    return chain(ll_retriver(vectorstore))


def main():
    """Render the Streamlit UI: upload and process a PDF, then answer questions."""
    load_dotenv()
    st.set_page_config(page_title="Chat with a PDFs", page_icon=":books:")

    # Streamlit re-runs the script on every interaction, so the chain has to
    # live in session state to survive between the upload and the question.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "Q_A_Chain" not in st.session_state:
        st.session_state.Q_A_Chain = None

    st.header("Chat with PDF :books:")

    with st.sidebar:
        st.subheader("Upload your PDF")
        pdf_docs = st.file_uploader("Upload your PDF here then Process")
        if st.button("Process"):
            if pdf_docs is None:
                # No file selected yet: PdfReader(None) would raise.
                st.warning("Please upload a PDF before clicking Process.")
            else:
                with st.spinner("Processing"):
                    qa_chain = _build_qa_chain(pdf_docs)
                if qa_chain is None:
                    st.error("No extractable text was found in this PDF.")
                else:
                    st.session_state.Q_A_Chain = qa_chain
                    st.success(
                        "PDF processed successfully, you can now ask Questions."
                    )

    if st.session_state.Q_A_Chain:
        question = st.text_input("Ask a Question about your document:")
        if st.button("Submit Question") and question:
            with st.spinner("Getting answer..."):
                # ``invoke`` replaces the deprecated ``Chain.__call__``.
                docs = st.session_state.Q_A_Chain.invoke({"query": question})
                st.write(docs["result"])


if __name__ == "__main__":
    main()