Spaces:

testcolab2
/

RAG_SOC_BOT

Sleeping

App Files Files Community

testcolab2 committed on Jan 21, 2024

Commit

89c2788

verified ·

1 Parent(s): a60407d

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -205

app.py CHANGED Viewed

@@ -1,214 +1,74 @@
-from langchain_community.document_loaders import DirectoryLoader
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
-from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.chains import ConversationChain
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import (
-    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
-)
-from langchain_core.prompts import PromptTemplate
 import streamlit as st
 from PyPDF2 import PdfReader
-css = '''
-<style>
-.chat-message {
-    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
-}
-.chat-message.user {
-    background-color: #2b313e
-}
-.chat-message.bot {
-    background-color: #475063
-}
-.chat-message .avatar {
-  width: 20%;
-}
-.chat-message .avatar img {
-  max-width: 78px;
-  max-height: 78px;
-  border-radius: 50%;
-  object-fit: cover;
-}
-.chat-message .message {
-  width: 80%;
-  padding: 0 1.5rem;
-  color: #fff;
-}
-'''
-bot_template = '''
-<div class="chat-message bot">
-    <div class="avatar">
-        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png">
-    </div>
-    <div class="message">{{MSG}}</div>
-</div>
-'''
-user_template = '''
-<div class="chat-message user">
-    <div class="avatar">
-        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
-    </div>
-    <div class="message">{{MSG}}</div>
-</div>
-'''
-def get_pdf_text(pdf_docs):
-    text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-def get_text_chunks(text):
-    text_splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
-    )
-    chunks = text_splitter.split_text(text)
-    return chunks
-def get_vectorstore(text_chunks):
-    # embeddings = OpenAIEmbeddings()
-    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-    return vectorstore
-def get_conversation_chain(vectorstore):
-    # llm = ChatOpenAI()
-    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
-    memory = ConversationBufferMemory(
-        memory_key='chat_history', return_messages=True)
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm,
-        retriever=vectorstore.as_retriever(),
-        memory=memory
-    )
-    return conversation_chain
-def handle_userinput(user_question):
-    response = st.session_state.conversation({'question': user_question})
-    st.session_state.chat_history = response['chat_history']
-    for i, message in enumerate(st.session_state.chat_history):
-        if i % 2 == 0:
-            st.write(user_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
         else:
-            st.write(bot_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
-def initialize_conversation_chain(text_chunks):
-    vectorstore = get_vectorstore(text_chunks)
-    return get_conversation_chain(vectorstore)
-def main():
-    st.set_page_config(page_title="Chat with multiple PDFs", page_icon="logo1.png")
-    st.write(css, unsafe_allow_html=True)
-    if "conversation" not in st.session_state:
-        st.session_state.conversation = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = None
-    st.header("Chat with multiple PDFs :books:")
-    user_question = st.text_input("Ask a question about your documents:")
-    if user_question:
-        handle_userinput(user_question)
-    with st.sidebar:
-        st.subheader("Your documents")
-        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-        if st.button("Process"):
-            with st.spinner("Processing"):
-                # Get PDF text
-                raw_text = get_pdf_text(pdf_docs)
-                # Get the text chunks
-                text_chunks = get_text_chunks(raw_text)
-                vectorstore = get_vectorstore(text_chunks)
-                # Initialize conversation chain
-                st.session_state.conversation = get_conversation_chain(vectorstore)
-if __name__ == '__main__':
-    main()
-# def main():
-#     st.set_page_config(page_title="Chat with multiple PDFs",
-#                        page_icon="logo1.png" )
-#     st.write(css, unsafe_allow_html=True)
-#     if "conversation" not in st.session_state:
-#         st.session_state.conversation = None
-#     if "chat_history" not in st.session_state:
-#         st.session_state.chat_history = None
-#     st.header("Chat with multiple PDFs :books:")
-#     user_question = st.text_input("Ask a question about your documents:")
-#     if user_question:
-#         handle_userinput(user_question)
-#     with st.sidebar:
-#         st.subheader("Your documents")
-#         pdf_docs = st.file_uploader(
-#             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-#         if st.button("Process"):
-#             with st.spinner("Processing"):
-#                 # get pdf text
-#                 raw_text = get_pdf_text(pdf_docs)
-#                 # get the text chunks
-#                 text_chunks = get_text_chunks(raw_text)
-#                 # create vector store
-#                 vectorstore = get_vectorstore(text_chunks)
-#                 # create conversation chain
-#                 st.session_state.conversation = get_conversation_chain(
-#                     vectorstore)
-#                 # Clear chat history
-#                 st.session_state.chat_history = None
-#     if st.session_state.conversation is not None:
-#         if st.session_state.chat_history is None:
-#             # Greet the user
-#             greeting = "Hello! How can I assist you with your documents?"
-#             st.write(bot_template.replace("{{MSG}}", greeting), unsafe_allow_html=True)
-# if __name__ == '__main__':
-#     main()

 import streamlit as st
+from llama_index import VectorStoreIndex, ServiceContext
+from llama_index.embeddings import HuggingFaceEmbedding
+from llama_index.llms import HuggingFaceInferenceAPI
+from llama_index.schema import Document
 from PyPDF2 import PdfReader
+class DocumentLoader:
+    """Turn an uploaded PDF into a list of ``Document`` objects."""
+
+    @staticmethod
+    def read_pdf(uploaded_file):
+        """Return the text of every page of a PDF, concatenated in page order.
+
+        Args:
+            uploaded_file: The PDF source, passed straight to ``PdfReader``.
+
+        Returns:
+            A single string with the pages' text joined without a separator.
+        """
+        pdf_reader = PdfReader(uploaded_file)
+        # Walk the pages directly and join once rather than indexing by
+        # position and growing a string. ``or ""`` makes a page with no
+        # extractable text contribute nothing instead of breaking the join.
+        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
+
+    @staticmethod
+    def load_documents(uploaded_pdf):
+        """Wrap the full text of ``uploaded_pdf`` in a one-element ``Document`` list."""
+        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
+        return [Document(text=file_contents)]
+class IndexCreator:
+    """Builds a query engine over a list of documents."""
+
+    @staticmethod
+    def create_index(documents, hf_token):
+        """Index ``documents`` and return a query engine for them.
+
+        Args:
+            documents: Documents handed to ``VectorStoreIndex.from_documents``.
+            hf_token: Token forwarded to ``HuggingFaceInferenceAPI``.
+
+        Returns:
+            The result of ``as_query_engine()`` on the newly built index.
+        """
+        inference_llm = HuggingFaceInferenceAPI(
+            model_name="HuggingFaceH4/zephyr-7b-alpha",
+            token=hf_token,
+        )
+        embedder = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")
+        context = ServiceContext.from_defaults(
+            llm=inference_llm,
+            embed_model=embedder,
+            chunk_size=800,
+            chunk_overlap=20,
+        )
+        vector_index = VectorStoreIndex.from_documents(
+            documents,
+            service_context=context,
+            show_progress=True,
+        )
+        # persist() is called without arguments, so the destination is
+        # whatever the library uses by default.
+        vector_index.storage_context.persist()
+        return vector_index.as_query_engine()
+class PDFQueryApp:
+    """Streamlit page for uploading one PDF and asking questions about it."""
+
+    def __init__(self):
+        """Render the static page text and the token and upload widgets."""
+        st.title("Private LLM")
+        st.write("Base Model : **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggineFace)**")
+        st.write("Embedding Model : **WhereIsAI/UAE-Large-V1(open-source from HuggineFace)**")
+        st.write("Ask anything from the data that you upload")
+        st.write("Note !! As its runnning on a CPU it takes times 5 to 8 mins for each response")
+        # The token is a credential, so mask it while it is being typed.
+        self.hf_token = st.text_input("Enter your Hugging Face token [Free]:", type="password")
+        self.uploaded_pdf = st.file_uploader("Upload your data[PDF for now]", type=['pdf'])
+        # Assigned by load_and_create_index() once an index has been built.
+        self.query_engine = None
+
+    def load_and_create_index(self):
+        """Build ``self.query_engine`` from the uploaded PDF, or warn if none.
+
+        When a PDF has been uploaded, its text is loaded, indexed, and the
+        resulting query engine is stored on the instance; a success message
+        follows each completed step. Otherwise a warning is shown and
+        ``self.query_engine`` is left unchanged.
+        """
+        # NOTE(review): nothing here caches the result, so every call
+        # rebuilds the index - confirm how often the script is re-executed.
+        if self.uploaded_pdf:
+            documents = DocumentLoader.load_documents(self.uploaded_pdf)
+            # Report the load only after it has actually happened.
+            st.success("Dataset has been loaded into the model succesfully")
+            self.query_engine = IndexCreator.create_index(documents, self.hf_token)
+            st.success("Vector embeddings have been succesfully created and initiated")
         else:
+            st.warning("You have to upload a PDF file first.")
+
+    def run_query(self, user_query):
+        """Send ``user_query`` to the query engine and display the response.
+
+        A warning is shown instead when no query engine exists yet or the
+        query is empty.
+        """
+        if self.query_engine and user_query:
+            with st.spinner('Fetching the response from the model Please wait !!!!...'):
+                response = self.query_engine.query(user_query)
+            st.markdown(f"**Response:** {response}")
+        else:
+            st.warning("Please load documents and create vector embeddings before querying.")
+if __name__ == "__main__":
+    # Build the page: static text plus the token and upload widgets.
+    query_app = PDFQueryApp()
+    # Index the uploaded PDF, or show a warning when there is none.
+    query_app.load_and_create_index()
+    # Ask for a question and hand it straight to the query engine.
+    query_app.run_query(st.text_input("Enter your query from the dataset:"))