lorrain_airag_assist

Sleeping

App Files Files Community

IAMTFRMZA committed on Oct 29, 2024

Commit

b65a2d4

verified ·

1 Parent(s): 3ccaeb2

app.py

Browse files

Files changed (1) hide show

app.py +88 -22

app.py CHANGED Viewed

@@ -1,12 +1,17 @@
 import os
 import shutil
 import streamlit as st
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_community.vectorstores import FAISS
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_community.llms import Together
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -51,33 +56,79 @@ def configure_model():
     )
def configure_retriever(pdf_loader):
    """Build a FAISS-backed retriever from the given documents.

    ``pdf_loader`` is handed to ``FAISS.from_documents`` together with a
    HuggingFace ``all-MiniLM-L6-v2`` embedding model, and the resulting
    vector store is returned in its retriever form.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(pdf_loader, embedding_model).as_retriever()
def load_documents(path):
    """Load the PDF files found directly inside ``path`` and split them.

    The extension check is case-insensitive so that files such as
    ``REPORT.PDF`` are processed instead of being silently ignored.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with the chunks produced by splitting every loaded PDF;
        empty when the directory contains no PDF file.
    """
    pdf_paths = [
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.lower().endswith('.pdf')
    ]
    if not pdf_paths:
        return []

    # The splitter does not depend on the file, so build it once.
    text_splitter = CharacterTextSplitter(chunk_size=18000, chunk_overlap=10)
    chunks = []
    for filepath in pdf_paths:
        documents = UnstructuredPDFLoader(filepath).load()
        chunks.extend(text_splitter.split_documents(documents))
    return chunks
 def process_document(path, input_query):
     """Answer ``input_query`` using the documents found under ``path``.

     Loads the documents, builds the model, the prompt and the retriever,
     wires them into a chain and returns the result of running the query
     through that chain.
     """
     chunks = load_documents(path)
     model = configure_model()
     qa_prompt = generate_prompt()
     doc_retriever = configure_retriever(chunks)
     qa_chain = create_chain(doc_retriever, qa_prompt, model)
     return inference(qa_chain, input_query)
@@ -86,16 +137,17 @@ def process_document(path, input_query):
 def main():
     """Main function to run the Streamlit app."""
     tmp_folder = '/tmp/1'
-    os.makedirs(tmp_folder,exist_ok=True)
-    st.title("Q&A PDF AI RAG Chatbot")
-    uploaded_files = st.sidebar.file_uploader("Choose PDF files", accept_multiple_files=True, type='pdf')
     if uploaded_files:
         for file in uploaded_files:
             with open(os.path.join(tmp_folder, file.name), 'wb') as f:
                 f.write(file.getbuffer())
-        st.success('File successfully uploaded. Start prompting!')
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
@@ -108,21 +160,35 @@ def main():
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
         for chat in st.session_state.chat_history:
             st.markdown(f"**Q:** {chat['question']}")
             st.markdown(f"**A:** {chat['answer']}")
             st.markdown("---")
     else:
-        st.success('Upload Document to Start Process !')
     if st.sidebar.button("REMOVE UPLOADED FILES"):
         document_count = os.listdir(tmp_folder)
         if len(document_count) > 0:
             shutil.rmtree(tmp_folder)
-            st.sidebar.write("FILES DELETED SUCCESSFULLY !!!")
         else:
-            st.sidebar.write("NO DOCUMENT FOUND TO DELETE !!! PLEASE UPLOAD DOCUMENTS TO START PROCESS !! ")
 if __name__ == "__main__":
-    main()

 import os
 import shutil
 import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_community.llms import Together
+from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import UnstructuredPDFLoader
+from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+from langchain_community.document_loaders import UnstructuredExcelLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
     )
def configure_retriever(documents):
    """Index ``documents`` in a FAISS vector store and return its retriever.

    Embeddings come from the ``sentence-transformers/all-MiniLM-L6-v2``
    model loaded through ``HuggingFaceEmbeddings``.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    embedder = HuggingFaceEmbeddings(model_name=model_id)
    store = FAISS.from_documents(documents, embedder)
    retriever = store.as_retriever()
    return retriever
def load_pdf_documents(path):
    """Load every PDF file found directly inside ``path``.

    The extension check is case-insensitive so that files such as
    ``REPORT.PDF`` are loaded instead of being silently ignored.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with the documents returned by
        ``UnstructuredPDFLoader.load()`` for each matching file, in
        directory-listing order; empty when no file matches.
    """
    documents = []
    for filename in os.listdir(path):
        if not filename.lower().endswith('.pdf'):
            continue
        loader = UnstructuredPDFLoader(os.path.join(path, filename))
        documents.extend(loader.load())
    return documents
def load_word_documents(path):
    """Load every Word (``.docx``) file found directly inside ``path``.

    The extension check is case-insensitive so that files such as
    ``REPORT.DOCX`` are loaded instead of being silently ignored.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with the documents returned by
        ``UnstructuredWordDocumentLoader.load()`` for each matching file,
        in directory-listing order; empty when no file matches.
    """
    documents = []
    for filename in os.listdir(path):
        if not filename.lower().endswith('.docx'):
            continue
        loader = UnstructuredWordDocumentLoader(os.path.join(path, filename))
        documents.extend(loader.load())
    return documents
def load_excel_documents(path):
    """Load every Excel (``.xlsx``) file found directly inside ``path``.

    The extension check is case-insensitive so that files such as
    ``BUDGET.XLSX`` are loaded instead of being silently ignored.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with the documents returned by
        ``UnstructuredExcelLoader.load()`` for each matching file, in
        directory-listing order; empty when no file matches.
    """
    documents = []
    for filename in os.listdir(path):
        if not filename.lower().endswith('.xlsx'):
            continue
        loader = UnstructuredExcelLoader(os.path.join(path, filename))
        documents.extend(loader.load())
    return documents
def load_documents(path):
    """Collect the PDF, Word and Excel documents found under ``path``.

    Delegates to the three format-specific loaders and returns their
    results as a single list: PDFs first, then Word, then Excel.
    """
    format_loaders = (
        load_pdf_documents,
        load_word_documents,
        load_excel_documents,
    )
    combined = []
    for load in format_loaders:
        combined.extend(load(path))
    return combined
def scrape_url(url):
    """Fetch ``url`` and save its text to ``data/scraped_content.txt``.

    Args:
        url: Address of the page to download.

    Returns:
        The path of the text file that was written, or ``None`` when the
        HTTP request fails (the error is shown with ``st.error``).
    """
    # NOTE(review): the output is a ``.txt`` file under ``data/``; confirm
    # that whatever indexes documents afterwards actually reads this path.
    text_file_path = "data/scraped_content.txt"

    try:
        # Without a timeout an unresponsive host would block the app
        # forever; a timeout error is a RequestException and is handled
        # below like any other request failure.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Ensure we notice bad responses
    except requests.RequestException as e:
        st.error(f"Error fetching the URL: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    # The target directory may not exist yet on a fresh deployment.
    os.makedirs(os.path.dirname(text_file_path), exist_ok=True)
    # Explicit UTF-8 so pages with non-ASCII text do not fail on platforms
    # whose default encoding cannot represent them.
    with open(text_file_path, "w", encoding="utf-8") as file:
        file.write(text)
    return text_file_path
 def process_document(path, input_query, *, chunk_size=18000, chunk_overlap=10):
     """Answer ``input_query`` using the documents found under ``path``.

     The documents are loaded, split into chunks, indexed by the retriever
     and queried through the chain built from the model and the prompt.

     Args:
         path: Directory containing the uploaded documents.
         input_query: Question to run through the chain.
         chunk_size: Maximum chunk size given to ``CharacterTextSplitter``.
             Defaults to the previously hard-coded value.
         chunk_overlap: Overlap between consecutive chunks. Defaults to
             the previously hard-coded value.

     Returns:
         Whatever ``inference`` returns for the query.
     """
     documents = load_documents(path)
     text_splitter = CharacterTextSplitter(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
     )
     split_docs = text_splitter.split_documents(documents)
     llm_model = configure_model()
     prompt = generate_prompt()
     retriever = configure_retriever(split_docs)
     chain = create_chain(retriever, prompt, llm_model)
     return inference(chain, input_query)
 def main():
     """Main function to run the Streamlit app."""
     tmp_folder = '/tmp/1'
+    os.makedirs(tmp_folder, exist_ok=True)
+    st.title("Q&A Document AI RAG Chatbot")
+    uploaded_files = st.sidebar.file_uploader("Choose PDF, Word, or Excel files", accept_multiple_files=True, type=['pdf', 'docx', 'xlsx'])
     if uploaded_files:
         for file in uploaded_files:
             with open(os.path.join(tmp_folder, file.name), 'wb') as f:
                 f.write(file.getbuffer())
+        st.success('Files successfully uploaded. Start prompting!')
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
         for chat in st.session_state.chat_history:
             st.markdown(f"**Q:** {chat['question']}")
             st.markdown(f"**A:** {chat['answer']}")
             st.markdown("---")
     else:
+        st.success('Upload Documents to Start Processing!')
+    url_input = st.sidebar.text_input("Or enter a URL to scrape content from:")
+    if st.sidebar.button("Scrape URL"):
+        if url_input:
+            file_path = scrape_url(url_input)
+            if file_path:
+                documents = load_documents(tmp_folder)
+                response = process_document(tmp_folder, "What is the content of the URL?")
+                st.session_state.chat_history.append({"question": "What is the content of the URL?", "answer": response})
+                st.success("URL content processed successfully!")
+            else:
+                st.error("Failed to process URL content.")
+        else:
+            st.warning("Please enter a valid URL.")
     if st.sidebar.button("REMOVE UPLOADED FILES"):
         document_count = os.listdir(tmp_folder)
         if len(document_count) > 0:
             shutil.rmtree(tmp_folder)
+            st.sidebar.write("FILES DELETED SUCCESSFULLY!")
         else:
+            st.sidebar.write("NO DOCUMENT FOUND TO DELETE! PLEASE UPLOAD DOCUMENTS TO START PROCESS!")
 if __name__ == "__main__":
+    main()