IAMTFRMZA committed on
Commit
b65a2d4
·
verified ·
1 Parent(s): 3ccaeb2
Files changed (1) hide show
  1. app.py +88 -22
app.py CHANGED
@@ -1,12 +1,17 @@
1
  import os
2
  import shutil
3
  import streamlit as st
 
 
 
4
  from langchain_core.prompts import ChatPromptTemplate
5
- from langchain_community.vectorstores import FAISS
6
  from langchain_core.output_parsers import StrOutputParser
7
  from langchain_core.runnables import RunnablePassthrough
8
  from langchain_community.llms import Together
 
9
  from langchain_community.document_loaders import UnstructuredPDFLoader
 
 
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain.embeddings import HuggingFaceEmbeddings
12
 
@@ -51,33 +56,79 @@ def configure_model():
51
  )
52
 
53
 
54
def configure_retriever(pdf_loader):
    """Build a retriever over the given documents using a FAISS vector store.

    The documents in ``pdf_loader`` are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, indexed through
    ``FAISS.from_documents`` and returned via ``as_retriever()``.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    faiss_store = FAISS.from_documents(pdf_loader, embedding_model)
    retriever = faiss_store.as_retriever()
    return retriever
59
 
60
 
61
def load_documents(path):
    """Load the PDF files located directly in ``path`` and split them into chunks.

    Args:
        path: Directory to scan (sub-directories are not visited).

    Returns:
        A flat list of the chunks produced by ``CharacterTextSplitter`` for
        every PDF found; an empty list when the directory holds no PDF.
    """
    # Match the extension case-insensitively so a file named "REPORT.PDF"
    # is processed instead of being silently ignored.
    pdf_paths = [
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.lower().endswith('.pdf')
    ]
    if not pdf_paths:
        return []

    # The splitter configuration is the same for every file, so build it once
    # rather than once per PDF.
    text_splitter = CharacterTextSplitter(chunk_size=18000, chunk_overlap=10)

    chunks = []
    for filepath in pdf_paths:
        pages = UnstructuredPDFLoader(filepath).load()
        chunks.extend(text_splitter.split_documents(pages))
    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
75
  def process_document(path, input_query):
76
  """Process the document by setting up the chain and invoking it with the input query."""
77
- pdf_loader = load_documents(path)
 
 
78
  llm_model = configure_model()
79
  prompt = generate_prompt()
80
- retriever = configure_retriever(pdf_loader)
81
  chain = create_chain(retriever, prompt, llm_model)
82
  response = inference(chain, input_query)
83
  return response
@@ -86,16 +137,17 @@ def process_document(path, input_query):
86
  def main():
87
  """Main function to run the Streamlit app."""
88
  tmp_folder = '/tmp/1'
89
- os.makedirs(tmp_folder,exist_ok=True)
90
 
91
- st.title("Q&A PDF AI RAG Chatbot")
92
 
93
- uploaded_files = st.sidebar.file_uploader("Choose PDF files", accept_multiple_files=True, type='pdf')
94
  if uploaded_files:
95
  for file in uploaded_files:
96
  with open(os.path.join(tmp_folder, file.name), 'wb') as f:
97
  f.write(file.getbuffer())
98
- st.success('File successfully uploaded. Start prompting!')
 
99
  if 'chat_history' not in st.session_state:
100
  st.session_state.chat_history = []
101
 
@@ -108,21 +160,35 @@ def main():
108
 
109
  if st.button("Clear Chat History"):
110
  st.session_state.chat_history = []
 
111
  for chat in st.session_state.chat_history:
112
  st.markdown(f"**Q:** {chat['question']}")
113
  st.markdown(f"**A:** {chat['answer']}")
114
  st.markdown("---")
115
  else:
116
- st.success('Upload Document to Start Process !')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  if st.sidebar.button("REMOVE UPLOADED FILES"):
119
  document_count = os.listdir(tmp_folder)
120
  if len(document_count) > 0:
121
  shutil.rmtree(tmp_folder)
122
- st.sidebar.write("FILES DELETED SUCCESSFULLY !!!")
123
  else:
124
- st.sidebar.write("NO DOCUMENT FOUND TO DELETE !!! PLEASE UPLOAD DOCUMENTS TO START PROCESS !! ")
125
-
126
 
127
  if __name__ == "__main__":
128
- main()
 
1
  import os
2
  import shutil
3
  import streamlit as st
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import pandas as pd
7
  from langchain_core.prompts import ChatPromptTemplate
 
8
  from langchain_core.output_parsers import StrOutputParser
9
  from langchain_core.runnables import RunnablePassthrough
10
  from langchain_community.llms import Together
11
+ from langchain_community.vectorstores import FAISS
12
  from langchain_community.document_loaders import UnstructuredPDFLoader
13
+ from langchain_community.document_loaders import UnstructuredWordDocumentLoader
14
+ from langchain_community.document_loaders import UnstructuredExcelLoader
15
  from langchain.text_splitter import CharacterTextSplitter
16
  from langchain.embeddings import HuggingFaceEmbeddings
17
 
 
56
  )
57
 
58
 
59
def configure_retriever(documents):
    """Return a retriever backed by a FAISS vector store built from ``documents``.

    Embeddings come from the ``sentence-transformers/all-MiniLM-L6-v2``
    Hugging Face model; the store is created with ``FAISS.from_documents``
    and exposed through ``as_retriever()``.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(documents, embedding_model).as_retriever()
64
 
65
 
66
def load_pdf_documents(path):
    """Load every PDF file found directly inside ``path``.

    Args:
        path: Directory to scan (sub-directories are not visited).

    Returns:
        A flat list of the documents returned by ``UnstructuredPDFLoader``
        for each matching file; an empty list when no PDF is present.
    """
    loaded = []
    for file_name in os.listdir(path):
        # Compare case-insensitively: an upload named "REPORT.PDF" would
        # otherwise be skipped without any warning.
        if file_name.lower().endswith('.pdf'):
            loader = UnstructuredPDFLoader(os.path.join(path, file_name))
            loaded.extend(loader.load())
    return loaded
75
+
76
+
77
def load_word_documents(path):
    """Load every Word (``.docx``) file found directly inside ``path``.

    Args:
        path: Directory to scan (sub-directories are not visited).

    Returns:
        A flat list of the documents returned by
        ``UnstructuredWordDocumentLoader`` for each matching file; an empty
        list when no Word file is present.
    """
    loaded = []
    for file_name in os.listdir(path):
        # Compare case-insensitively: an upload named "NOTES.DOCX" would
        # otherwise be skipped without any warning.
        if file_name.lower().endswith('.docx'):
            loader = UnstructuredWordDocumentLoader(os.path.join(path, file_name))
            loaded.extend(loader.load())
    return loaded
86
+
87
+
88
def load_excel_documents(path):
    """Load every Excel (``.xlsx``) file found directly inside ``path``.

    Args:
        path: Directory to scan (sub-directories are not visited).

    Returns:
        A flat list of the documents returned by ``UnstructuredExcelLoader``
        for each matching file; an empty list when no Excel file is present.
    """
    loaded = []
    for file_name in os.listdir(path):
        # Compare case-insensitively: an upload named "BUDGET.XLSX" would
        # otherwise be skipped without any warning.
        if file_name.lower().endswith('.xlsx'):
            loader = UnstructuredExcelLoader(os.path.join(path, file_name))
            loaded.extend(loader.load())
    return loaded
97
+
98
+
99
def load_documents(path):
    """Gather the PDF, Word and Excel documents found under ``path``.

    Returns a single list with the PDF documents first, followed by the Word
    documents and then the Excel documents.
    """
    return [
        *load_pdf_documents(path),
        *load_word_documents(path),
        *load_excel_documents(path),
    ]
105
+
106
+
107
def scrape_url(url):
    """Download ``url`` and save the page's text to a local file.

    Args:
        url: Address of the page to fetch.

    Returns:
        The path of the text file that was written
        (``"data/scraped_content.txt"``), or ``None`` when the page could not
        be fetched or the file could not be written. In the failure case the
        error is reported through ``st.error``.
    """
    text_file_path = "data/scraped_content.txt"

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive
        # host, which would freeze the app for the user.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Ensure we notice bad responses
    except requests.RequestException as e:
        st.error(f"Error fetching the URL: {e}")
        return None

    text = BeautifulSoup(response.content, 'html.parser').get_text()

    try:
        # The target directory is not guaranteed to exist yet.
        os.makedirs(os.path.dirname(text_file_path), exist_ok=True)
        # Write UTF-8 explicitly: scraped pages routinely contain characters
        # that the platform's default encoding cannot represent.
        with open(text_file_path, "w", encoding="utf-8") as file:
            file.write(text)
    except OSError as e:
        st.error(f"Error saving the scraped content: {e}")
        return None

    return text_file_path
122
 
123
 
124
  def process_document(path, input_query):
125
  """Process the document by setting up the chain and invoking it with the input query."""
126
+ documents = load_documents(path)
127
+ text_splitter = CharacterTextSplitter(chunk_size=18000, chunk_overlap=10)
128
+ split_docs = text_splitter.split_documents(documents)
129
  llm_model = configure_model()
130
  prompt = generate_prompt()
131
+ retriever = configure_retriever(split_docs)
132
  chain = create_chain(retriever, prompt, llm_model)
133
  response = inference(chain, input_query)
134
  return response
 
137
  def main():
138
  """Main function to run the Streamlit app."""
139
  tmp_folder = '/tmp/1'
140
+ os.makedirs(tmp_folder, exist_ok=True)
141
 
142
+ st.title("Q&A Document AI RAG Chatbot")
143
 
144
+ uploaded_files = st.sidebar.file_uploader("Choose PDF, Word, or Excel files", accept_multiple_files=True, type=['pdf', 'docx', 'xlsx'])
145
  if uploaded_files:
146
  for file in uploaded_files:
147
  with open(os.path.join(tmp_folder, file.name), 'wb') as f:
148
  f.write(file.getbuffer())
149
+ st.success('Files successfully uploaded. Start prompting!')
150
+
151
  if 'chat_history' not in st.session_state:
152
  st.session_state.chat_history = []
153
 
 
160
 
161
  if st.button("Clear Chat History"):
162
  st.session_state.chat_history = []
163
+
164
  for chat in st.session_state.chat_history:
165
  st.markdown(f"**Q:** {chat['question']}")
166
  st.markdown(f"**A:** {chat['answer']}")
167
  st.markdown("---")
168
  else:
169
+ st.success('Upload Documents to Start Processing!')
170
+
171
+ url_input = st.sidebar.text_input("Or enter a URL to scrape content from:")
172
+ if st.sidebar.button("Scrape URL"):
173
+ if url_input:
174
+ file_path = scrape_url(url_input)
175
+ if file_path:
176
+ documents = load_documents(tmp_folder)
177
+ response = process_document(tmp_folder, "What is the content of the URL?")
178
+ st.session_state.chat_history.append({"question": "What is the content of the URL?", "answer": response})
179
+ st.success("URL content processed successfully!")
180
+ else:
181
+ st.error("Failed to process URL content.")
182
+ else:
183
+ st.warning("Please enter a valid URL.")
184
 
185
  if st.sidebar.button("REMOVE UPLOADED FILES"):
186
  document_count = os.listdir(tmp_folder)
187
  if len(document_count) > 0:
188
  shutil.rmtree(tmp_folder)
189
+ st.sidebar.write("FILES DELETED SUCCESSFULLY!")
190
  else:
191
+ st.sidebar.write("NO DOCUMENT FOUND TO DELETE! PLEASE UPLOAD DOCUMENTS TO START PROCESS!")
 
192
 
193
  if __name__ == "__main__":
194
+ main()