bohmian committed on
Commit
5e078a0
·
verified ·
1 Parent(s): 02f90d5

Update web_scrape_and_pdf_loader.py

Browse files
Files changed (1) hide show
  1. web_scrape_and_pdf_loader.py +5 -6
web_scrape_and_pdf_loader.py CHANGED
@@ -114,15 +114,13 @@ def pdf_loader(url, country):
114
  # Same as above but for pdf in local directory
115
  def pdf_loader_local(pdf_filename, country):
116
  try:
117
- with open(pdf_filename, 'wb') as f: # save the pdf locally first
118
- f.write(response.content)
119
  loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
120
  raw_pdf_documents = loader.load()
121
  raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
122
  return raw_pdf_documents
123
 
124
  except Exception as e:
125
- print(f"Failed to load for {url}")
126
  return False
127
 
128
  # If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
@@ -182,7 +180,7 @@ def process_links_load_documents(all_links):
182
  # Note: If we are using a lot more data than can be stored in the RAM or when in production,
183
  # better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
184
 
185
- def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
186
  chromadb_dir = "chromadb"
187
  if not os.path.exists(chromadb_dir):
188
  os.makedirs(chromadb_dir)
@@ -192,7 +190,7 @@ def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country
192
  chunk_size=chunk_size, chunk_overlap=chunk_overlap
193
  )
194
  split_documents = text_splitter.split_documents(all_documents)
195
- persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
196
 
197
  # Build the vector database using Chroma and persist it in a local directory
198
  chroma_db = Chroma.from_documents(split_documents,
@@ -222,8 +220,9 @@ def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
222
  split_documents = text_splitter.split_documents(all_documents)
223
  split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
224
  bm25_retriever = BM25Retriever.from_documents(split_documents)
225
- filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"
226
 
227
  with open(filename, 'wb') as handle:
228
  pickle.dump(bm25_retriever, handle)
229
 
 
 
114
  # Same as above but for pdf in local directory
115
  def pdf_loader_local(pdf_filename, country):
116
  try:
 
 
117
  loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
118
  raw_pdf_documents = loader.load()
119
  raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
120
  return raw_pdf_documents
121
 
122
  except Exception as e:
123
+ print(f"Failed to load for {pdf_filename} {e}")
124
  return False
125
 
126
  # If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
 
180
  # Note: If we are using a lot more data than can be stored in the RAM or when in production,
181
  # better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
182
 
183
+ def setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country):
184
  chromadb_dir = "chromadb"
185
  if not os.path.exists(chromadb_dir):
186
  os.makedirs(chromadb_dir)
 
190
  chunk_size=chunk_size, chunk_overlap=chunk_overlap
191
  )
192
  split_documents = text_splitter.split_documents(all_documents)
193
+ persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
194
 
195
  # Build the vector database using Chroma and persist it in a local directory
196
  chroma_db = Chroma.from_documents(split_documents,
 
220
  split_documents = text_splitter.split_documents(all_documents)
221
  split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
222
  bm25_retriever = BM25Retriever.from_documents(split_documents)
223
+ filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle"
224
 
225
  with open(filename, 'wb') as handle:
226
  pickle.dump(bm25_retriever, handle)
227
 
228
+ return True # to let user know this process is done