OuroborosM committed
Commit fdba7fc · Parent: ada5c0c

update splitter

Files changed (1):
  1. app.py (+4 -2)
app.py CHANGED
@@ -322,7 +322,8 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
         print("No new documents to load")
         exit(0)
     print(f"Loaded {len(documents)} new documents from {source_directory}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
@@ -339,7 +340,8 @@ def process_documents_2(ignored_files: List[str] = []) -> List[Document]:
         print("No new documents to load")
         exit(0)
     print(f"Loaded {len(documents)} new documents from {source_directory}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
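
For context on the swap: both splitter classes come from LangChain's langchain.text_splitter module. RecursiveCharacterTextSplitter measures chunk_size in characters, while TokenTextSplitter measures it in tokens (via a tiktoken encoding), which matches the "max. {chunk_size} tokens" wording in the log message. The snippet below is a minimal sketch of that difference; the sample text and chunk sizes are illustrative, not taken from app.py, and it assumes langchain and tiktoken are installed.

# Minimal sketch (assumed LangChain API; values are illustrative, not from app.py).
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

sample = "LangChain splits long documents into overlapping chunks for retrieval. " * 200

# chunk_size here counts characters
char_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# chunk_size here counts tokens, so each chunk spans more characters of text
token_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)

char_chunks = char_splitter.split_text(sample)
token_chunks = token_splitter.split_text(sample)

print(f"character-based: {len(char_chunks)} chunks, longest {max(map(len, char_chunks))} chars")
print(f"token-based:     {len(token_chunks)} chunks, longest {max(map(len, token_chunks))} chars")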