OuroborosM committed on
Commit
ada5c0c
·
1 Parent(s): dfc2f0a

update splitter

Browse files
Files changed (1) hide show
  1. app.py +3 -1
app.py CHANGED
@@ -60,6 +60,7 @@ from langchain.document_loaders import (
60
  UnstructuredExcelLoader
61
  )
62
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
63
  from langchain.docstore.document import Document
64
  import langchain
65
  import asyncio
@@ -355,7 +356,8 @@ def process_documents_3(ignored_files: List[str] = []) -> List[Document]:
355
  print("No new documents to load")
356
  exit(0)
357
  print(f"Loaded {len(documents)} new documents from {source_directory}")
358
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
 
359
  texts = text_splitter.split_documents(documents)
360
  print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
361
  return texts
 
60
  UnstructuredExcelLoader
61
  )
62
  from langchain.text_splitter import RecursiveCharacterTextSplitter
63
+ from langchain.text_splitter import TokenTextSplitter
64
  from langchain.docstore.document import Document
65
  import langchain
66
  import asyncio
 
356
  print("No new documents to load")
357
  exit(0)
358
  print(f"Loaded {len(documents)} new documents from {source_directory}")
359
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
360
+ text_splitter = TokenTextSplitter(chunk_size=4000, chunk_overlap=500)
361
  texts = text_splitter.split_documents(documents)
362
  print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
363
  return texts