Commit fdba7fc
Parent(s): ada5c0c
update splitter
app.py
CHANGED
@@ -322,7 +322,8 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
         print("No new documents to load")
         exit(0)
     print(f"Loaded {len(documents)} new documents from {source_directory}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
@@ -339,7 +340,8 @@ def process_documents_2(ignored_files: List[str] = []) -> List[Document]:
         print("No new documents to load")
         exit(0)
     print(f"Loaded {len(documents)} new documents from {source_directory}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
     return texts
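For context, the commit swaps LangChain's RecursiveCharacterTextSplitter (which measures chunk_size in characters) for TokenTextSplitter (which measures it in tokens), so the "max. {chunk_size} tokens each" log message now matches how chunks are actually sized. Below is a minimal standalone sketch of the new splitter call, not taken from app.py: the import paths follow the classic langchain layout, and the chunk_size/chunk_overlap values are placeholders, since the real values are defined elsewhere in app.py.

# Hypothetical sketch; import paths and chunk settings are assumptions, not app.py code.
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document

chunk_size = 500       # placeholder; app.py defines its own value
chunk_overlap = 50     # placeholder; app.py defines its own value

documents = [Document(page_content="Some long text to split into token-sized chunks ...")]

# TokenTextSplitter counts tokens (via tiktoken) rather than characters,
# so chunk_size bounds the number of tokens per chunk.
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")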