Spaces:
Runtime error
Runtime error
Commit
·
ada5c0c
1
Parent(s):
dfc2f0a
update splitter
Browse files
app.py
CHANGED
@@ -60,6 +60,7 @@ from langchain.document_loaders import (
|
|
60 |
UnstructuredExcelLoader
|
61 |
)
|
62 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
63 |
from langchain.docstore.document import Document
|
64 |
import langchain
|
65 |
import asyncio
|
@@ -355,7 +356,8 @@ def process_documents_3(ignored_files: List[str] = []) -> List[Document]:
|
|
355 |
print("No new documents to load")
|
356 |
exit(0)
|
357 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
358 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
|
|
|
359 |
texts = text_splitter.split_documents(documents)
|
360 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
361 |
return texts
|
|
|
60 |
UnstructuredExcelLoader
|
61 |
)
|
62 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
63 |
+
from langchain.text_splitter import TokenTextSplitter
|
64 |
from langchain.docstore.document import Document
|
65 |
import langchain
|
66 |
import asyncio
|
|
|
356 |
print("No new documents to load")
|
357 |
exit(0)
|
358 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
359 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
|
360 |
+
text_splitter = TokenTextSplitter(chunk_size=4000, chunk_overlap=500)
|
361 |
texts = text_splitter.split_documents(documents)
|
362 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
363 |
return texts
|