Hugging Face Spaces (Space status: Sleeping)
Commit: Update split_files_to_excel.py
Changed files: split_files_to_excel.py (+7 −4)
Diff (reconstructed into unified format from the garbled side-by-side view; the four
removed lines were not rendered in the left pane, but the +7/−4 summary and the
right pane show the change is the same four-line conditional wrapped in a new
try/except and re-indented one level):

@@ -359,10 +359,13 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chunk_size
         is_first_chunk = True # Keep track of the first chunk in the document
         to_encode += doc.page_content
         # if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
-        if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
-            # print('SAME DOC')
-            skip_next = True
-            to_encode += documents[i+1].page_content
+        try:
+            if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
+                # print('SAME DOC')
+                skip_next = True
+                to_encode += documents[i+1].page_content
+        except Exception as e:
+            print(e)
         #print(f"to_encode:\n{to_encode}")
         encoded = tokenizer.encode(to_encode)#encode the current document
         if len(encoded) < min_chunk_size and not skip_next: