Spaces: Sleeping
Update split_files_to_excel.py
Browse files - split_files_to_excel.py (+20 -20)
split_files_to_excel.py
CHANGED
@@ -475,27 +475,27 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
|
475 |
# Select the appropriate document loader
|
476 |
chunks=[]
|
477 |
if path.endswith(".pdf"):
|
478 |
-
try:
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
else:
|
491 |
-
break
|
492 |
else:
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
print("
|
|
|
|
|
499 |
elif path.endswith(".docx"):
|
500 |
try:
|
501 |
print ("Treatment of docx file", path)
|
|
|
475 |
# Select the appropriate document loader
|
476 |
chunks=[]
|
477 |
if path.endswith(".pdf"):
|
478 |
+
# try:
|
479 |
+
print("Treatment of pdf file", path)
|
480 |
+
raw_chunks = split_pdf(path, input_folder)
|
481 |
+
for j, raw_chunk in enumerate(raw_chunks):
|
482 |
+
print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
|
483 |
+
raw_chunk.metadata["Base Folder"] = base_folders[j]
|
484 |
+
sb_chunks = group_chunks_by_section(raw_chunks)
|
485 |
+
if nb_pages > 0:
|
486 |
+
for sb_chunk in sb_chunks:
|
487 |
+
print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
|
488 |
+
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
489 |
+
chunks.append(sb_chunk)
|
|
|
|
|
490 |
else:
|
491 |
+
break
|
492 |
+
else:
|
493 |
+
chunks = sb_chunks
|
494 |
+
print(f"Document splitted in {len(chunks)} chunks")
|
495 |
+
# for chunk in chunks:
|
496 |
+
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
497 |
+
# except Exception as e:
|
498 |
+
# print("Error while splitting the pdf file: ", e)
|
499 |
elif path.endswith(".docx"):
|
500 |
try:
|
501 |
print ("Treatment of docx file", path)
|