Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +6 -5
split_files_to_excel.py
CHANGED
@@ -471,19 +471,20 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
|
471 |
docs = []
|
472 |
for i, filename in enumerate(input_folder):
|
473 |
path = filename#os.path.join(input_folder, filename)
|
474 |
-
print(f"Treating file {i}/{len(input_folder)}")
|
475 |
# Select the appropriate document loader
|
476 |
chunks=[]
|
477 |
if path.endswith(".pdf"):
|
478 |
try:
|
479 |
print("Treatment of pdf file", path)
|
480 |
raw_chunks = split_pdf(path, input_folder)
|
481 |
-
for raw_chunk in raw_chunks:
|
482 |
-
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
483 |
-
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
484 |
sb_chunks = group_chunks_by_section(raw_chunks)
|
485 |
if nb_pages > 0:
|
486 |
for sb_chunk in sb_chunks:
|
|
|
487 |
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
488 |
chunks.append(sb_chunk)
|
489 |
else:
|
@@ -602,7 +603,7 @@ def split_in_df(files, nb_pages):
|
|
602 |
else:
|
603 |
processed_files.append(file_path)
|
604 |
base_folders.append("")
|
605 |
-
print(f"BASE FOLDERS LIST : {base_folders}")
|
606 |
print("Finished processing zip files\nSplitting files into chunks...")
|
607 |
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
608 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
|
|
471 |
docs = []
|
472 |
for i, filename in enumerate(input_folder):
|
473 |
path = filename#os.path.join(input_folder, filename)
|
474 |
+
print(f"Treating file {i+1}/{len(input_folder)}")
|
475 |
# Select the appropriate document loader
|
476 |
chunks=[]
|
477 |
if path.endswith(".pdf"):
|
478 |
try:
|
479 |
print("Treatment of pdf file", path)
|
480 |
raw_chunks = split_pdf(path, input_folder)
|
481 |
+
for j, raw_chunk in enumerate(raw_chunks):
|
482 |
+
print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
|
483 |
+
raw_chunk.metadata["Base Folder"] = base_folders[j]
|
484 |
sb_chunks = group_chunks_by_section(raw_chunks)
|
485 |
if nb_pages > 0:
|
486 |
for sb_chunk in sb_chunks:
|
487 |
+
print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
|
488 |
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
489 |
chunks.append(sb_chunk)
|
490 |
else:
|
|
|
603 |
else:
|
604 |
processed_files.append(file_path)
|
605 |
base_folders.append("")
|
606 |
+
print(f"BASE FOLDERS LIST : {base_folders}, FILES LIST : {processed_files}")
|
607 |
print("Finished processing zip files\nSplitting files into chunks...")
|
608 |
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
609 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|