Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files - split_files_to_excel.py +3 -3
split_files_to_excel.py
CHANGED
@@ -470,7 +470,7 @@ def split_doc_in_chunks(input_folder, base_folders):
|
|
470 |
print("Treatment of pdf file", path)
|
471 |
raw_chunks = split_pdf(path, input_folder)
|
472 |
for raw_chunk in raw_chunks:
|
473 |
-
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
474 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
475 |
chunks = group_chunks_by_section(raw_chunks)
|
476 |
print(f"Document splitted in {len(chunks)} chunks")
|
@@ -581,11 +581,11 @@ def split_in_df(files):
|
|
581 |
if file_path.endswith('.zip'):
|
582 |
extracted_files = extract_zip(file_path)
|
583 |
processed_files.extend(extracted_files)
|
584 |
-
base_folders.extend([os.path.splitext(
|
585 |
else:
|
586 |
processed_files.append(file_path)
|
587 |
base_folders.append("")
|
588 |
-
print(f"BASE FOLDERS LIST : {base_folders}")
|
589 |
print("Finished processing zip files\nSplitting files into chunks...")
|
590 |
documents = split_doc_in_chunks(processed_files, base_folders)
|
591 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
|
|
470 |
print("Treatment of pdf file", path)
|
471 |
raw_chunks = split_pdf(path, input_folder)
|
472 |
for raw_chunk in raw_chunks:
|
473 |
+
#print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
474 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
475 |
chunks = group_chunks_by_section(raw_chunks)
|
476 |
print(f"Document splitted in {len(chunks)} chunks")
|
|
|
581 |
if file_path.endswith('.zip'):
|
582 |
extracted_files = extract_zip(file_path)
|
583 |
processed_files.extend(extracted_files)
|
584 |
+
base_folders.extend([[os.path.splitext(os.path.basename(file_path))[0]] * len(extracted_files)])
|
585 |
else:
|
586 |
processed_files.append(file_path)
|
587 |
base_folders.append("")
|
588 |
+
#print(f"BASE FOLDERS LIST : {base_folders}")
|
589 |
print("Finished processing zip files\nSplitting files into chunks...")
|
590 |
documents = split_doc_in_chunks(processed_files, base_folders)
|
591 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|