Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files - split_files_to_excel.py +17 -8
split_files_to_excel.py
CHANGED
@@ -455,7 +455,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
455 |
|
456 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
457 |
|
458 |
-
def split_doc_in_chunks(input_folder):
|
459 |
docs = []
|
460 |
for i, filename in enumerate(input_folder):
|
461 |
path = filename#os.path.join(input_folder, filename)
|
@@ -465,8 +465,10 @@ def split_doc_in_chunks(input_folder):
|
|
465 |
if path.endswith(".pdf"):
|
466 |
try:
|
467 |
print("Treatment of pdf file", path)
|
468 |
-
|
469 |
-
|
|
|
|
|
470 |
print(f"Document splitted in {len(chunks)} chunks")
|
471 |
# for chunk in chunks:
|
472 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
@@ -475,9 +477,11 @@ def split_doc_in_chunks(input_folder):
|
|
475 |
elif path.endswith(".docx"):
|
476 |
try:
|
477 |
print ("Treatment of docx file", path)
|
478 |
-
|
479 |
-
|
480 |
-
|
|
|
|
|
481 |
print(f"Document splitted in {len(chunks)} chunks")
|
482 |
#if "cards-Jan 2022-SP.docx" in path:
|
483 |
#for chunk in chunks:
|
@@ -496,6 +500,7 @@ def split_doc_in_chunks(input_folder):
|
|
496 |
chunk.metadata["filename"] = filename.split("/")[-1]
|
497 |
chunk.metadata["file_directory"] = filename.split("/")[:-1]
|
498 |
chunk.metadata["filetype"] = filename.split(".")[-1]
|
|
|
499 |
if "page" in chunk.metadata:
|
500 |
counter[chunk.metadata['page']] += 1
|
501 |
for i in range(len(chunks)):
|
@@ -566,15 +571,18 @@ def extract_zip(zip_path):
|
|
566 |
|
567 |
def split_in_df(files):
|
568 |
processed_files = []
|
|
|
569 |
print("Processing zip files...")
|
570 |
for file_path in files:
|
571 |
if file_path.endswith('.zip'):
|
572 |
extracted_files = extract_zip(file_path)
|
573 |
processed_files.extend(extracted_files)
|
|
|
574 |
else:
|
575 |
processed_files.append(file_path)
|
576 |
-
|
577 |
-
|
|
|
578 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
579 |
print("Finished splitting")
|
580 |
df = pd.DataFrame()
|
@@ -590,6 +598,7 @@ def split_in_df(files):
|
|
590 |
|
591 |
doc_data["Token_Length"] = re_doc.metadata['token_length']
|
592 |
doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
|
|
|
593 |
|
594 |
# for key, value in zip(metadata_keys, metadata_values):
|
595 |
# doc_data[key] = value
|
|
|
455 |
|
456 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
457 |
|
458 |
+
def split_doc_in_chunks(input_folder, base_folders):
|
459 |
docs = []
|
460 |
for i, filename in enumerate(input_folder):
|
461 |
path = filename#os.path.join(input_folder, filename)
|
|
|
465 |
if path.endswith(".pdf"):
|
466 |
try:
|
467 |
print("Treatment of pdf file", path)
|
468 |
+
raw_chunks = split_pdf(path, input_folder)
|
469 |
+
for raw_chunk in raw_chunks:
|
470 |
+
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
471 |
+
chunks = group_chunks_by_section(raw_chunks)
|
472 |
print(f"Document splitted in {len(chunks)} chunks")
|
473 |
# for chunk in chunks:
|
474 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
|
477 |
elif path.endswith(".docx"):
|
478 |
try:
|
479 |
print ("Treatment of docx file", path)
|
480 |
+
raw_chunks = split_docx(path, input_folder)
|
481 |
+
for raw_chunk in raw_chunks:
|
482 |
+
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
483 |
+
#print(f"RAW :\n***\n{raw_chunks}")
|
484 |
+
chunks = group_chunks_by_section(raw_chunks)
|
485 |
print(f"Document splitted in {len(chunks)} chunks")
|
486 |
#if "cards-Jan 2022-SP.docx" in path:
|
487 |
#for chunk in chunks:
|
|
|
500 |
chunk.metadata["filename"] = filename.split("/")[-1]
|
501 |
chunk.metadata["file_directory"] = filename.split("/")[:-1]
|
502 |
chunk.metadata["filetype"] = filename.split(".")[-1]
|
503 |
+
chunk.metadata["Base Folder"] = base_folders[i]
|
504 |
if "page" in chunk.metadata:
|
505 |
counter[chunk.metadata['page']] += 1
|
506 |
for i in range(len(chunks)):
|
|
|
571 |
|
572 |
def split_in_df(files):
|
573 |
processed_files = []
|
574 |
+
base_folders = []
|
575 |
print("Processing zip files...")
|
576 |
for file_path in files:
|
577 |
if file_path.endswith('.zip'):
|
578 |
extracted_files = extract_zip(file_path)
|
579 |
processed_files.extend(extracted_files)
|
580 |
+
base_folders.append(os.path.splitext(os.path.basename(file_path))[0])
|
581 |
else:
|
582 |
processed_files.append(file_path)
|
583 |
+
base_folders.append("")
|
584 |
+
print("Finished processing zip files\nSplitting files into chunks...")
|
585 |
+
documents = split_doc_in_chunks(processed_files, base_folders)
|
586 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
587 |
print("Finished splitting")
|
588 |
df = pd.DataFrame()
|
|
|
598 |
|
599 |
doc_data["Token_Length"] = re_doc.metadata['token_length']
|
600 |
doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
|
601 |
+
doc_data["Base Folder"] = re_doc.metadata["Base Folder"]
|
602 |
|
603 |
# for key, value in zip(metadata_keys, metadata_values):
|
604 |
# doc_data[key] = value
|