Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +14 -6
split_files_to_excel.py
CHANGED
@@ -68,7 +68,7 @@ text_splitter = CharacterTextSplitter(
|
|
68 |
|
69 |
def function_split_call(fi_input, dropdown, choice, chunk_size):
|
70 |
if choice == "Intelligent split":
|
71 |
-
return split_in_df(fi_input)
|
72 |
elif choice == "Non intelligent split":
|
73 |
return non_intelligent_split(fi_input, chunk_size)
|
74 |
else:
|
@@ -78,7 +78,7 @@ def change_textbox(dropdown,radio):
|
|
78 |
if len(dropdown) == 0 :
|
79 |
dropdown = ["introduction", "objective", "summary", "conclusion"]
|
80 |
if radio == "Intelligent split by keywords":
|
81 |
-
return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(visible=
|
82 |
elif radio == "Non intelligent split":
|
83 |
return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
|
84 |
else:
|
@@ -464,7 +464,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
464 |
|
465 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
466 |
|
467 |
-
def split_doc_in_chunks(input_folder, base_folders):
|
468 |
docs = []
|
469 |
for i, filename in enumerate(input_folder):
|
470 |
path = filename#os.path.join(input_folder, filename)
|
@@ -478,7 +478,15 @@ def split_doc_in_chunks(input_folder, base_folders):
|
|
478 |
for raw_chunk in raw_chunks:
|
479 |
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
480 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
482 |
print(f"Document splitted in {len(chunks)} chunks")
|
483 |
# for chunk in chunks:
|
484 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
@@ -579,7 +587,7 @@ def extract_zip(zip_path):
|
|
579 |
zip_ref.extract(file_info.filename)
|
580 |
return extracted_files
|
581 |
|
582 |
-
def split_in_df(files):
|
583 |
processed_files = []
|
584 |
base_folders = []
|
585 |
print("Processing zip files...")
|
@@ -593,7 +601,7 @@ def split_in_df(files):
|
|
593 |
base_folders.append("")
|
594 |
print(f"BASE FOLDERS LIST : {base_folders}")
|
595 |
print("Finished processing zip files\nSplitting files into chunks...")
|
596 |
-
documents = split_doc_in_chunks(processed_files, base_folders)
|
597 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
598 |
print("Finished splitting")
|
599 |
df = pd.DataFrame()
|
|
|
68 |
|
69 |
def function_split_call(fi_input, dropdown, choice, chunk_size):
|
70 |
if choice == "Intelligent split":
|
71 |
+
return split_in_df(fi_input, nb_pages)
|
72 |
elif choice == "Non intelligent split":
|
73 |
return non_intelligent_split(fi_input, chunk_size)
|
74 |
else:
|
|
|
78 |
if len(dropdown) == 0 :
|
79 |
dropdown = ["introduction", "objective", "summary", "conclusion"]
|
80 |
if radio == "Intelligent split by keywords":
|
81 |
+
return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(label="First pages to keep (0 for all)", value=2, interactive=True, visible=True)
|
82 |
elif radio == "Non intelligent split":
|
83 |
return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
|
84 |
else:
|
|
|
464 |
|
465 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
466 |
|
467 |
+
def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
468 |
docs = []
|
469 |
for i, filename in enumerate(input_folder):
|
470 |
path = filename#os.path.join(input_folder, filename)
|
|
|
478 |
for raw_chunk in raw_chunks:
|
479 |
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
480 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
481 |
+
sb_chunks = group_chunks_by_section(raw_chunks)
|
482 |
+
if nb_pages > 0:
|
483 |
+
for sb_chunk in sb_chunks:
|
484 |
+
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
485 |
+
chunks.append(sb_chunk)
|
486 |
+
else:
|
487 |
+
break
|
488 |
+
else:
|
489 |
+
chunks = sb_chunks
|
490 |
print(f"Document splitted in {len(chunks)} chunks")
|
491 |
# for chunk in chunks:
|
492 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
|
587 |
zip_ref.extract(file_info.filename)
|
588 |
return extracted_files
|
589 |
|
590 |
+
def split_in_df(files, nb_pages):
|
591 |
processed_files = []
|
592 |
base_folders = []
|
593 |
print("Processing zip files...")
|
|
|
601 |
base_folders.append("")
|
602 |
print(f"BASE FOLDERS LIST : {base_folders}")
|
603 |
print("Finished processing zip files\nSplitting files into chunks...")
|
604 |
+
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
605 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
606 |
print("Finished splitting")
|
607 |
df = pd.DataFrame()
|