Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +16 -6
split_files_to_excel.py
CHANGED
@@ -493,9 +493,9 @@ def split_doc_in_chunks(input_folder):
|
|
493 |
return docs
|
494 |
|
495 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
496 |
-
def resplit_by_end_of_sentence(docs):
|
497 |
print("❌❌\nResplitting docs by end of sentence\n❌❌")
|
498 |
-
resized_docs = split_chunks_by_tokens_period(docs,
|
499 |
try:
|
500 |
# add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
|
501 |
cur_source = ""
|
@@ -553,11 +553,12 @@ def split_in_df(files):
|
|
553 |
processed_files.append(file_path)
|
554 |
print("Finished processing zip files\Splitting files into chunks...")
|
555 |
documents = split_doc_in_chunks(processed_files)
|
|
|
556 |
print("Finished splitting")
|
557 |
df = pd.DataFrame()
|
558 |
-
for
|
559 |
-
filename =
|
560 |
-
content =
|
561 |
|
562 |
# metadata = document.metadata
|
563 |
# metadata_keys = list(metadata.keys())
|
@@ -836,4 +837,13 @@ def non_intelligent_split(files, chunk_size = 1000):
|
|
836 |
|
837 |
df.to_excel("dataframe_keywords.xlsx", index=False)
|
838 |
|
839 |
-
return "dataframe_keywords.xlsx"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
493 |
return docs
|
494 |
|
495 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
496 |
+
def resplit_by_end_of_sentence(docs, max_len, overlap, min_len):
|
497 |
print("❌❌\nResplitting docs by end of sentence\n❌❌")
|
498 |
+
resized_docs = split_chunks_by_tokens_period(docs, max_len, overlap, min_len)
|
499 |
try:
|
500 |
# add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
|
501 |
cur_source = ""
|
|
|
553 |
processed_files.append(file_path)
|
554 |
print("Finished processing zip files\Splitting files into chunks...")
|
555 |
documents = split_doc_in_chunks(processed_files)
|
556 |
+
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
557 |
print("Finished splitting")
|
558 |
df = pd.DataFrame()
|
559 |
+
for re_doc in re_docs:
|
560 |
+
filename = re_doc.metadata['filename']
|
561 |
+
content = re_doc.page_content
|
562 |
|
563 |
# metadata = document.metadata
|
564 |
# metadata_keys = list(metadata.keys())
|
|
|
837 |
|
838 |
df.to_excel("dataframe_keywords.xlsx", index=False)
|
839 |
|
840 |
+
return "dataframe_keywords.xlsx"
|
841 |
+
|
842 |
+
|
843 |
+
def function_split_call(fi_input, dropdown, choice, chunk_size):
    """Dispatch the input files to the splitter selected by ``choice``.

    Args:
        fi_input: Input files; forwarded unchanged to whichever splitter runs.
        dropdown: Forwarded only to ``split_by_keywords``.
        choice: Name of the splitting mode. ``"Intelligent split"`` and
            ``"Non intelligent split"`` are matched exactly; every other
            value selects the keyword-based splitter.
        chunk_size: Forwarded only to ``non_intelligent_split``.

    Returns:
        Whatever the selected splitter returns.
    """
    if choice == "Intelligent split":
        return split_in_df(fi_input)

    if choice == "Non intelligent split":
        return non_intelligent_split(fi_input, chunk_size)

    # No explicit check for the third mode: any unrecognised choice ends up
    # here, so the keyword splitter acts as the default.
    return split_by_keywords(fi_input, dropdown)
|