Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +18 -1
split_files_to_excel.py
CHANGED
@@ -477,8 +477,25 @@ def build_index(docs, index, output_folder):
|
|
477 |
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
478 |
|
479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
def split_in_df(files):
|
481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
482 |
df = pd.DataFrame()
|
483 |
for document in documents:
|
484 |
filename = document.metadata['filename']
|
|
|
477 |
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
478 |
|
479 |
|
480 |
+
def extract_zip(zip_path):
    """Extract every file in a zip archive into the current working directory.

    Args:
        zip_path: Path to the ``.zip`` archive to unpack.

    Returns:
        A list of paths, relative to the current working directory, of the
        regular files that were extracted, in archive order. Directory
        entries are extracted but not included, since they are not files a
        caller can process.
    """
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            # ``extract`` returns the path it actually created, which can
            # differ from the raw member name when zipfile sanitises it
            # (leading slashes or ".." components are stripped).
            written_path = zip_ref.extract(file_info)
            if file_info.is_dir():
                # A directory entry is not a document; skip it so callers
                # only receive paths they can open as files.
                continue
            extracted_files.append(os.path.relpath(written_path))
    return extracted_files
|
487 |
+
|
488 |
def split_in_df(files):
|
489 |
+
print("Processing zip files...")
|
490 |
+
for file_path in files:
|
491 |
+
if file_path.endswith('.zip'):
|
492 |
+
extracted_files = extract_zip(file_path)
|
493 |
+
processed_files.extend(extracted_files)
|
494 |
+
else:
|
495 |
+
processed_files.append(file_path)
|
496 |
+
print("Finished processing zip files\Splitting files into chunks...")
|
497 |
+
documents = split_doc_in_chunks(processed_files)
|
498 |
+
print("Finished splitting")
|
499 |
df = pd.DataFrame()
|
500 |
for document in documents:
|
501 |
filename = document.metadata['filename']
|