Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +26 -1
split_files_to_excel.py
CHANGED
@@ -20,6 +20,9 @@ from unstructured.partition.auto import partition
|
|
20 |
|
21 |
from transformers import AutoTokenizer
|
22 |
|
|
|
|
|
|
|
23 |
MODEL = "thenlper/gte-base"
|
24 |
CHUNK_SIZE = 1000
|
25 |
CHUNK_OVERLAP = 200
|
@@ -471,4 +474,26 @@ def build_index(docs, index, output_folder):
|
|
471 |
with tempfile.TemporaryDirectory() as temp_dir:
|
472 |
index.save_local(temp_dir)
|
473 |
for f in os.listdir(temp_dir):
|
474 |
-
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
from transformers import AutoTokenizer
|
22 |
|
23 |
+
import pandas as pd
|
24 |
+
|
25 |
+
|
26 |
MODEL = "thenlper/gte-base"
|
27 |
CHUNK_SIZE = 1000
|
28 |
CHUNK_OVERLAP = 200
|
|
|
474 |
with tempfile.TemporaryDirectory() as temp_dir:
|
475 |
index.save_local(temp_dir)
|
476 |
for f in os.listdir(temp_dir):
|
477 |
+
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
478 |
+
|
479 |
+
|
480 |
+
def split_in_df(files):
    """Split *files* into chunks and export them to an Excel workbook.

    Each chunk returned by ``split_doc_in_chunks(files)`` becomes one row:
    its ``page_content`` is stored in the ``Content`` column and every
    entry of its ``metadata`` mapping becomes an additional column.
    Chunks that lack a given metadata key get an empty cell in that column.

    Args:
        files: Passed through unchanged to ``split_doc_in_chunks``.

    Returns:
        The path of the workbook that was written (``"dataframe.xlsx"``,
        relative to the current working directory).
    """
    # NOTE(review): chunks are presumably LangChain ``Document`` objects;
    # only ``.page_content`` and a dict-like ``.metadata`` are relied on.
    documents = split_doc_in_chunks(files)

    # Collect plain dicts and build the frame once. ``DataFrame.append`` was
    # removed in pandas 2.0, and appending row by row re-copies the whole
    # frame on every iteration. Metadata is unpacked after ``Content`` so a
    # metadata key named "Content" still overrides it, as before.
    rows = [
        {'Content': document.page_content, **document.metadata}
        for document in documents
    ]
    df = pd.DataFrame(rows)

    df.to_excel("dataframe.xlsx", index=False)

    return "dataframe.xlsx"
|