Update split_files_to_excel.py

split_files_to_excel.py  CHANGED  (+139 -4)
@@ -25,7 +25,8 @@ from pypdf import PdfReader
 
 import pandas as pd
 
-
+import requests
+import json
 
 MODEL = "thenlper/gte-base"
 CHUNK_SIZE = 1000
@@ -530,12 +531,42 @@ def split_in_df(files):
 # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
 
 def split_by_keywords(files, key_words, words_limit=1000):
+    processed_files = []
     extracted_content = []
-
     tabLine = []
-    for file in files:
 
-
+    # For each file: keep PDFs as they are, extract zips, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract the content of each processed file
+    for file in processed_files:
+
+        try:
             file_name = file
             file = PdfReader(file)
             pdfNumberPages = len(file.pages)
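The routing pass added in this hunk is easy to sanity-check in isolation. Below is a minimal sketch of the same dedup-and-convert logic with stubbed helpers; fake_extract_zip and fake_transform_to_pdf are stand-ins invented here for the module's real extract_zip and transform_to_pdf, so the loop can run without any archives or a conversion API.

# Minimal sketch of the preprocessing pass above, with stubbed helpers.
def fake_extract_zip(path):
    # Pretend the archive held one PDF and one Word file.
    return [path[:-4] + "_a.pdf", path[:-4] + "_b.docx"]

def fake_transform_to_pdf(path):
    # Pretend conversion succeeded and return the target PDF name.
    return path[:path.rfind('.')] + ".pdf"

def preprocess(files):
    processed = []
    for f in files:
        # Skip any file whose stem matches an already-processed one.
        if any(f[:f.rfind('.')] == p[:p.rfind('.')] for p in processed):
            continue
        if f.endswith('.zip'):
            for doc in fake_extract_zip(f):
                if doc.endswith(('.doc', '.docx')):
                    processed.append(fake_transform_to_pdf(doc))
                elif doc.endswith('.pdf'):
                    processed.append(doc)
        elif f.endswith('.pdf'):
            processed.append(f)
        elif f.endswith(('.doc', '.docx')):
            processed.append(fake_transform_to_pdf(f))
    return processed

print(preprocess(["report.pdf", "report.docx", "bundle.zip"]))
# ['report.pdf', 'bundle_a.pdf', 'bundle_b.pdf']

The stem comparison f[:f.rfind('.')] is what makes report.docx count as a duplicate of an already-seen report.pdf.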
@@ -629,6 +660,9 @@ def split_by_keywords(files, key_words, words_limit=1000):
             tabLine.append([file_name, selectedText, key])
             print(f"Selected line in keywords is: {line}")
 
+        except Exception as ex:
+            print(f"Error occurred while extracting content: {ex}")
+
     for r in tabLine:
         text_joined = ''.join(r[1])
         text_joined = r[2] + " : \n " + text_joined
@@ -654,3 +688,104 @@ def split_by_keywords(files, key_words, words_limit=1000):
 
     return "dataframe_keywords.xlsx"
 
+# -------------------------------------------------------------------------------- NON INTELLIGENT SPLIT
+
+def transform_to_pdf(doc):
+    instructions = {'parts': [{'file': 'document'}]}
+
+    response = requests.request(
+        'POST',
+        'https://api.pspdfkit.com/build',
+        headers={'Authorization': 'Bearer pdf_live_nS6tyylSW57PNw9TIEKKL3Tt16NmLCazlQWQ9D33t0Q'},
+        files={'document': open(doc, 'rb')},
+        data={'instructions': json.dumps(instructions)},
+        stream=True
+    )
+
+    pdf_name = doc[:doc.find(".doc")] + ".pdf"
+
+    if response.ok:
+        with open(pdf_name, 'wb') as fd:
+            for chunk in response.iter_content(chunk_size=8096):
+                fd.write(chunk)
+        return pdf_name
+
+    else:
+        print(response.text)
+        return None
+
+
+def non_intelligent_split(files, chunk_size=1000):
+    extracted_content = []
+    processed_files = []
+
+    # For each file: keep PDFs as they are, extract zips, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract the content of each processed file
+    try:
+        for f in processed_files:
+            print(f"my filename is: {f}")
+            file = PdfReader(f)
+            pdfNumberPages = len(file.pages)
+            selectedText = ""
+
+            for pdfPage in range(0, pdfNumberPages):
+                load_page = file.get_page(pdfPage)
+                text = load_page.extract_text()
+                lines = text.split("\n")
+                sizeOfLines = 0
+
+                for index, line in enumerate(lines):
+                    sizeOfLines += len(line)
+                    selectedText += " " + line
+                    if sizeOfLines >= chunk_size:
+                        textContent = f"Page {str(pdfPage)} : {selectedText}"
+                        extracted_content.append([f, textContent])
+                        sizeOfLines = 0
+                        selectedText = ""
+
+            textContent = f"Page {str(pdfNumberPages)} : {selectedText}"
+            extracted_content.append([f, textContent])
+    except Exception as ex:
+        print(f"Error occurred while extracting content from processed files: {ex}")
+
+    df = pd.DataFrame()
+    for content in extracted_content:
+        filename = content[0]
+        text = content[1]
+
+        doc_data = {'Filename': filename[filename.rfind("/")+1:], 'Content': text}
+
+        df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)
+
+    df.to_excel("dataframe_keywords.xlsx", index=False)
+
+    return "dataframe_keywords.xlsx"
|