Update split_files_to_excel.py

split_files_to_excel.py  CHANGED  (+139 -4)
@@ -25,7 +25,8 @@ from pypdf import PdfReader
 
 import pandas as pd
 
-
+import requests
+import json
 
 MODEL = "thenlper/gte-base"
 CHUNK_SIZE = 1000
@@ -530,12 +531,42 @@ def split_in_df(files):
 # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
 
 def split_by_keywords(files, key_words, words_limit=1000):
+    processed_files = []
     extracted_content = []
-
     tabLine = []
-    for file in files:
 
-
+    # For each file: keep PDFs as they are, extract zips, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract the content of each processed file
+    for file in processed_files:
+
+        try:
             file_name = file
             file = PdfReader(file)
             pdfNumberPages = len(file.pages)
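The routing pass added in this hunk is easy to sanity-check in isolation. Below is a minimal sketch of the same dedup-and-convert logic with stubbed helpers; fake_extract_zip and fake_transform_to_pdf are stand-ins invented here for the module's real extract_zip and transform_to_pdf, so the loop can run without any archives or a conversion API.

# Minimal sketch of the preprocessing pass above, with stubbed helpers.
def fake_extract_zip(path):
    # Pretend the archive held one PDF and one Word file.
    return [path[:-4] + "_a.pdf", path[:-4] + "_b.docx"]

def fake_transform_to_pdf(path):
    # Pretend conversion succeeded and return the target PDF name.
    return path[:path.rfind('.')] + ".pdf"

def preprocess(files):
    processed = []
    for f in files:
        # Skip any file whose stem matches an already-processed one.
        if any(f[:f.rfind('.')] == p[:p.rfind('.')] for p in processed):
            continue
        if f.endswith('.zip'):
            for doc in fake_extract_zip(f):
                if doc.endswith(('.doc', '.docx')):
                    processed.append(fake_transform_to_pdf(doc))
                elif doc.endswith('.pdf'):
                    processed.append(doc)
        elif f.endswith('.pdf'):
            processed.append(f)
        elif f.endswith(('.doc', '.docx')):
            processed.append(fake_transform_to_pdf(f))
    return processed

print(preprocess(["report.pdf", "report.docx", "bundle.zip"]))
# ['report.pdf', 'bundle_a.pdf', 'bundle_b.pdf']

The stem comparison f[:f.rfind('.')] is what makes report.docx count as a duplicate of an already-seen report.pdf.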
@@ -629,6 +660,9 @@ def split_by_keywords(files, key_words, words_limit=1000):
             tabLine.append([file_name, selectedText, key])
             print(f"Selected line in keywords is: {line}")
 
+        except Exception as ex:
+            print(f"Error occurred while extracting content: {ex}")
+
     for r in tabLine:
         text_joined = ''.join(r[1])
         text_joined = r[2] + " : \n " + text_joined
@@ -654,3 +688,104 @@ def split_by_keywords(files, key_words, words_limit=1000):
 
     return "dataframe_keywords.xlsx"
 
+# -------------------------------------------------------------------------------- NON INTELLIGENT SPLIT
+
+def transform_to_pdf(doc):
+    instructions = {'parts': [{'file': 'document'}]}
+
+    response = requests.request(
+        'POST',
+        'https://api.pspdfkit.com/build',
+        headers={'Authorization': 'Bearer pdf_live_nS6tyylSW57PNw9TIEKKL3Tt16NmLCazlQWQ9D33t0Q'},
+        files={'document': open(doc, 'rb')},
+        data={'instructions': json.dumps(instructions)},
+        stream=True
+    )
+
+    pdf_name = doc[:doc.find(".doc")] + ".pdf"
+
+    if response.ok:
+        with open(pdf_name, 'wb') as fd:
+            for chunk in response.iter_content(chunk_size=8096):
+                fd.write(chunk)
+        return pdf_name
+
+    else:
+        print(response.text)
+        return None
+
+
+def non_intelligent_split(files, chunk_size=1000):
+    extracted_content = []
+    processed_files = []
+
+    # For each file: keep PDFs as they are, extract zips, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract the content of each processed file
+    try:
+        for f in processed_files:
+            print(f"my filename is: {f}")
+            file = PdfReader(f)
+            pdfNumberPages = len(file.pages)
+            selectedText = ""
+
+            for pdfPage in range(0, pdfNumberPages):
+                load_page = file.get_page(pdfPage)
+                text = load_page.extract_text()
+                lines = text.split("\n")
+                sizeOfLines = 0
+
+                for index, line in enumerate(lines):
+                    sizeOfLines += len(line)
+                    selectedText += " " + line
+                    if sizeOfLines >= chunk_size:
+                        textContent = f"Page {str(pdfPage)} : {selectedText}"
+                        extracted_content.append([f, textContent])
+                        sizeOfLines = 0
+                        selectedText = ""
+
+            textContent = f"Page {str(pdfNumberPages)} : {selectedText}"
+            extracted_content.append([f, textContent])
+    except Exception as ex:
+        print(f"Error occurred while extracting content from processed files: {ex}")
+
+    df = pd.DataFrame()
+    for content in extracted_content:
+        filename = content[0]
+        text = content[1]
+
+        doc_data = {'Filename': filename[filename.rfind("/")+1:], 'Content': text}
+
+        df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)
+
+    df.to_excel("dataframe_keywords.xlsx", index=False)
+
+    return "dataframe_keywords.xlsx"
|