Standard_Intelligence_Dev

Sleeping

App Files Files Community

heymenn committed on Apr 17, 2024

Commit

6216165

verified ·

1 Parent(s): 01e9f69

Update split_files_to_excel.py

Browse files

Files changed (1) hide show

split_files_to_excel.py +111 -1

split_files_to_excel.py CHANGED Viewed

@@ -516,4 +516,114 @@ def split_in_df(files):
     df.to_excel("dataframe.xlsx", index=False)
-    return "dataframe.xlsx"

     df.to_excel("dataframe.xlsx", index=False)
+    return "dataframe.xlsx"
+# -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
def split_by_keywords(files, key_words, words_limit=1000):
    """Extract the text surrounding every keyword hit in a set of PDF files.

    Each page of each file is scanned line by line. Whenever a line contains
    one of the keywords (case-insensitive substring match), the lines before
    it (keyword line included) and the lines after it are gathered, crossing
    page boundaries in both directions, until each side holds more than
    ``words_limit`` words or the edge of the document is reached.

    Args:
        files: Iterable of PDF sources. Each item is either anything
            ``PdfReader`` accepts, or an already-open reader-like object
            exposing ``pages`` (a sequence of objects with ``extract_text()``).
        key_words: Iterable of keyword strings. Empty keywords are ignored
            because they would match every single line.
        words_limit: Soft cap, in words, applied separately to the text
            before and the text after the keyword. The line that crosses the
            cap is kept whole.

    Returns:
        A list of strings, one per (matching line, keyword) pair in order of
        discovery, formatted as the keyword, `` : ``, a newline, a space and
        then the surrounding text.
    """
    extracted_content = []
    # Lowercase each keyword once; the original spelling is kept for output.
    keywords = [(key, key.lower()) for key in key_words if key]

    for file in files:
        reader = file if hasattr(file, "pages") else PdfReader(file)
        page_count = len(reader.pages)
        get_lines = _make_page_line_loader(reader)

        for page_index in range(page_count):
            for line_index, line in enumerate(get_lines(page_index)):
                lowered = line.lower()
                matched = [key for key, needle in keywords if needle in lowered]
                if not matched:
                    continue

                # The context depends only on the position, not on the
                # keyword, so compute it once per matching line.
                before = _collect_context(
                    get_lines, page_count, page_index, line_index, -1, words_limit
                )
                after = _collect_context(
                    get_lines, page_count, page_index, line_index + 1, 1, words_limit
                )
                # Lines walked backwards come out in reverse reading order.
                before_text = " ".join(reversed(before))
                # Every following line is prefixed by a space, which keeps
                # the historical output layout unchanged.
                after_text = "".join(f" {following}" for following in after)
                selected_text = before_text + " " + after_text

                for key in matched:
                    extracted_content.append(f'{key} : \n {selected_text}')

    return extracted_content


def _make_page_line_loader(reader):
    """Return a function mapping a page index to that page's text lines.

    Results are cached so a page is only extracted once per document, even
    when several keyword hits need it as context.
    """
    cache = {}

    def get_lines(page_index):
        if page_index not in cache:
            text = reader.pages[page_index].extract_text() or ""
            cache[page_index] = text.split("\n")
        return cache[page_index]

    return get_lines


def _collect_context(get_lines, page_count, page_index, line_index, step, words_limit):
    """Collect lines walking from a position in direction ``step`` (+1 or -1).

    The walk crosses page boundaries and stops once the collected lines hold
    more than ``words_limit`` words, when the document edge is reached, or
    when a neighbouring page cannot be loaded. Lines are returned in walk
    order (i.e. reversed reading order when ``step`` is -1).
    """
    collected = []
    word_count = 0
    lines = get_lines(page_index)

    while word_count <= words_limit:
        # Move to the neighbouring page whenever the index leaves the
        # current one; loop in case a page yields no lines at all.
        while not 0 <= line_index < len(lines):
            page_index += step
            if not 0 <= page_index < page_count:
                return collected
            try:
                lines = get_lines(page_index)
            except Exception as e:
                # Best effort: keep what was gathered so far.
                direction = "previous" if step < 0 else "next"
                print(f"Loading {direction} PDF page failed: {e}")
                return collected
            line_index = 0 if step > 0 else len(lines) - 1

        collected.append(lines[line_index])
        word_count += len(lines[line_index].split())
        line_index += step

    return collected