Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Apr 25, 2024

Commit

37d982e

1 Parent(s): 39788e9

Added -y to poppler-utils installation in Dockerfile. Added support for image files in image-based redaction.

Browse files

Files changed (3) hide show

Dockerfile +2 -2
app.py +11 -6
tools/file_conversion.py +22 -6

Dockerfile CHANGED Viewed

@@ -1,11 +1,11 @@
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
-# Install system dependencies
 RUN apt-get update \
     && apt-get install -y \
         tesseract-ocr \
         libtesseract-dev \
-        poppler-utils \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+# Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
     && apt-get install -y \
         tesseract-ocr \
         libtesseract-dev \
+        poppler-utils -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from tools.file_redaction import redact_text_pdf, redact_image_pdf
 from tools.helper_functions import get_file_path_end
-from tools.file_conversion import process_file, is_pdf
 from tools.aws_functions import load_data_from_aws
 from typing import List
@@ -18,9 +18,6 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
     tic = time.perf_counter()
-    if is_pdf(file_path) == False:
-        return "Please upload a PDF file.", None
     out_message = ''
     out_file_paths = []
@@ -34,7 +31,10 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
         return out_message, out_file_paths
     if in_redact_method == "Image analysis":
-        # Analyse image-based pdf
         pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
         out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
         pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
@@ -43,6 +43,9 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
         out_message = "Image-based PDF successfully redacted and saved to file."
     elif in_redact_method == "Text analysis":
         # Analyse text-based pdf
         pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
         out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
@@ -87,9 +90,11 @@ with block:
     gr.Markdown(
     """
     # Document redaction
-    Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
     WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
     """)
     with gr.Tab("Redact document"):

 from tools.file_redaction import redact_text_pdf, redact_image_pdf
 from tools.helper_functions import get_file_path_end
+from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_functions import load_data_from_aws
 from typing import List
     tic = time.perf_counter()
     out_message = ''
     out_file_paths = []
         return out_message, out_file_paths
     if in_redact_method == "Image analysis":
+        # Analyse and redact image-based pdf or image
+        if is_pdf_or_image(file_path) == False:
+            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
         pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
         out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
         pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
         out_message = "Image-based PDF successfully redacted and saved to file."
     elif in_redact_method == "Text analysis":
+        if is_pdf(file_path) == False:
+            return "Please upload a PDF file for text analysis.", None
         # Analyse text-based pdf
         pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
         out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
     gr.Markdown(
     """
     # Document redaction
+    Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
     WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
+    Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
     """)
     with gr.Tab("Redact document"):

tools/file_conversion.py CHANGED Viewed

@@ -1,6 +1,23 @@
 from pdf2image import convert_from_path
 import os
 def is_pdf(filename):
     """
     Check if a file name is a PDF.
@@ -16,14 +33,13 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path):
-    image_paths = []
     # Convert PDF to a list of images
     images = convert_from_path(pdf_path)
-    # Save each image as a separate file
     # for i, image in enumerate(images):
     #     page_path = f"processing/page_{i+1}.png"
     #     image.save(page_path, "PNG")
@@ -39,10 +55,10 @@ def process_file(file_path):
     file_extension = os.path.splitext(file_path)[1].lower()
     # Check if the file is an image type
-    if file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
         print(f"{file_path} is an image file.")
         # Perform image processing here
-        out_path = [file_path]
     # Check if the file is a PDF
     elif file_extension == '.pdf':

 from pdf2image import convert_from_path
+from PIL import Image
 import os
def is_pdf_or_image(filename):
    """
    Check if a file name is a PDF or a supported image file.

    Only the end of the name is examined (case-insensitively); the file
    contents are never opened or inspected.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or ".png", False otherwise.
    """
    # ".jpeg" is accepted alongside ".jpg" so this check agrees with the
    # image extensions that process_file treats as image input.
    accepted_suffixes = (".pdf", ".jpg", ".jpeg", ".png")

    # str.endswith takes a tuple, so one call covers every accepted suffix.
    return filename.lower().endswith(accepted_suffixes)
 def is_pdf(filename):
     """
     Check if a file name is a PDF.
 # %%
 ## Convert pdf to image if necessary
+def convert_pdf_to_images(pdf_path):
     # Convert PDF to a list of images
     images = convert_from_path(pdf_path)
+    # Save each image as a separate file - deprecated
+    #image_paths = []
     # for i, image in enumerate(images):
     #     page_path = f"processing/page_{i+1}.png"
     #     image.save(page_path, "PNG")
     file_extension = os.path.splitext(file_path)[1].lower()
     # Check if the file is an image type
+    if file_extension in ['.jpg', '.jpeg', '.png']:
         print(f"{file_path} is an image file.")
         # Perform image processing here
+        out_path = [Image.open(file_path)]
     # Check if the file is a PDF
     elif file_extension == '.pdf':