seanpedrickcase committed on
Commit
37d982e
1 Parent(s): 39788e9

Added -y to poppler-utils installation in Dockerfile. Added support for image files in image-based redaction.

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -2
  2. app.py +11 -6
  3. tools/file_conversion.py +22 -6
Dockerfile CHANGED
@@ -1,11 +1,11 @@
1
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
 
3
- # Install system dependencies
4
  RUN apt-get update \
5
  && apt-get install -y \
6
  tesseract-ocr \
7
  libtesseract-dev \
8
- poppler-utils \
9
  && apt-get clean \
10
  && rm -rf /var/lib/apt/lists/*
11
 
 
1
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
 
3
+ # Install system dependencies. Need to specify -y for poppler to get it to install
4
  RUN apt-get update \
5
  && apt-get install -y \
6
  tesseract-ocr \
7
  libtesseract-dev \
8
+ poppler-utils -y \
9
  && apt-get clean \
10
  && rm -rf /var/lib/apt/lists/*
11
 
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from tools.file_redaction import redact_text_pdf, redact_image_pdf
2
  from tools.helper_functions import get_file_path_end
3
- from tools.file_conversion import process_file, is_pdf
4
  from tools.aws_functions import load_data_from_aws
5
 
6
  from typing import List
@@ -18,9 +18,6 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
18
 
19
  tic = time.perf_counter()
20
 
21
- if is_pdf(file_path) == False:
22
- return "Please upload a PDF file.", None
23
-
24
  out_message = ''
25
  out_file_paths = []
26
 
@@ -34,7 +31,10 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
34
  return out_message, out_file_paths
35
 
36
  if in_redact_method == "Image analysis":
37
- # Analyse image-based pdf
 
 
 
38
  pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
39
  out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
40
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
@@ -43,6 +43,9 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
43
  out_message = "Image-based PDF successfully redacted and saved to file."
44
 
45
  elif in_redact_method == "Text analysis":
 
 
 
46
  # Analyse text-based pdf
47
  pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
48
  out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
@@ -87,9 +90,11 @@ with block:
87
  gr.Markdown(
88
  """
89
  # Document redaction
90
- Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
91
 
92
  WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
 
93
  """)
94
 
95
  with gr.Tab("Redact document"):
 
1
  from tools.file_redaction import redact_text_pdf, redact_image_pdf
2
  from tools.helper_functions import get_file_path_end
3
+ from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
4
  from tools.aws_functions import load_data_from_aws
5
 
6
  from typing import List
 
18
 
19
  tic = time.perf_counter()
20
 
 
 
 
21
  out_message = ''
22
  out_file_paths = []
23
 
 
31
  return out_message, out_file_paths
32
 
33
  if in_redact_method == "Image analysis":
34
+ # Analyse and redact image-based pdf or image
35
+ if is_pdf_or_image(file_path) == False:
36
+ return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
37
+
38
  pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
39
  out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
40
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
 
43
  out_message = "Image-based PDF successfully redacted and saved to file."
44
 
45
  elif in_redact_method == "Text analysis":
46
+ if is_pdf(file_path) == False:
47
+ return "Please upload a PDF file for text analysis.", None
48
+
49
  # Analyse text-based pdf
50
  pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
51
  out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
 
90
  gr.Markdown(
91
  """
92
  # Document redaction
93
+ Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
94
 
95
  WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
96
+
97
+ Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
98
  """)
99
 
100
  with gr.Tab("Redact document"):
tools/file_conversion.py CHANGED
@@ -1,6 +1,23 @@
1
  from pdf2image import convert_from_path
 
2
  import os
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def is_pdf(filename):
5
  """
6
  Check if a file name is a PDF.
@@ -16,14 +33,13 @@ def is_pdf(filename):
16
  # %%
17
  ## Convert pdf to image if necessary
18
 
19
- def convert_pdf_to_images(pdf_path):
20
-
21
- image_paths = []
22
 
23
  # Convert PDF to a list of images
24
  images = convert_from_path(pdf_path)
25
 
26
- # Save each image as a separate file
 
27
  # for i, image in enumerate(images):
28
  # page_path = f"processing/page_{i+1}.png"
29
  # image.save(page_path, "PNG")
@@ -39,10 +55,10 @@ def process_file(file_path):
39
  file_extension = os.path.splitext(file_path)[1].lower()
40
 
41
  # Check if the file is an image type
42
- if file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
43
  print(f"{file_path} is an image file.")
44
  # Perform image processing here
45
- out_path = [file_path]
46
 
47
  # Check if the file is a PDF
48
  elif file_extension == '.pdf':
 
1
  from pdf2image import convert_from_path
2
+ from PIL import Image
3
  import os
4
 
5
+ def is_pdf_or_image(filename):
6
+ """
7
+ Check if a file name is a PDF or an image file.
8
+
9
+ Args:
10
+ filename (str): The name of the file.
11
+
12
+ Returns:
13
+ bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
14
+ """
15
+ if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".png"):
16
+ output = True
17
+ else:
18
+ output = False
19
+ return output
20
+
21
  def is_pdf(filename):
22
  """
23
  Check if a file name is a PDF.
 
33
  # %%
34
  ## Convert pdf to image if necessary
35
 
36
+ def convert_pdf_to_images(pdf_path):
 
 
37
 
38
  # Convert PDF to a list of images
39
  images = convert_from_path(pdf_path)
40
 
41
+ # Save each image as a separate file - deprecated
42
+ #image_paths = []
43
  # for i, image in enumerate(images):
44
  # page_path = f"processing/page_{i+1}.png"
45
  # image.save(page_path, "PNG")
 
55
  file_extension = os.path.splitext(file_path)[1].lower()
56
 
57
  # Check if the file is an image type
58
+ if file_extension in ['.jpg', '.jpeg', '.png']:
59
  print(f"{file_path} is an image file.")
60
  # Perform image processing here
61
+ out_path = [Image.open(file_path)]
62
 
63
  # Check if the file is a PDF
64
  elif file_extension == '.pdf':