seanpedrickcase
committed on
Commit
•
37d982e
1
Parent(s):
39788e9
Added -y to poppler-utils installation in Dockerfile. Added support for image files in image-based redaction.
Browse files- Dockerfile +2 -2
- app.py +11 -6
- tools/file_conversion.py +22 -6
Dockerfile
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
2 |
|
3 |
-
# Install system dependencies
|
4 |
RUN apt-get update \
|
5 |
&& apt-get install -y \
|
6 |
tesseract-ocr \
|
7 |
libtesseract-dev \
|
8 |
-
poppler-utils \
|
9 |
&& apt-get clean \
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
|
|
1 |
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
2 |
|
3 |
+
# Install system dependencies. Need to specify -y for poppler to get it to install
|
4 |
RUN apt-get update \
|
5 |
&& apt-get install -y \
|
6 |
tesseract-ocr \
|
7 |
libtesseract-dev \
|
8 |
+
poppler-utils -y \
|
9 |
&& apt-get clean \
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from tools.file_redaction import redact_text_pdf, redact_image_pdf
|
2 |
from tools.helper_functions import get_file_path_end
|
3 |
-
from tools.file_conversion import process_file, is_pdf
|
4 |
from tools.aws_functions import load_data_from_aws
|
5 |
|
6 |
from typing import List
|
@@ -18,9 +18,6 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
|
|
18 |
|
19 |
tic = time.perf_counter()
|
20 |
|
21 |
-
if is_pdf(file_path) == False:
|
22 |
-
return "Please upload a PDF file.", None
|
23 |
-
|
24 |
out_message = ''
|
25 |
out_file_paths = []
|
26 |
|
@@ -34,7 +31,10 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
|
|
34 |
return out_message, out_file_paths
|
35 |
|
36 |
if in_redact_method == "Image analysis":
|
37 |
-
# Analyse image-based pdf
|
|
|
|
|
|
|
38 |
pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
39 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
40 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
@@ -43,6 +43,9 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
|
|
43 |
out_message = "Image-based PDF successfully redacted and saved to file."
|
44 |
|
45 |
elif in_redact_method == "Text analysis":
|
|
|
|
|
|
|
46 |
# Analyse text-based pdf
|
47 |
pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
48 |
out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
|
@@ -87,9 +90,11 @@ with block:
|
|
87 |
gr.Markdown(
|
88 |
"""
|
89 |
# Document redaction
|
90 |
-
Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
|
91 |
|
92 |
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
|
|
|
|
93 |
""")
|
94 |
|
95 |
with gr.Tab("Redact document"):
|
|
|
1 |
from tools.file_redaction import redact_text_pdf, redact_image_pdf
|
2 |
from tools.helper_functions import get_file_path_end
|
3 |
+
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
4 |
from tools.aws_functions import load_data_from_aws
|
5 |
|
6 |
from typing import List
|
|
|
18 |
|
19 |
tic = time.perf_counter()
|
20 |
|
|
|
|
|
|
|
21 |
out_message = ''
|
22 |
out_file_paths = []
|
23 |
|
|
|
31 |
return out_message, out_file_paths
|
32 |
|
33 |
if in_redact_method == "Image analysis":
|
34 |
+
# Analyse and redact image-based pdf or image
|
35 |
+
if is_pdf_or_image(file_path) == False:
|
36 |
+
return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
37 |
+
|
38 |
pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
39 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
40 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
|
|
43 |
out_message = "Image-based PDF successfully redacted and saved to file."
|
44 |
|
45 |
elif in_redact_method == "Text analysis":
|
46 |
+
if is_pdf(file_path) == False:
|
47 |
+
return "Please upload a PDF file for text analysis.", None
|
48 |
+
|
49 |
# Analyse text-based pdf
|
50 |
pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
51 |
out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
|
|
|
90 |
gr.Markdown(
|
91 |
"""
|
92 |
# Document redaction
|
93 |
+
Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
|
94 |
|
95 |
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
96 |
+
|
97 |
+
Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
|
98 |
""")
|
99 |
|
100 |
with gr.Tab("Redact document"):
|
tools/file_conversion.py
CHANGED
@@ -1,6 +1,23 @@
|
|
1 |
from pdf2image import convert_from_path
|
|
|
2 |
import os
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def is_pdf(filename):
|
5 |
"""
|
6 |
Check if a file name is a PDF.
|
@@ -16,14 +33,13 @@ def is_pdf(filename):
|
|
16 |
# %%
|
17 |
## Convert pdf to image if necessary
|
18 |
|
19 |
-
def convert_pdf_to_images(pdf_path):
|
20 |
-
|
21 |
-
image_paths = []
|
22 |
|
23 |
# Convert PDF to a list of images
|
24 |
images = convert_from_path(pdf_path)
|
25 |
|
26 |
-
# Save each image as a separate file
|
|
|
27 |
# for i, image in enumerate(images):
|
28 |
# page_path = f"processing/page_{i+1}.png"
|
29 |
# image.save(page_path, "PNG")
|
@@ -39,10 +55,10 @@ def process_file(file_path):
|
|
39 |
file_extension = os.path.splitext(file_path)[1].lower()
|
40 |
|
41 |
# Check if the file is an image type
|
42 |
-
if file_extension in ['.jpg', '.jpeg', '.png'
|
43 |
print(f"{file_path} is an image file.")
|
44 |
# Perform image processing here
|
45 |
-
out_path = [file_path]
|
46 |
|
47 |
# Check if the file is a PDF
|
48 |
elif file_extension == '.pdf':
|
|
|
1 |
from pdf2image import convert_from_path
|
2 |
+
from PIL import Image
|
3 |
import os
|
4 |
|
5 |
+
def is_pdf_or_image(filename):
    """
    Check if a file name is a PDF or a supported image file.

    The comparison is case-insensitive and looks only at the end of the
    name; the file contents are not inspected. The accepted image
    extensions mirror the ones handled by process_file ('.jpg', '.jpeg',
    '.png'), so a '.jpeg' upload is no longer rejected here before it can
    reach the image-processing branch.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or
        ".png", False otherwise.
    """
    # str.endswith accepts a tuple of suffixes, so one call covers all types.
    return filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png"))
|
20 |
+
|
21 |
def is_pdf(filename):
|
22 |
"""
|
23 |
Check if a file name is a PDF.
|
|
|
33 |
# %%
|
34 |
## Convert pdf to image if necessary
|
35 |
|
36 |
+
def convert_pdf_to_images(pdf_path):
|
|
|
|
|
37 |
|
38 |
# Convert PDF to a list of images
|
39 |
images = convert_from_path(pdf_path)
|
40 |
|
41 |
+
# Save each image as a separate file - deprecated
|
42 |
+
#image_paths = []
|
43 |
# for i, image in enumerate(images):
|
44 |
# page_path = f"processing/page_{i+1}.png"
|
45 |
# image.save(page_path, "PNG")
|
|
|
55 |
file_extension = os.path.splitext(file_path)[1].lower()
|
56 |
|
57 |
# Check if the file is an image type
|
58 |
+
if file_extension in ['.jpg', '.jpeg', '.png']:
|
59 |
print(f"{file_path} is an image file.")
|
60 |
# Perform image processing here
|
61 |
+
out_path = [Image.open(file_path)]
|
62 |
|
63 |
# Check if the file is a PDF
|
64 |
elif file_extension == '.pdf':
|