Commit
·
f0f9378
1
Parent(s):
aaf0acb
Added support for AWS Comprehend for PII identification. OCR and detection results now written to main output
Browse files- app.py +48 -16
- tools/aws_functions.py +24 -12
- tools/custom_image_analyser_engine.py +31 -3
- tools/data_anonymise.py +21 -0
- tools/file_redaction.py +178 -123
- tools/presidio_analyzer_custom.py +22 -1
app.py
CHANGED
@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
-
from tools.aws_functions import upload_file_to_s3
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
@@ -25,8 +25,14 @@ add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
|
|
25 |
|
26 |
ensure_output_folder_exists()
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
|
|
30 |
language = 'en'
|
31 |
|
32 |
host_name = socket.gethostname()
|
@@ -35,6 +41,21 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
|
35 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
36 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# Create the gradio interface
|
39 |
app = gr.Blocks(theme = gr.themes.Base())
|
40 |
|
@@ -109,7 +130,9 @@ with app:
|
|
109 |
with gr.Tab("PDFs/images"):
|
110 |
with gr.Accordion("Redact document", open = True):
|
111 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
112 |
-
in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value =
|
|
|
|
|
113 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
114 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
115 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
@@ -201,21 +224,30 @@ with app:
|
|
201 |
with gr.Row():
|
202 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
203 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
|
209 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
210 |
-
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
|
211 |
with gr.Row():
|
212 |
-
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
|
213 |
-
# Upload 'Allow list' for terms not to be redacted
|
214 |
-
with gr.Row():
|
215 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
216 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
217 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
# If a custom allow list is uploaded
|
221 |
in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
@@ -227,12 +259,12 @@ with app:
|
|
227 |
|
228 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
229 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
230 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
|
231 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
232 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
233 |
|
234 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
235 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
|
236 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
237 |
|
238 |
# If a file has been completed, the function will continue onto the next document
|
@@ -318,9 +350,9 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
|
318 |
|
319 |
if __name__ == "__main__":
|
320 |
if os.environ['COGNITO_AUTH'] == "1":
|
321 |
-
app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
|
322 |
else:
|
323 |
-
app.queue().launch(show_error=True, inbrowser=True, max_file_size='100mb')
|
324 |
|
325 |
|
326 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
+
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
|
|
25 |
|
26 |
ensure_output_folder_exists()
|
27 |
|
28 |
+
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
29 |
+
|
30 |
+
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
|
31 |
+
|
32 |
+
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
33 |
+
|
34 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
35 |
+
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
|
|
41 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
42 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
43 |
|
44 |
+
|
45 |
+
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
46 |
+
tesseract_ocr_option = "Quick image analysis - typed text"
|
47 |
+
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
48 |
+
|
49 |
+
local_pii_detector = "Local"
|
50 |
+
aws_pii_detector = "AWS Comprehend"
|
51 |
+
|
52 |
+
if RUN_AWS_FUNCTIONS == "1":
|
53 |
+
default_ocr_val = textract_option
|
54 |
+
default_pii_detector = aws_pii_detector
|
55 |
+
else:
|
56 |
+
default_ocr_val = text_ocr_option
|
57 |
+
default_pii_detector = local_pii_detector
|
58 |
+
|
59 |
# Create the gradio interface
|
60 |
app = gr.Blocks(theme = gr.themes.Base())
|
61 |
|
|
|
130 |
with gr.Tab("PDFs/images"):
|
131 |
with gr.Accordion("Redact document", open = True):
|
132 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
133 |
+
in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = text_ocr_option, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
134 |
+
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
135 |
+
|
136 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
137 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
138 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
|
|
224 |
with gr.Row():
|
225 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
226 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
227 |
+
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
|
232 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
|
|
233 |
with gr.Row():
|
|
|
|
|
|
|
234 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
235 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
236 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
237 |
+
|
238 |
+
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
|
239 |
+
|
240 |
+
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
|
241 |
+
|
242 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
243 |
+
#with gr.Row():
|
244 |
+
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
245 |
+
|
246 |
+
|
247 |
+
with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
|
248 |
+
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
249 |
+
|
250 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
251 |
|
252 |
# If a custom allow list is uploaded
|
253 |
in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
|
|
259 |
|
260 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
261 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
262 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
263 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
264 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
265 |
|
266 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
267 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
268 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
269 |
|
270 |
# If a file has been completed, the function will continue onto the next document
|
|
|
350 |
|
351 |
if __name__ == "__main__":
|
352 |
if os.environ['COGNITO_AUTH'] == "1":
|
353 |
+
app.queue(max_size=5).launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
|
354 |
else:
|
355 |
+
app.queue(max_size=5).launch(show_error=True, inbrowser=True, max_file_size='100mb')
|
356 |
|
357 |
|
358 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
tools/aws_functions.py
CHANGED
@@ -7,24 +7,22 @@ from tools.helper_functions import get_or_create_env_var
|
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
9 |
|
10 |
-
# Get AWS credentials
|
11 |
bucket_name=""
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
17 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
18 |
print(f'The value of AWS_REGION is {AWS_REGION}')
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
print(e)
|
26 |
|
27 |
-
|
28 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
29 |
sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
|
30 |
response = sts.get_caller_identity()
|
@@ -37,14 +35,28 @@ if aws_var_val == "1":
|
|
37 |
|
38 |
return assumed_role_arn, assumed_role_name
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
try:
|
41 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
42 |
|
43 |
print("Assumed Role ARN:", assumed_role_arn)
|
44 |
print("Assumed Role Name:", assumed_role_name)
|
|
|
45 |
except Exception as e:
|
46 |
print(e)
|
47 |
|
|
|
|
|
|
|
48 |
# Download direct from S3 - requires login credentials
|
49 |
def download_file_from_s3(bucket_name, key, local_file_path):
|
50 |
|
|
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
9 |
|
10 |
+
# Get AWS credentials
|
11 |
bucket_name=""
|
12 |
+
|
13 |
+
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
|
14 |
+
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
|
|
|
15 |
|
16 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
17 |
print(f'The value of AWS_REGION is {AWS_REGION}')
|
18 |
|
19 |
+
try:
|
20 |
+
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
|
21 |
+
except Exception as e:
|
22 |
+
print(e)
|
23 |
+
comprehend_client = ""
|
|
|
24 |
|
25 |
+
def get_assumed_role_info():
|
26 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
27 |
sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
|
28 |
response = sts.get_caller_identity()
|
|
|
35 |
|
36 |
return assumed_role_arn, assumed_role_name
|
37 |
|
38 |
+
if RUN_AWS_FUNCTIONS == "1":
|
39 |
+
try:
|
40 |
+
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
41 |
+
session = boto3.Session()
|
42 |
+
# Initialize the Boto3 client for Comprehend
|
43 |
+
|
44 |
+
|
45 |
+
except Exception as e:
|
46 |
+
print(e)
|
47 |
+
|
48 |
try:
|
49 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
50 |
|
51 |
print("Assumed Role ARN:", assumed_role_arn)
|
52 |
print("Assumed Role Name:", assumed_role_name)
|
53 |
+
|
54 |
except Exception as e:
|
55 |
print(e)
|
56 |
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
# Download direct from S3 - requires login credentials
|
61 |
def download_file_from_s3(bucket_name, key, local_file_path):
|
62 |
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -10,6 +10,8 @@ from PIL import ImageDraw, ImageFont, Image
|
|
10 |
from typing import Optional, Tuple, Union
|
11 |
from copy import deepcopy
|
12 |
from tools.helper_functions import clean_unicode_text
|
|
|
|
|
13 |
#import string # Import string to get a list of common punctuation characters
|
14 |
|
15 |
@dataclass
|
@@ -459,6 +461,8 @@ class CustomImageAnalyzerEngine:
|
|
459 |
self,
|
460 |
line_level_ocr_results: List[OCRResult],
|
461 |
ocr_results_with_children: Dict[str, Dict],
|
|
|
|
|
462 |
**text_analyzer_kwargs
|
463 |
) -> List[CustomImageRecognizerResult]:
|
464 |
# Define English as default language, if not specified
|
@@ -472,10 +476,34 @@ class CustomImageAnalyzerEngine:
|
|
472 |
|
473 |
combined_results = []
|
474 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
|
|
|
|
|
|
475 |
# Analyze each OCR result (line) individually
|
476 |
-
|
477 |
-
|
478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
479 |
|
480 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
481 |
child_level_key = list(ocr_results_with_children.keys())[i]
|
|
|
10 |
from typing import Optional, Tuple, Union
|
11 |
from copy import deepcopy
|
12 |
from tools.helper_functions import clean_unicode_text
|
13 |
+
from tools.aws_functions import comprehend_client
|
14 |
+
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
15 |
#import string # Import string to get a list of common punctuation characters
|
16 |
|
17 |
@dataclass
|
|
|
461 |
self,
|
462 |
line_level_ocr_results: List[OCRResult],
|
463 |
ocr_results_with_children: Dict[str, Dict],
|
464 |
+
chosen_redact_comprehend_entities:List[str],
|
465 |
+
pii_identification_method:str="Local",
|
466 |
**text_analyzer_kwargs
|
467 |
) -> List[CustomImageRecognizerResult]:
|
468 |
# Define English as default language, if not specified
|
|
|
476 |
|
477 |
combined_results = []
|
478 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
479 |
+
|
480 |
+
analyzer_result = []
|
481 |
+
|
482 |
# Analyze each OCR result (line) individually
|
483 |
+
|
484 |
+
if pii_identification_method == "Local":
|
485 |
+
analyzer_result = self.analyzer_engine.analyze(
|
486 |
+
text=line_level_ocr_result.text, **text_analyzer_kwargs
|
487 |
+
)
|
488 |
+
|
489 |
+
elif pii_identification_method == "AWS Comprehend":
|
490 |
+
|
491 |
+
# Call the detect_pii_entities method
|
492 |
+
response = comprehend_client.detect_pii_entities(
|
493 |
+
Text=line_level_ocr_result.text,
|
494 |
+
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
495 |
+
)
|
496 |
+
|
497 |
+
for result in response["Entities"]:
|
498 |
+
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
499 |
+
|
500 |
+
if result_text not in allow_list:
|
501 |
+
|
502 |
+
if result.get("Type") in chosen_redact_comprehend_entities:
|
503 |
+
|
504 |
+
recogniser_entity = recognizer_result_from_dict(result)
|
505 |
+
analyzer_result.append(recogniser_entity)
|
506 |
+
|
507 |
|
508 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
509 |
child_level_key = list(ocr_results_with_children.keys())[i]
|
tools/data_anonymise.py
CHANGED
@@ -23,6 +23,27 @@ fake = Faker("en_UK")
|
|
23 |
def fake_first_name(x):
|
24 |
return fake.first_name()
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
|
27 |
output = []
|
28 |
|
|
|
23 |
def fake_first_name(x):
|
24 |
return fake.first_name()
|
25 |
|
26 |
+
def initial_clean(text):
|
27 |
+
#### Some of my cleaning functions
|
28 |
+
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
29 |
+
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
30 |
+
non_ascii_pattern = r'[^\x00-\x7F]+'
|
31 |
+
multiple_spaces_regex = r'\s{2,}'
|
32 |
+
|
33 |
+
# Define a list of patterns and their replacements
|
34 |
+
patterns = [
|
35 |
+
(html_pattern_regex, ' '),
|
36 |
+
(html_start_pattern_end_dots_regex, ' '),
|
37 |
+
(non_ascii_pattern, ' '),
|
38 |
+
(multiple_spaces_regex, ' ')
|
39 |
+
]
|
40 |
+
|
41 |
+
# Apply each regex replacement
|
42 |
+
for pattern, replacement in patterns:
|
43 |
+
text = re.sub(pattern, replacement, text)
|
44 |
+
|
45 |
+
return text
|
46 |
+
|
47 |
def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
|
48 |
output = []
|
49 |
|
tools/file_redaction.py
CHANGED
@@ -24,13 +24,17 @@ import gradio as gr
|
|
24 |
from gradio import Progress
|
25 |
from collections import defaultdict # For efficient grouping
|
26 |
|
|
|
|
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
from tools.file_conversion import process_file
|
29 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
|
31 |
-
from tools.file_conversion import process_file, is_pdf,
|
32 |
-
from tools.data_anonymise import generate_decision_process_output
|
33 |
-
from tools.aws_textract import analyse_page_with_textract,
|
|
|
|
|
34 |
|
35 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
36 |
|
@@ -62,12 +66,12 @@ def sum_numbers_before_seconds(string:str):
|
|
62 |
|
63 |
return sum_of_numbers
|
64 |
|
65 |
-
|
66 |
def choose_and_run_redactor(file_paths:List[str],
|
67 |
prepared_pdf_file_paths:List[str],
|
68 |
prepared_pdf_image_paths:List[str],
|
69 |
language:str,
|
70 |
chosen_redact_entities:List[str],
|
|
|
71 |
in_redact_method:str,
|
72 |
in_allow_list:List[List[str]]=None,
|
73 |
latest_file_completed:int=0,
|
@@ -86,6 +90,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
86 |
pymupdf_doc=[],
|
87 |
current_loop_page:int=0,
|
88 |
page_break_return:bool=False,
|
|
|
89 |
progress=gr.Progress(track_tqdm=True)):
|
90 |
'''
|
91 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -94,7 +99,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
94 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
95 |
- prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
96 |
- language (str): The language of the text in the files.
|
97 |
-
- chosen_redact_entities (List[str]): A list of entity types to redact from the files.
|
|
|
98 |
- in_redact_method (str): The method to use for redaction.
|
99 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
100 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
@@ -113,6 +119,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
113 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
114 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
115 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
|
|
116 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
117 |
|
118 |
The function returns a redacted document along with processing logs.
|
@@ -121,12 +128,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
121 |
tic = time.perf_counter()
|
122 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
123 |
|
|
|
124 |
# If this is the first time around, set variables to 0/blank
|
125 |
if first_loop_state==True:
|
126 |
print("First_loop_state is True")
|
127 |
latest_file_completed = 0
|
128 |
current_loop_page = 0
|
129 |
-
#out_message = []
|
130 |
out_file_paths = []
|
131 |
estimate_total_processing_time = 0
|
132 |
estimated_time_taken_state = 0
|
@@ -136,10 +143,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
current_loop_page = 0
|
137 |
|
138 |
|
139 |
-
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
140 |
-
#if isinstance(out_message, str):
|
141 |
-
# out_message = [out_message]
|
142 |
-
|
143 |
if not out_file_paths:
|
144 |
out_file_paths = []
|
145 |
|
@@ -152,11 +155,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
152 |
else:
|
153 |
number_of_files = len(file_paths)
|
154 |
|
155 |
-
|
156 |
-
print("\nIn choose_and_run_redactor function, latest_file_completed is:", latest_file_completed)
|
157 |
-
print("current_loop_page is:", current_loop_page)
|
158 |
-
|
159 |
-
|
160 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
161 |
if latest_file_completed >= number_of_files:
|
162 |
|
@@ -242,7 +240,26 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
242 |
|
243 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
244 |
|
245 |
-
pymupdf_doc,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
# Save Textract request metadata (if exists)
|
248 |
if new_request_metadata:
|
@@ -260,7 +277,21 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
260 |
# Analyse text-based pdf
|
261 |
print('Redacting file as text-based PDF')
|
262 |
|
263 |
-
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
|
265 |
else:
|
266 |
out_message = "No redaction method selected"
|
@@ -287,27 +318,37 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
287 |
pymupdf_doc.save(out_image_file_path)
|
288 |
|
289 |
out_file_paths.append(out_image_file_path)
|
|
|
290 |
if logging_file_paths:
|
291 |
log_files_output_paths.extend(logging_file_paths)
|
292 |
|
293 |
-
#if isinstance(out_message, list):
|
294 |
-
# out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
295 |
-
|
296 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
297 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
298 |
-
log_files_output_paths.append(logs_output_file_name)
|
|
|
299 |
|
300 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
301 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
302 |
-
log_files_output_paths.append(all_text_output_file_name)
|
|
|
303 |
|
304 |
# Make a combined message for the file
|
305 |
if isinstance(out_message, list):
|
306 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
307 |
else: combined_out_message = out_message
|
308 |
|
|
|
|
|
|
|
|
|
309 |
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
310 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
# Increase latest file completed count unless we are at the last file
|
313 |
# if latest_file_completed != len(file_paths):
|
@@ -348,15 +389,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
348 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
349 |
else: combined_out_message = out_message
|
350 |
|
351 |
-
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
352 |
-
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
353 |
-
|
354 |
-
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
355 |
-
print("Estimated total processing time:", str(estimate_total_processing_time))
|
356 |
-
|
357 |
-
toc = time.perf_counter()
|
358 |
-
time_taken = toc - tic
|
359 |
-
estimated_time_taken_state = estimated_time_taken_state + time_taken
|
360 |
|
361 |
# If textract requests made, write to logging file
|
362 |
if all_request_metadata:
|
@@ -392,10 +424,6 @@ def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
|
392 |
rect_height = pymupdf_page.rect.height
|
393 |
rect_width = pymupdf_page.rect.width
|
394 |
|
395 |
-
# Calculate scaling factors
|
396 |
-
#scale_height = rect_height / mediabox_height if mediabox_height else 1
|
397 |
-
#scale_width = rect_width / mediabox_width if mediabox_width else 1
|
398 |
-
|
399 |
# Adjust coordinates based on scaling factors
|
400 |
page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
|
401 |
page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
|
@@ -504,16 +532,13 @@ def move_page_info(file_path: str) -> str:
|
|
504 |
|
505 |
return new_file_path
|
506 |
|
507 |
-
def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None)
|
508 |
|
509 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
510 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
511 |
rect_height = page.rect.height
|
512 |
rect_width = page.rect.width
|
513 |
|
514 |
-
#print("page_rect_height:", page.rect.height)
|
515 |
-
#print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
|
516 |
-
|
517 |
out_annotation_boxes = {}
|
518 |
all_image_annotation_boxes = []
|
519 |
image_path = ""
|
@@ -525,16 +550,11 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
525 |
image_path = image
|
526 |
image = Image.open(image_path)
|
527 |
|
528 |
-
#print("annotations_on_page:", annotations_on_page)
|
529 |
-
|
530 |
# Check if this is an object used in the Gradio Annotation component
|
531 |
if isinstance (annotations_on_page, dict):
|
532 |
annotations_on_page = annotations_on_page["boxes"]
|
533 |
-
#print("annotations on page:", annotations_on_page)
|
534 |
|
535 |
for annot in annotations_on_page:
|
536 |
-
#print("annot:", annot)
|
537 |
-
|
538 |
# Check if an Image recogniser result, or a Gradio annotation object
|
539 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
540 |
|
@@ -600,7 +620,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
600 |
rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
|
601 |
|
602 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
603 |
-
#print("rect_single_pixel_height:", rect_single_pixel_height)
|
604 |
page.add_redact_annot(rect_single_pixel_height)
|
605 |
|
606 |
# Set up drawing a black box over the whole rect
|
@@ -614,14 +633,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
614 |
"boxes": all_image_annotation_boxes
|
615 |
}
|
616 |
|
617 |
-
#print("out_annotation_boxes:", out_annotation_boxes)
|
618 |
-
|
619 |
page.apply_redactions(images=0, graphics=0)
|
620 |
page.clean_contents()
|
621 |
|
622 |
-
#print("Everything is fine at end of redact_page_with_pymupdf")
|
623 |
-
#print("\nout_annotation_boxes:", out_annotation_boxes)
|
624 |
-
|
625 |
return page, out_annotation_boxes
|
626 |
|
627 |
def bounding_boxes_overlap(box1, box2):
|
@@ -668,10 +682,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
668 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
669 |
|
670 |
# Calculate new dimensions for the merged box
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
reconstructed_bbox = CustomImageRecognizerResult(
|
676 |
bbox.entity_type,
|
677 |
bbox.start,
|
@@ -740,7 +750,29 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
740 |
|
741 |
return merged_bboxes
|
742 |
|
743 |
-
def redact_image_pdf(file_path:str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
|
745 |
'''
|
746 |
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
@@ -749,6 +781,7 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
749 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
750 |
- language (str): The language of the text in the PDF.
|
751 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
|
|
|
752 |
- allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
|
753 |
- is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
|
754 |
- page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
|
@@ -756,13 +789,19 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
756 |
- analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
|
757 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
|
758 |
- request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
|
759 |
-
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
760 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
|
|
|
|
|
|
|
|
|
|
|
|
761 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
762 |
-
-
|
|
|
763 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
764 |
|
765 |
-
The function returns a redacted PDF document.
|
766 |
'''
|
767 |
file_name = get_file_path_end(file_path)
|
768 |
fill = (0, 0, 0) # Fill colour
|
@@ -901,20 +940,31 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
901 |
|
902 |
# Step 2: Analyze text and identify PII
|
903 |
if chosen_redact_entities:
|
|
|
|
|
|
|
904 |
redaction_bboxes = image_analyser.analyze_text(
|
905 |
line_level_ocr_results,
|
906 |
line_level_ocr_results_with_children,
|
|
|
|
|
907 |
language=language,
|
908 |
entities=chosen_redact_entities,
|
909 |
allow_list=allow_list,
|
910 |
-
score_threshold=score_threshold
|
911 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
912 |
else:
|
913 |
redaction_bboxes = []
|
914 |
-
|
915 |
-
#print("\nsignature_recogniser_boxes:", signature_recogniser_results)
|
916 |
-
#print("\nhandwriting_recogniser_boxes:", handwriting_recogniser_results)
|
917 |
-
#print("\nredaction_bboxes:", redaction_bboxes)
|
918 |
|
919 |
if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
920 |
elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
@@ -1049,53 +1099,6 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1049 |
return characters
|
1050 |
return []
|
1051 |
|
1052 |
-
|
1053 |
-
def initial_clean(text):
|
1054 |
-
#### Some of my cleaning functions
|
1055 |
-
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
1056 |
-
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
1057 |
-
non_ascii_pattern = r'[^\x00-\x7F]+'
|
1058 |
-
multiple_spaces_regex = r'\s{2,}'
|
1059 |
-
|
1060 |
-
# Define a list of patterns and their replacements
|
1061 |
-
patterns = [
|
1062 |
-
(html_pattern_regex, ' '),
|
1063 |
-
(html_start_pattern_end_dots_regex, ' '),
|
1064 |
-
(non_ascii_pattern, ' '),
|
1065 |
-
(multiple_spaces_regex, ' ')
|
1066 |
-
]
|
1067 |
-
|
1068 |
-
# Apply each regex replacement
|
1069 |
-
for pattern, replacement in patterns:
|
1070 |
-
text = re.sub(pattern, replacement, text)
|
1071 |
-
|
1072 |
-
return text
|
1073 |
-
|
1074 |
-
|
1075 |
-
def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
|
1076 |
-
'''
|
1077 |
-
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
|
1078 |
-
'''
|
1079 |
-
|
1080 |
-
analyser_results = []
|
1081 |
-
|
1082 |
-
#text_to_analyse = initial_clean(text_container.text).strip()
|
1083 |
-
|
1084 |
-
text_to_analyse = initial_clean(text_container.text)
|
1085 |
-
|
1086 |
-
if chosen_redact_entities:
|
1087 |
-
#print("Running Presidio analyze method. text_to_analyse:", text_to_analyse)
|
1088 |
-
|
1089 |
-
analyser_results = nlp_analyser.analyze(text=text_to_analyse,
|
1090 |
-
language=language,
|
1091 |
-
entities=chosen_redact_entities,
|
1092 |
-
score_threshold=score_threshold,
|
1093 |
-
return_decision_process=True,
|
1094 |
-
allow_list=allow_list)
|
1095 |
-
|
1096 |
-
return analyser_results
|
1097 |
-
|
1098 |
-
|
1099 |
def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1100 |
'''
|
1101 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
@@ -1292,6 +1295,53 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1292 |
|
1293 |
return analysed_bounding_boxes
|
1294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1295 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1296 |
decision_process_table = pd.DataFrame()
|
1297 |
|
@@ -1335,6 +1385,7 @@ def redact_text_pdf(
|
|
1335 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
1336 |
language: str, # Language of the PDF content
|
1337 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
|
|
1338 |
allow_list: List[str] = None, # Optional list of allowed entities
|
1339 |
page_min: int = 0, # Minimum page number to start redaction
|
1340 |
page_max: int = 999, # Maximum page number to end redaction
|
@@ -1345,11 +1396,12 @@ def redact_text_pdf(
|
|
1345 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1346 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1347 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
|
|
1348 |
page_break_val: int = int(page_break_value), # Value for page break
|
1349 |
-
max_time: int = int(max_time_value),
|
1350 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
1351 |
):
|
1352 |
-
|
1353 |
'''
|
1354 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1355 |
|
@@ -1358,19 +1410,20 @@ def redact_text_pdf(
|
|
1358 |
- prepared_pdf_image_path: Path to the prepared PDF image for redaction
|
1359 |
- language: Language of the PDF content
|
1360 |
- chosen_redact_entities: List of entities to be redacted
|
|
|
1361 |
- allow_list: Optional list of allowed entities
|
1362 |
- page_min: Minimum page number to start redaction
|
1363 |
- page_max: Maximum page number to end redaction
|
1364 |
- analysis_type: Type of analysis to perform
|
1365 |
- current_loop_page: Current page being processed in the loop
|
1366 |
- page_break_return: Flag to indicate if a page break should be returned
|
1367 |
-
- images: List of images (not used in this function)
|
1368 |
- annotations_all_pages: List of annotations across all pages
|
1369 |
- all_line_level_ocr_results_df: DataFrame for OCR results
|
1370 |
- all_decision_process_table: DataFrame for decision process table
|
1371 |
- pymupdf_doc: List of PyMuPDF documents
|
|
|
1372 |
- page_break_val: Value for page break
|
1373 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1374 |
- progress: Progress tracking object
|
1375 |
'''
|
1376 |
|
@@ -1393,7 +1446,6 @@ def redact_text_pdf(
|
|
1393 |
if current_loop_page == 0: page_loop_start = 0
|
1394 |
else: page_loop_start = current_loop_page
|
1395 |
|
1396 |
-
#progress_bar = progress.tqdm(range(current_loop_page, number_of_pages), unit="pages", desc="Redacting pages")
|
1397 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1398 |
|
1399 |
#for page_no in range(0, number_of_pages):
|
@@ -1414,14 +1466,8 @@ def redact_text_pdf(
|
|
1414 |
image_annotations = {"image": image, "boxes": []}
|
1415 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1416 |
|
1417 |
-
#print("pymupdf page loaded")
|
1418 |
-
|
1419 |
-
#print("Page number is:", str(page_no + 1))
|
1420 |
-
|
1421 |
if page_min <= page_no < page_max:
|
1422 |
|
1423 |
-
#print("Page is in range of pages to redact")
|
1424 |
-
|
1425 |
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
1426 |
|
1427 |
page_analyser_results = []
|
@@ -1465,7 +1511,16 @@ def redact_text_pdf(
|
|
1465 |
text_line_analyser_result = []
|
1466 |
text_line_bounding_boxes = []
|
1467 |
|
1468 |
-
text_line_analyser_result =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1469 |
|
1470 |
# Merge bounding boxes for the line if multiple found close together
|
1471 |
if text_line_analyser_result:
|
|
|
24 |
from gradio import Progress
|
25 |
from collections import defaultdict # For efficient grouping
|
26 |
|
27 |
+
from presidio_analyzer import RecognizerResult
|
28 |
+
|
29 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
30 |
from tools.file_conversion import process_file
|
31 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
32 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
|
33 |
+
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
34 |
+
# from tools.data_anonymise import generate_decision_process_output
|
35 |
+
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
36 |
+
from tools.aws_functions import comprehend_client
|
37 |
+
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
38 |
|
39 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
40 |
|
|
|
66 |
|
67 |
return sum_of_numbers
|
68 |
|
|
|
69 |
def choose_and_run_redactor(file_paths:List[str],
|
70 |
prepared_pdf_file_paths:List[str],
|
71 |
prepared_pdf_image_paths:List[str],
|
72 |
language:str,
|
73 |
chosen_redact_entities:List[str],
|
74 |
+
chosen_redact_comprehend_entities:List[str],
|
75 |
in_redact_method:str,
|
76 |
in_allow_list:List[List[str]]=None,
|
77 |
latest_file_completed:int=0,
|
|
|
90 |
pymupdf_doc=[],
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
+
pii_identification_method:str="Local",
|
94 |
progress=gr.Progress(track_tqdm=True)):
|
95 |
'''
|
96 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
99 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
100 |
- prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
101 |
- language (str): The language of the text in the files.
|
102 |
+
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
103 |
+
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
104 |
- in_redact_method (str): The method to use for redaction.
|
105 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
106 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
|
|
119 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
120 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
121 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
122 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
123 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
124 |
|
125 |
The function returns a redacted document along with processing logs.
|
|
|
128 |
tic = time.perf_counter()
|
129 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
130 |
|
131 |
+
|
132 |
# If this is the first time around, set variables to 0/blank
|
133 |
if first_loop_state==True:
|
134 |
print("First_loop_state is True")
|
135 |
latest_file_completed = 0
|
136 |
current_loop_page = 0
|
|
|
137 |
out_file_paths = []
|
138 |
estimate_total_processing_time = 0
|
139 |
estimated_time_taken_state = 0
|
|
|
143 |
current_loop_page = 0
|
144 |
|
145 |
|
|
|
|
|
|
|
|
|
146 |
if not out_file_paths:
|
147 |
out_file_paths = []
|
148 |
|
|
|
155 |
else:
|
156 |
number_of_files = len(file_paths)
|
157 |
|
|
|
|
|
|
|
|
|
|
|
158 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
159 |
if latest_file_completed >= number_of_files:
|
160 |
|
|
|
240 |
|
241 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
242 |
|
243 |
+
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
|
244 |
+
prepared_pdf_image_paths,
|
245 |
+
language,
|
246 |
+
chosen_redact_entities,
|
247 |
+
chosen_redact_comprehend_entities,
|
248 |
+
in_allow_list_flat,
|
249 |
+
is_a_pdf,
|
250 |
+
page_min,
|
251 |
+
page_max,
|
252 |
+
in_redact_method,
|
253 |
+
handwrite_signature_checkbox,
|
254 |
+
"",
|
255 |
+
current_loop_page,
|
256 |
+
page_break_return,
|
257 |
+
prepared_pdf_image_paths,
|
258 |
+
annotations_all_pages,
|
259 |
+
all_line_level_ocr_results_df,
|
260 |
+
all_decision_process_table,
|
261 |
+
pymupdf_doc,
|
262 |
+
pii_identification_method)
|
263 |
|
264 |
# Save Textract request metadata (if exists)
|
265 |
if new_request_metadata:
|
|
|
277 |
# Analyse text-based pdf
|
278 |
print('Redacting file as text-based PDF')
|
279 |
|
280 |
+
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
281 |
+
prepared_pdf_image_paths,language,
|
282 |
+
chosen_redact_entities,
|
283 |
+
chosen_redact_comprehend_entities,
|
284 |
+
in_allow_list_flat,
|
285 |
+
page_min,
|
286 |
+
page_max,
|
287 |
+
"Simple text analysis - PDFs with selectable text",
|
288 |
+
current_loop_page,
|
289 |
+
page_break_return,
|
290 |
+
annotations_all_pages,
|
291 |
+
all_line_level_ocr_results_df,
|
292 |
+
all_decision_process_table,
|
293 |
+
pymupdf_doc,
|
294 |
+
pii_identification_method)
|
295 |
|
296 |
else:
|
297 |
out_message = "No redaction method selected"
|
|
|
318 |
pymupdf_doc.save(out_image_file_path)
|
319 |
|
320 |
out_file_paths.append(out_image_file_path)
|
321 |
+
|
322 |
if logging_file_paths:
|
323 |
log_files_output_paths.extend(logging_file_paths)
|
324 |
|
|
|
|
|
|
|
325 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
326 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
327 |
+
#log_files_output_paths.append(logs_output_file_name)
|
328 |
+
out_file_paths.append(logs_output_file_name)
|
329 |
|
330 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
331 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
332 |
+
#log_files_output_paths.append(all_text_output_file_name)
|
333 |
+
out_file_paths.append(all_text_output_file_name)
|
334 |
|
335 |
# Make a combined message for the file
|
336 |
if isinstance(out_message, list):
|
337 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
338 |
else: combined_out_message = out_message
|
339 |
|
340 |
+
toc = time.perf_counter()
|
341 |
+
time_taken = toc - tic
|
342 |
+
estimated_time_taken_state = estimated_time_taken_state + time_taken
|
343 |
+
|
344 |
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
345 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
346 |
+
|
347 |
+
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
348 |
+
print("Estimated total processing time:", str(estimate_total_processing_time))
|
349 |
+
|
350 |
+
#out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
351 |
+
#combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
352 |
|
353 |
# Increase latest file completed count unless we are at the last file
|
354 |
# if latest_file_completed != len(file_paths):
|
|
|
389 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
390 |
else: combined_out_message = out_message
|
391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
# If textract requests made, write to logging file
|
394 |
if all_request_metadata:
|
|
|
424 |
rect_height = pymupdf_page.rect.height
|
425 |
rect_width = pymupdf_page.rect.width
|
426 |
|
|
|
|
|
|
|
|
|
427 |
# Adjust coordinates based on scaling factors
|
428 |
page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
|
429 |
page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
|
|
|
532 |
|
533 |
return new_file_path
|
534 |
|
535 |
+
def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
536 |
|
537 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
538 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
539 |
rect_height = page.rect.height
|
540 |
rect_width = page.rect.width
|
541 |
|
|
|
|
|
|
|
542 |
out_annotation_boxes = {}
|
543 |
all_image_annotation_boxes = []
|
544 |
image_path = ""
|
|
|
550 |
image_path = image
|
551 |
image = Image.open(image_path)
|
552 |
|
|
|
|
|
553 |
# Check if this is an object used in the Gradio Annotation component
|
554 |
if isinstance (annotations_on_page, dict):
|
555 |
annotations_on_page = annotations_on_page["boxes"]
|
|
|
556 |
|
557 |
for annot in annotations_on_page:
|
|
|
|
|
558 |
# Check if an Image recogniser result, or a Gradio annotation object
|
559 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
560 |
|
|
|
620 |
rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
|
621 |
|
622 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
|
|
623 |
page.add_redact_annot(rect_single_pixel_height)
|
624 |
|
625 |
# Set up drawing a black box over the whole rect
|
|
|
633 |
"boxes": all_image_annotation_boxes
|
634 |
}
|
635 |
|
|
|
|
|
636 |
page.apply_redactions(images=0, graphics=0)
|
637 |
page.clean_contents()
|
638 |
|
|
|
|
|
|
|
639 |
return page, out_annotation_boxes
|
640 |
|
641 |
def bounding_boxes_overlap(box1, box2):
|
|
|
682 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
683 |
|
684 |
# Calculate new dimensions for the merged box
|
|
|
|
|
|
|
|
|
685 |
reconstructed_bbox = CustomImageRecognizerResult(
|
686 |
bbox.entity_type,
|
687 |
bbox.start,
|
|
|
750 |
|
751 |
return merged_bboxes
|
752 |
|
753 |
+
def redact_image_pdf(file_path:str,
|
754 |
+
prepared_pdf_file_paths:List[str],
|
755 |
+
language:str,
|
756 |
+
chosen_redact_entities:List[str],
|
757 |
+
chosen_redact_comprehend_entities:List[str],
|
758 |
+
allow_list:List[str]=None,
|
759 |
+
is_a_pdf:bool=True,
|
760 |
+
page_min:int=0,
|
761 |
+
page_max:int=999,
|
762 |
+
analysis_type:str="Quick image analysis - typed text",
|
763 |
+
handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
|
764 |
+
request_metadata:str="", current_loop_page:int=0,
|
765 |
+
page_break_return:bool=False,
|
766 |
+
images=[],
|
767 |
+
annotations_all_pages:List=[],
|
768 |
+
all_line_level_ocr_results_df = pd.DataFrame(),
|
769 |
+
all_decision_process_table = pd.DataFrame(),
|
770 |
+
pymupdf_doc = [],
|
771 |
+
pii_identification_method:str="Local",
|
772 |
+
page_break_val:int=int(page_break_value),
|
773 |
+
logging_file_paths:List=[],
|
774 |
+
max_time:int=int(max_time_value),
|
775 |
+
progress=Progress(track_tqdm=True)):
|
776 |
|
777 |
'''
|
778 |
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
|
|
781 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
782 |
- language (str): The language of the text in the PDF.
|
783 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
|
784 |
+
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from the list allowed by the AWS Comprehend service.
|
785 |
- allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
|
786 |
- is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
|
787 |
- page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
|
|
|
789 |
- analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
|
790 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
|
791 |
- request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
|
|
|
792 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
793 |
+
- images (list, optional): List of image objects for each PDF page.
|
794 |
+
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
795 |
+
- all_line_level_ocr_results_df (pd.DataFrame(), optional): All line level OCR results for the document as a Pandas dataframe,
|
796 |
+
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
797 |
+
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
798 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
799 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
800 |
+
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
801 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
802 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
803 |
|
804 |
+
The function returns a fully or partially-redacted PDF document.
|
805 |
'''
|
806 |
file_name = get_file_path_end(file_path)
|
807 |
fill = (0, 0, 0) # Fill colour
|
|
|
940 |
|
941 |
# Step 2: Analyze text and identify PII
|
942 |
if chosen_redact_entities:
|
943 |
+
|
944 |
+
pii_identification_method= "AWS Comprehend" #"Local"
|
945 |
+
|
946 |
redaction_bboxes = image_analyser.analyze_text(
|
947 |
line_level_ocr_results,
|
948 |
line_level_ocr_results_with_children,
|
949 |
+
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
950 |
+
pii_identification_method = pii_identification_method,
|
951 |
language=language,
|
952 |
entities=chosen_redact_entities,
|
953 |
allow_list=allow_list,
|
954 |
+
score_threshold=score_threshold
|
955 |
+
)
|
956 |
+
|
957 |
+
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
958 |
+
# line_level_ocr_results_with_children,
|
959 |
+
# language,
|
960 |
+
# chosen_redact_entities,
|
961 |
+
# allow_list,
|
962 |
+
# score_threshold,
|
963 |
+
# pii_identification_method)
|
964 |
+
|
965 |
else:
|
966 |
redaction_bboxes = []
|
967 |
+
|
|
|
|
|
|
|
968 |
|
969 |
if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
970 |
elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
|
|
1099 |
return characters
|
1100 |
return []
|
1101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1102 |
def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1103 |
'''
|
1104 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
|
|
1295 |
|
1296 |
return analysed_bounding_boxes
|
1297 |
|
1298 |
+
def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
    '''
    Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.

    Parameters:
    - text_container: OCRResult whose .text attribute is analysed
    - language: language code passed to the analyser / Comprehend (e.g. "en")
    - chosen_redact_entities: entity types to detect with the local (Presidio/spacy) analyser
    - chosen_redact_comprehend_entities: entity types to keep from AWS Comprehend results
    - score_threshold: minimum confidence score for local analyser matches
    - allow_list: terms that must never be flagged as PII (may be None/empty)
    - pii_identification_method: "Local" (Presidio/spacy) or "AWS Comprehend"

    Returns a list of RecognizerResult objects (empty if no entities chosen,
    an unknown method is given, or nothing is found).
    '''

    analyser_results = []

    #text_to_analyse = initial_clean(text_container.text).strip()

    text_to_analyse = text_container.text

    if chosen_redact_entities:
        if pii_identification_method == "Local":
            analyser_results = nlp_analyser.analyze(text=text_to_analyse,
                                                    language=language,
                                                    entities=chosen_redact_entities,
                                                    score_threshold=score_threshold,
                                                    return_decision_process=True,
                                                    allow_list=allow_list)

        elif pii_identification_method == "AWS Comprehend":

            # Call the detect_pii_entities method
            response = comprehend_client.detect_pii_entities(
                Text=text_to_analyse,
                LanguageCode=language # Specify the language of the text
            )

            # Guard against allow_list being None (callers may pass no allow list)
            safe_allow_list = allow_list if allow_list else []

            for result in response["Entities"]:

                # Comprehend's EndOffset is exclusive (the character immediately
                # after the entity), so slice without +1 to get the exact match text
                result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]]

                if result_text not in safe_allow_list:
                    if result.get("Type") in chosen_redact_comprehend_entities:

                        recogniser_entity = recognizer_result_from_dict(result)

                        analyser_results.append(recogniser_entity)

        else:
            # Unknown pii_identification_method - return no results
            analyser_results = []
    else:
        analyser_results = []


    return analyser_results
|
1344 |
+
|
1345 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1346 |
decision_process_table = pd.DataFrame()
|
1347 |
|
|
|
1385 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
1386 |
language: str, # Language of the PDF content
|
1387 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
1388 |
+
chosen_redact_comprehend_entities: List[str],
|
1389 |
allow_list: List[str] = None, # Optional list of allowed entities
|
1390 |
page_min: int = 0, # Minimum page number to start redaction
|
1391 |
page_max: int = 999, # Maximum page number to end redaction
|
|
|
1396 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1397 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1398 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1399 |
+
pii_identification_method: str = "Local",
|
1400 |
page_break_val: int = int(page_break_value), # Value for page break
|
1401 |
+
max_time: int = int(max_time_value),
|
1402 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
1403 |
):
|
1404 |
+
|
1405 |
'''
|
1406 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1407 |
|
|
|
1410 |
- prepared_pdf_image_path: Path to the prepared PDF image for redaction
|
1411 |
- language: Language of the PDF content
|
1412 |
- chosen_redact_entities: List of entities to be redacted
|
1413 |
+
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
1414 |
- allow_list: Optional list of allowed entities
|
1415 |
- page_min: Minimum page number to start redaction
|
1416 |
- page_max: Maximum page number to end redaction
|
1417 |
- analysis_type: Type of analysis to perform
|
1418 |
- current_loop_page: Current page being processed in the loop
|
1419 |
- page_break_return: Flag to indicate if a page break should be returned
|
|
|
1420 |
- annotations_all_pages: List of annotations across all pages
|
1421 |
- all_line_level_ocr_results_df: DataFrame for OCR results
|
1422 |
- all_decision_process_table: DataFrame for decision process table
|
1423 |
- pymupdf_doc: List of PyMuPDF documents
|
1424 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1425 |
- page_break_val: Value for page break
|
1426 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1427 |
- progress: Progress tracking object
|
1428 |
'''
|
1429 |
|
|
|
1446 |
if current_loop_page == 0: page_loop_start = 0
|
1447 |
else: page_loop_start = current_loop_page
|
1448 |
|
|
|
1449 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1450 |
|
1451 |
#for page_no in range(0, number_of_pages):
|
|
|
1466 |
image_annotations = {"image": image, "boxes": []}
|
1467 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1468 |
|
|
|
|
|
|
|
|
|
1469 |
if page_min <= page_no < page_max:
|
1470 |
|
|
|
|
|
1471 |
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
1472 |
|
1473 |
page_analyser_results = []
|
|
|
1511 |
text_line_analyser_result = []
|
1512 |
text_line_bounding_boxes = []
|
1513 |
|
1514 |
+
# text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
|
1515 |
+
|
1516 |
+
#pii_identification_method="AWS Comprehend"#"Local"
|
1517 |
+
|
1518 |
+
if chosen_redact_entities:
|
1519 |
+
|
1520 |
+
text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
1521 |
+
|
1522 |
+
else:
|
1523 |
+
text_line_analyser_result = []
|
1524 |
|
1525 |
# Merge bounding boxes for the line if multiple found close together
|
1526 |
if text_line_analyser_result:
|
tools/presidio_analyzer_custom.py
CHANGED
@@ -5,7 +5,28 @@ from tqdm import tqdm
|
|
5 |
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def analyze_iterator_custom(
|
11 |
self,
|
|
|
5 |
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
+
def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
    """
    Create a RecognizerResult from an AWS Comprehend PII entity dictionary.

    :param data: a single entity from Comprehend's detect_pii_entities
        response, e.g. {
        "Type": "NAME",
        "BeginOffset": 24,
        "EndOffset": 32,
        "Score": 0.8
    }
    :return: RecognizerResult with no analysis explanation or recognition
        metadata attached. Missing keys yield None fields rather than raising.
    """

    # Map Comprehend's key names onto RecognizerResult's positional arguments;
    # .get() is used so a malformed entity produces None fields, not a KeyError
    entity_type = data.get("Type")
    start = data.get("BeginOffset")
    end = data.get("EndOffset")
    score = data.get("Score")
    analysis_explanation = None
    recognition_metadata = None

    return RecognizerResult(entity_type, start, end, score, analysis_explanation, recognition_metadata)
|
30 |
|
31 |
def analyze_iterator_custom(
|
32 |
self,
|