Commit
·
f0f9378
1
Parent(s):
aaf0acb
Added support for AWS Comprehend for PII identification. OCR and detection results now written to main output
Browse files- app.py +48 -16
- tools/aws_functions.py +24 -12
- tools/custom_image_analyser_engine.py +31 -3
- tools/data_anonymise.py +21 -0
- tools/file_redaction.py +178 -123
- tools/presidio_analyzer_custom.py +22 -1
app.py
CHANGED
@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
-
from tools.aws_functions import upload_file_to_s3
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
@@ -25,8 +25,14 @@ add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
|
|
25 |
|
26 |
ensure_output_folder_exists()
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
|
|
30 |
language = 'en'
|
31 |
|
32 |
host_name = socket.gethostname()
|
@@ -35,6 +41,21 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
|
35 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
36 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# Create the gradio interface
|
39 |
app = gr.Blocks(theme = gr.themes.Base())
|
40 |
|
@@ -109,7 +130,9 @@ with app:
|
|
109 |
with gr.Tab("PDFs/images"):
|
110 |
with gr.Accordion("Redact document", open = True):
|
111 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
112 |
-
in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value =
|
|
|
|
|
113 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
114 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
115 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
@@ -201,21 +224,30 @@ with app:
|
|
201 |
with gr.Row():
|
202 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
203 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
|
209 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
210 |
-
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
|
211 |
with gr.Row():
|
212 |
-
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
|
213 |
-
# Upload 'Allow list' for terms not to be redacted
|
214 |
-
with gr.Row():
|
215 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
216 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
217 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
# If a custom allow list is uploaded
|
221 |
in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
@@ -227,12 +259,12 @@ with app:
|
|
227 |
|
228 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
229 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
230 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
|
231 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
232 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
233 |
|
234 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
235 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
|
236 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
237 |
|
238 |
# If a file has been completed, the function will continue onto the next document
|
@@ -318,9 +350,9 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
|
318 |
|
319 |
if __name__ == "__main__":
|
320 |
if os.environ['COGNITO_AUTH'] == "1":
|
321 |
-
app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
|
322 |
else:
|
323 |
-
app.queue().launch(show_error=True, inbrowser=True, max_file_size='100mb')
|
324 |
|
325 |
|
326 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
+
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
|
|
25 |
|
26 |
ensure_output_folder_exists()
|
27 |
|
28 |
+
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
29 |
+
|
30 |
+
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
|
31 |
+
|
32 |
+
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
33 |
+
|
34 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
35 |
+
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
|
|
41 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
42 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
43 |
|
44 |
+
|
45 |
+
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
46 |
+
tesseract_ocr_option = "Quick image analysis - typed text"
|
47 |
+
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
48 |
+
|
49 |
+
local_pii_detector = "Local"
|
50 |
+
aws_pii_detector = "AWS Comprehend"
|
51 |
+
|
52 |
+
if RUN_AWS_FUNCTIONS == "1":
|
53 |
+
default_ocr_val = textract_option
|
54 |
+
default_pii_detector = aws_pii_detector
|
55 |
+
else:
|
56 |
+
default_ocr_val = text_ocr_option
|
57 |
+
default_pii_detector = local_pii_detector
|
58 |
+
|
59 |
# Create the gradio interface
|
60 |
app = gr.Blocks(theme = gr.themes.Base())
|
61 |
|
|
|
130 |
with gr.Tab("PDFs/images"):
|
131 |
with gr.Accordion("Redact document", open = True):
|
132 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
133 |
+
in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = text_ocr_option, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
134 |
+
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
135 |
+
|
136 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
137 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
138 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
|
|
224 |
with gr.Row():
|
225 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
226 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
227 |
+
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
|
232 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
|
|
233 |
with gr.Row():
|
|
|
|
|
|
|
234 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
235 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
236 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
237 |
+
|
238 |
+
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
|
239 |
+
|
240 |
+
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
|
241 |
+
|
242 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
243 |
+
#with gr.Row():
|
244 |
+
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
245 |
+
|
246 |
+
|
247 |
+
with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
|
248 |
+
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
249 |
+
|
250 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
251 |
|
252 |
# If a custom allow list is uploaded
|
253 |
in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
|
|
259 |
|
260 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
261 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
262 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
263 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
264 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
265 |
|
266 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
267 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
268 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
269 |
|
270 |
# If a file has been completed, the function will continue onto the next document
|
|
|
350 |
|
351 |
if __name__ == "__main__":
|
352 |
if os.environ['COGNITO_AUTH'] == "1":
|
353 |
+
app.queue(max_size=5).launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
|
354 |
else:
|
355 |
+
app.queue(max_size=5).launch(show_error=True, inbrowser=True, max_file_size='100mb')
|
356 |
|
357 |
|
358 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
tools/aws_functions.py
CHANGED
@@ -7,24 +7,22 @@ from tools.helper_functions import get_or_create_env_var
|
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
9 |
|
10 |
-
# Get AWS credentials
|
11 |
bucket_name=""
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
17 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
18 |
print(f'The value of AWS_REGION is {AWS_REGION}')
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
print(e)
|
26 |
|
27 |
-
|
28 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
29 |
sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
|
30 |
response = sts.get_caller_identity()
|
@@ -37,14 +35,28 @@ if aws_var_val == "1":
|
|
37 |
|
38 |
return assumed_role_arn, assumed_role_name
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
try:
|
41 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
42 |
|
43 |
print("Assumed Role ARN:", assumed_role_arn)
|
44 |
print("Assumed Role Name:", assumed_role_name)
|
|
|
45 |
except Exception as e:
|
46 |
print(e)
|
47 |
|
|
|
|
|
|
|
48 |
# Download direct from S3 - requires login credentials
|
49 |
def download_file_from_s3(bucket_name, key, local_file_path):
|
50 |
|
|
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
9 |
|
10 |
+
# Get AWS credentials
|
11 |
bucket_name=""
|
12 |
+
|
13 |
+
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
|
14 |
+
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
|
|
|
15 |
|
16 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
17 |
print(f'The value of AWS_REGION is {AWS_REGION}')
|
18 |
|
19 |
+
try:
|
20 |
+
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
|
21 |
+
except Exception as e:
|
22 |
+
print(e)
|
23 |
+
comprehend_client = ""
|
|
|
24 |
|
25 |
+
def get_assumed_role_info():
|
26 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
27 |
sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
|
28 |
response = sts.get_caller_identity()
|
|
|
35 |
|
36 |
return assumed_role_arn, assumed_role_name
|
37 |
|
38 |
+
if RUN_AWS_FUNCTIONS == "1":
|
39 |
+
try:
|
40 |
+
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
41 |
+
session = boto3.Session()
|
42 |
+
# Initialize the Boto3 client for Comprehend
|
43 |
+
|
44 |
+
|
45 |
+
except Exception as e:
|
46 |
+
print(e)
|
47 |
+
|
48 |
try:
|
49 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
50 |
|
51 |
print("Assumed Role ARN:", assumed_role_arn)
|
52 |
print("Assumed Role Name:", assumed_role_name)
|
53 |
+
|
54 |
except Exception as e:
|
55 |
print(e)
|
56 |
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
# Download direct from S3 - requires login credentials
|
61 |
def download_file_from_s3(bucket_name, key, local_file_path):
|
62 |
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -10,6 +10,8 @@ from PIL import ImageDraw, ImageFont, Image
|
|
10 |
from typing import Optional, Tuple, Union
|
11 |
from copy import deepcopy
|
12 |
from tools.helper_functions import clean_unicode_text
|
|
|
|
|
13 |
#import string # Import string to get a list of common punctuation characters
|
14 |
|
15 |
@dataclass
|
@@ -459,6 +461,8 @@ class CustomImageAnalyzerEngine:
|
|
459 |
self,
|
460 |
line_level_ocr_results: List[OCRResult],
|
461 |
ocr_results_with_children: Dict[str, Dict],
|
|
|
|
|
462 |
**text_analyzer_kwargs
|
463 |
) -> List[CustomImageRecognizerResult]:
|
464 |
# Define English as default language, if not specified
|
@@ -472,10 +476,34 @@ class CustomImageAnalyzerEngine:
|
|
472 |
|
473 |
combined_results = []
|
474 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
|
|
|
|
|
|
475 |
# Analyze each OCR result (line) individually
|
476 |
-
|
477 |
-
|
478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
479 |
|
480 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
481 |
child_level_key = list(ocr_results_with_children.keys())[i]
|
|
|
10 |
from typing import Optional, Tuple, Union
|
11 |
from copy import deepcopy
|
12 |
from tools.helper_functions import clean_unicode_text
|
13 |
+
from tools.aws_functions import comprehend_client
|
14 |
+
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
15 |
#import string # Import string to get a list of common punctuation characters
|
16 |
|
17 |
@dataclass
|
|
|
461 |
self,
|
462 |
line_level_ocr_results: List[OCRResult],
|
463 |
ocr_results_with_children: Dict[str, Dict],
|
464 |
+
chosen_redact_comprehend_entities:List[str],
|
465 |
+
pii_identification_method:str="Local",
|
466 |
**text_analyzer_kwargs
|
467 |
) -> List[CustomImageRecognizerResult]:
|
468 |
# Define English as default language, if not specified
|
|
|
476 |
|
477 |
combined_results = []
|
478 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
479 |
+
|
480 |
+
analyzer_result = []
|
481 |
+
|
482 |
# Analyze each OCR result (line) individually
|
483 |
+
|
484 |
+
if pii_identification_method == "Local":
|
485 |
+
analyzer_result = self.analyzer_engine.analyze(
|
486 |
+
text=line_level_ocr_result.text, **text_analyzer_kwargs
|
487 |
+
)
|
488 |
+
|
489 |
+
elif pii_identification_method == "AWS Comprehend":
|
490 |
+
|
491 |
+
# Call the detect_pii_entities method
|
492 |
+
response = comprehend_client.detect_pii_entities(
|
493 |
+
Text=line_level_ocr_result.text,
|
494 |
+
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
495 |
+
)
|
496 |
+
|
497 |
+
for result in response["Entities"]:
|
498 |
+
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
499 |
+
|
500 |
+
if result_text not in allow_list:
|
501 |
+
|
502 |
+
if result.get("Type") in chosen_redact_comprehend_entities:
|
503 |
+
|
504 |
+
recogniser_entity = recognizer_result_from_dict(result)
|
505 |
+
analyzer_result.append(recogniser_entity)
|
506 |
+
|
507 |
|
508 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
509 |
child_level_key = list(ocr_results_with_children.keys())[i]
|
tools/data_anonymise.py
CHANGED
@@ -23,6 +23,27 @@ fake = Faker("en_UK")
|
|
23 |
def fake_first_name(x):
|
24 |
return fake.first_name()
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
|
27 |
output = []
|
28 |
|
|
|
23 |
def fake_first_name(x):
|
24 |
return fake.first_name()
|
25 |
|
26 |
+
def initial_clean(text):
|
27 |
+
#### Some of my cleaning functions
|
28 |
+
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
29 |
+
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
30 |
+
non_ascii_pattern = r'[^\x00-\x7F]+'
|
31 |
+
multiple_spaces_regex = r'\s{2,}'
|
32 |
+
|
33 |
+
# Define a list of patterns and their replacements
|
34 |
+
patterns = [
|
35 |
+
(html_pattern_regex, ' '),
|
36 |
+
(html_start_pattern_end_dots_regex, ' '),
|
37 |
+
(non_ascii_pattern, ' '),
|
38 |
+
(multiple_spaces_regex, ' ')
|
39 |
+
]
|
40 |
+
|
41 |
+
# Apply each regex replacement
|
42 |
+
for pattern, replacement in patterns:
|
43 |
+
text = re.sub(pattern, replacement, text)
|
44 |
+
|
45 |
+
return text
|
46 |
+
|
47 |
def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
|
48 |
output = []
|
49 |
|
tools/file_redaction.py
CHANGED
@@ -24,13 +24,17 @@ import gradio as gr
|
|
24 |
from gradio import Progress
|
25 |
from collections import defaultdict # For efficient grouping
|
26 |
|
|
|
|
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
from tools.file_conversion import process_file
|
29 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
|
31 |
-
from tools.file_conversion import process_file, is_pdf,
|
32 |
-
from tools.data_anonymise import generate_decision_process_output
|
33 |
-
from tools.aws_textract import analyse_page_with_textract,
|
|
|
|
|
34 |
|
35 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
36 |
|
@@ -62,12 +66,12 @@ def sum_numbers_before_seconds(string:str):
|
|
62 |
|
63 |
return sum_of_numbers
|
64 |
|
65 |
-
|
66 |
def choose_and_run_redactor(file_paths:List[str],
|
67 |
prepared_pdf_file_paths:List[str],
|
68 |
prepared_pdf_image_paths:List[str],
|
69 |
language:str,
|
70 |
chosen_redact_entities:List[str],
|
|
|
71 |
in_redact_method:str,
|
72 |
in_allow_list:List[List[str]]=None,
|
73 |
latest_file_completed:int=0,
|
@@ -86,6 +90,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
86 |
pymupdf_doc=[],
|
87 |
current_loop_page:int=0,
|
88 |
page_break_return:bool=False,
|
|
|
89 |
progress=gr.Progress(track_tqdm=True)):
|
90 |
'''
|
91 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -94,7 +99,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
94 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
95 |
- prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
96 |
- language (str): The language of the text in the files.
|
97 |
-
- chosen_redact_entities (List[str]): A list of entity types to redact from the files.
|
|
|
98 |
- in_redact_method (str): The method to use for redaction.
|
99 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
100 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
@@ -113,6 +119,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
113 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
114 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
115 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
|
|
116 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
117 |
|
118 |
The function returns a redacted document along with processing logs.
|
@@ -121,12 +128,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
121 |
tic = time.perf_counter()
|
122 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
123 |
|
|
|
124 |
# If this is the first time around, set variables to 0/blank
|
125 |
if first_loop_state==True:
|
126 |
print("First_loop_state is True")
|
127 |
latest_file_completed = 0
|
128 |
current_loop_page = 0
|
129 |
-
#out_message = []
|
130 |
out_file_paths = []
|
131 |
estimate_total_processing_time = 0
|
132 |
estimated_time_taken_state = 0
|
@@ -136,10 +143,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
current_loop_page = 0
|
137 |
|
138 |
|
139 |
-
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
140 |
-
#if isinstance(out_message, str):
|
141 |
-
# out_message = [out_message]
|
142 |
-
|
143 |
if not out_file_paths:
|
144 |
out_file_paths = []
|
145 |
|
@@ -152,11 +155,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
152 |
else:
|
153 |
number_of_files = len(file_paths)
|
154 |
|
155 |
-
|
156 |
-
print("\nIn choose_and_run_redactor function, latest_file_completed is:", latest_file_completed)
|
157 |
-
print("current_loop_page is:", current_loop_page)
|
158 |
-
|
159 |
-
|
160 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
161 |
if latest_file_completed >= number_of_files:
|
162 |
|
@@ -242,7 +240,26 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
242 |
|
243 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
244 |
|
245 |
-
pymupdf_doc,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
# Save Textract request metadata (if exists)
|
248 |
if new_request_metadata:
|
@@ -260,7 +277,21 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
260 |
# Analyse text-based pdf
|
261 |
print('Redacting file as text-based PDF')
|
262 |
|
263 |
-
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
|
265 |
else:
|
266 |
out_message = "No redaction method selected"
|
@@ -287,27 +318,37 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
287 |
pymupdf_doc.save(out_image_file_path)
|
288 |
|
289 |
out_file_paths.append(out_image_file_path)
|
|
|
290 |
if logging_file_paths:
|
291 |
log_files_output_paths.extend(logging_file_paths)
|
292 |
|
293 |
-
#if isinstance(out_message, list):
|
294 |
-
# out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
295 |
-
|
296 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
297 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
298 |
-
log_files_output_paths.append(logs_output_file_name)
|
|
|
299 |
|
300 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
301 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
302 |
-
log_files_output_paths.append(all_text_output_file_name)
|
|
|
303 |
|
304 |
# Make a combined message for the file
|
305 |
if isinstance(out_message, list):
|
306 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
307 |
else: combined_out_message = out_message
|
308 |
|
|
|
|
|
|
|
|
|
309 |
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
310 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
# Increase latest file completed count unless we are at the last file
|
313 |
# if latest_file_completed != len(file_paths):
|
@@ -348,15 +389,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
348 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
349 |
else: combined_out_message = out_message
|
350 |
|
351 |
-
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
352 |
-
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
353 |
-
|
354 |
-
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
355 |
-
print("Estimated total processing time:", str(estimate_total_processing_time))
|
356 |
-
|
357 |
-
toc = time.perf_counter()
|
358 |
-
time_taken = toc - tic
|
359 |
-
estimated_time_taken_state = estimated_time_taken_state + time_taken
|
360 |
|
361 |
# If textract requests made, write to logging file
|
362 |
if all_request_metadata:
|
@@ -392,10 +424,6 @@ def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
|
392 |
rect_height = pymupdf_page.rect.height
|
393 |
rect_width = pymupdf_page.rect.width
|
394 |
|
395 |
-
# Calculate scaling factors
|
396 |
-
#scale_height = rect_height / mediabox_height if mediabox_height else 1
|
397 |
-
#scale_width = rect_width / mediabox_width if mediabox_width else 1
|
398 |
-
|
399 |
# Adjust coordinates based on scaling factors
|
400 |
page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
|
401 |
page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
|
@@ -504,16 +532,13 @@ def move_page_info(file_path: str) -> str:
|
|
504 |
|
505 |
return new_file_path
|
506 |
|
507 |
-
def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None)
|
508 |
|
509 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
510 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
511 |
rect_height = page.rect.height
|
512 |
rect_width = page.rect.width
|
513 |
|
514 |
-
#print("page_rect_height:", page.rect.height)
|
515 |
-
#print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
|
516 |
-
|
517 |
out_annotation_boxes = {}
|
518 |
all_image_annotation_boxes = []
|
519 |
image_path = ""
|
@@ -525,16 +550,11 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
525 |
image_path = image
|
526 |
image = Image.open(image_path)
|
527 |
|
528 |
-
#print("annotations_on_page:", annotations_on_page)
|
529 |
-
|
530 |
# Check if this is an object used in the Gradio Annotation component
|
531 |
if isinstance (annotations_on_page, dict):
|
532 |
annotations_on_page = annotations_on_page["boxes"]
|
533 |
-
#print("annotations on page:", annotations_on_page)
|
534 |
|
535 |
for annot in annotations_on_page:
|
536 |
-
#print("annot:", annot)
|
537 |
-
|
538 |
# Check if an Image recogniser result, or a Gradio annotation object
|
539 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
540 |
|
@@ -600,7 +620,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
600 |
rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
|
601 |
|
602 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
603 |
-
#print("rect_single_pixel_height:", rect_single_pixel_height)
|
604 |
page.add_redact_annot(rect_single_pixel_height)
|
605 |
|
606 |
# Set up drawing a black box over the whole rect
|
@@ -614,14 +633,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
614 |
"boxes": all_image_annotation_boxes
|
615 |
}
|
616 |
|
617 |
-
#print("out_annotation_boxes:", out_annotation_boxes)
|
618 |
-
|
619 |
page.apply_redactions(images=0, graphics=0)
|
620 |
page.clean_contents()
|
621 |
|
622 |
-
#print("Everything is fine at end of redact_page_with_pymupdf")
|
623 |
-
#print("\nout_annotation_boxes:", out_annotation_boxes)
|
624 |
-
|
625 |
return page, out_annotation_boxes
|
626 |
|
627 |
def bounding_boxes_overlap(box1, box2):
|
@@ -668,10 +682,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
668 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
669 |
|
670 |
# Calculate new dimensions for the merged box
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
reconstructed_bbox = CustomImageRecognizerResult(
|
676 |
bbox.entity_type,
|
677 |
bbox.start,
|
@@ -740,7 +750,29 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
740 |
|
741 |
return merged_bboxes
|
742 |
|
743 |
-
def redact_image_pdf(file_path:str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
|
745 |
'''
|
746 |
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
@@ -749,6 +781,7 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
749 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
750 |
- language (str): The language of the text in the PDF.
|
751 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
|
|
|
752 |
- allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
|
753 |
- is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
|
754 |
- page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
|
@@ -756,13 +789,19 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
756 |
- analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
|
757 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
|
758 |
- request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
|
759 |
-
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
760 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
|
|
|
|
|
|
|
|
|
|
|
|
761 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
762 |
-
-
|
|
|
763 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
764 |
|
765 |
-
The function returns a redacted PDF document.
|
766 |
'''
|
767 |
file_name = get_file_path_end(file_path)
|
768 |
fill = (0, 0, 0) # Fill colour
|
@@ -901,20 +940,31 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
|
|
901 |
|
902 |
# Step 2: Analyze text and identify PII
|
903 |
if chosen_redact_entities:
|
|
|
|
|
|
|
904 |
redaction_bboxes = image_analyser.analyze_text(
|
905 |
line_level_ocr_results,
|
906 |
line_level_ocr_results_with_children,
|
|
|
|
|
907 |
language=language,
|
908 |
entities=chosen_redact_entities,
|
909 |
allow_list=allow_list,
|
910 |
-
score_threshold=score_threshold
|
911 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
912 |
else:
|
913 |
redaction_bboxes = []
|
914 |
-
|
915 |
-
#print("\nsignature_recogniser_boxes:", signature_recogniser_results)
|
916 |
-
#print("\nhandwriting_recogniser_boxes:", handwriting_recogniser_results)
|
917 |
-
#print("\nredaction_bboxes:", redaction_bboxes)
|
918 |
|
919 |
if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
920 |
elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
@@ -1049,53 +1099,6 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1049 |
return characters
|
1050 |
return []
|
1051 |
|
1052 |
-
|
1053 |
-
def initial_clean(text):
|
1054 |
-
#### Some of my cleaning functions
|
1055 |
-
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
1056 |
-
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
1057 |
-
non_ascii_pattern = r'[^\x00-\x7F]+'
|
1058 |
-
multiple_spaces_regex = r'\s{2,}'
|
1059 |
-
|
1060 |
-
# Define a list of patterns and their replacements
|
1061 |
-
patterns = [
|
1062 |
-
(html_pattern_regex, ' '),
|
1063 |
-
(html_start_pattern_end_dots_regex, ' '),
|
1064 |
-
(non_ascii_pattern, ' '),
|
1065 |
-
(multiple_spaces_regex, ' ')
|
1066 |
-
]
|
1067 |
-
|
1068 |
-
# Apply each regex replacement
|
1069 |
-
for pattern, replacement in patterns:
|
1070 |
-
text = re.sub(pattern, replacement, text)
|
1071 |
-
|
1072 |
-
return text
|
1073 |
-
|
1074 |
-
|
1075 |
-
def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
|
1076 |
-
'''
|
1077 |
-
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
|
1078 |
-
'''
|
1079 |
-
|
1080 |
-
analyser_results = []
|
1081 |
-
|
1082 |
-
#text_to_analyse = initial_clean(text_container.text).strip()
|
1083 |
-
|
1084 |
-
text_to_analyse = initial_clean(text_container.text)
|
1085 |
-
|
1086 |
-
if chosen_redact_entities:
|
1087 |
-
#print("Running Presidio analyze method. text_to_analyse:", text_to_analyse)
|
1088 |
-
|
1089 |
-
analyser_results = nlp_analyser.analyze(text=text_to_analyse,
|
1090 |
-
language=language,
|
1091 |
-
entities=chosen_redact_entities,
|
1092 |
-
score_threshold=score_threshold,
|
1093 |
-
return_decision_process=True,
|
1094 |
-
allow_list=allow_list)
|
1095 |
-
|
1096 |
-
return analyser_results
|
1097 |
-
|
1098 |
-
|
1099 |
def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1100 |
'''
|
1101 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
@@ -1292,6 +1295,53 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1292 |
|
1293 |
return analysed_bounding_boxes
|
1294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1295 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1296 |
decision_process_table = pd.DataFrame()
|
1297 |
|
@@ -1335,6 +1385,7 @@ def redact_text_pdf(
|
|
1335 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
1336 |
language: str, # Language of the PDF content
|
1337 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
|
|
1338 |
allow_list: List[str] = None, # Optional list of allowed entities
|
1339 |
page_min: int = 0, # Minimum page number to start redaction
|
1340 |
page_max: int = 999, # Maximum page number to end redaction
|
@@ -1345,11 +1396,12 @@ def redact_text_pdf(
|
|
1345 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1346 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1347 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
|
|
1348 |
page_break_val: int = int(page_break_value), # Value for page break
|
1349 |
-
max_time: int = int(max_time_value),
|
1350 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
1351 |
):
|
1352 |
-
|
1353 |
'''
|
1354 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1355 |
|
@@ -1358,19 +1410,20 @@ def redact_text_pdf(
|
|
1358 |
- prepared_pdf_image_path: Path to the prepared PDF image for redaction
|
1359 |
- language: Language of the PDF content
|
1360 |
- chosen_redact_entities: List of entities to be redacted
|
|
|
1361 |
- allow_list: Optional list of allowed entities
|
1362 |
- page_min: Minimum page number to start redaction
|
1363 |
- page_max: Maximum page number to end redaction
|
1364 |
- analysis_type: Type of analysis to perform
|
1365 |
- current_loop_page: Current page being processed in the loop
|
1366 |
- page_break_return: Flag to indicate if a page break should be returned
|
1367 |
-
- images: List of images (not used in this function)
|
1368 |
- annotations_all_pages: List of annotations across all pages
|
1369 |
- all_line_level_ocr_results_df: DataFrame for OCR results
|
1370 |
- all_decision_process_table: DataFrame for decision process table
|
1371 |
- pymupdf_doc: List of PyMuPDF documents
|
|
|
1372 |
- page_break_val: Value for page break
|
1373 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1374 |
- progress: Progress tracking object
|
1375 |
'''
|
1376 |
|
@@ -1393,7 +1446,6 @@ def redact_text_pdf(
|
|
1393 |
if current_loop_page == 0: page_loop_start = 0
|
1394 |
else: page_loop_start = current_loop_page
|
1395 |
|
1396 |
-
#progress_bar = progress.tqdm(range(current_loop_page, number_of_pages), unit="pages", desc="Redacting pages")
|
1397 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1398 |
|
1399 |
#for page_no in range(0, number_of_pages):
|
@@ -1414,14 +1466,8 @@ def redact_text_pdf(
|
|
1414 |
image_annotations = {"image": image, "boxes": []}
|
1415 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1416 |
|
1417 |
-
#print("pymupdf page loaded")
|
1418 |
-
|
1419 |
-
#print("Page number is:", str(page_no + 1))
|
1420 |
-
|
1421 |
if page_min <= page_no < page_max:
|
1422 |
|
1423 |
-
#print("Page is in range of pages to redact")
|
1424 |
-
|
1425 |
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
1426 |
|
1427 |
page_analyser_results = []
|
@@ -1465,7 +1511,16 @@ def redact_text_pdf(
|
|
1465 |
text_line_analyser_result = []
|
1466 |
text_line_bounding_boxes = []
|
1467 |
|
1468 |
-
text_line_analyser_result =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1469 |
|
1470 |
# Merge bounding boxes for the line if multiple found close together
|
1471 |
if text_line_analyser_result:
|
|
|
24 |
from gradio import Progress
|
25 |
from collections import defaultdict # For efficient grouping
|
26 |
|
27 |
+
from presidio_analyzer import RecognizerResult
|
28 |
+
|
29 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
30 |
from tools.file_conversion import process_file
|
31 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
32 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
|
33 |
+
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
34 |
+
# from tools.data_anonymise import generate_decision_process_output
|
35 |
+
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
36 |
+
from tools.aws_functions import comprehend_client
|
37 |
+
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
38 |
|
39 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
40 |
|
|
|
66 |
|
67 |
return sum_of_numbers
|
68 |
|
|
|
69 |
def choose_and_run_redactor(file_paths:List[str],
|
70 |
prepared_pdf_file_paths:List[str],
|
71 |
prepared_pdf_image_paths:List[str],
|
72 |
language:str,
|
73 |
chosen_redact_entities:List[str],
|
74 |
+
chosen_redact_comprehend_entities:List[str],
|
75 |
in_redact_method:str,
|
76 |
in_allow_list:List[List[str]]=None,
|
77 |
latest_file_completed:int=0,
|
|
|
90 |
pymupdf_doc=[],
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
+
pii_identification_method:str="Local",
|
94 |
progress=gr.Progress(track_tqdm=True)):
|
95 |
'''
|
96 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
99 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
100 |
- prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
101 |
- language (str): The language of the text in the files.
|
102 |
+
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
103 |
+
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
104 |
- in_redact_method (str): The method to use for redaction.
|
105 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
106 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
|
|
119 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
120 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
121 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
122 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
123 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
124 |
|
125 |
The function returns a redacted document along with processing logs.
|
|
|
128 |
tic = time.perf_counter()
|
129 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
130 |
|
131 |
+
|
132 |
# If this is the first time around, set variables to 0/blank
|
133 |
if first_loop_state==True:
|
134 |
print("First_loop_state is True")
|
135 |
latest_file_completed = 0
|
136 |
current_loop_page = 0
|
|
|
137 |
out_file_paths = []
|
138 |
estimate_total_processing_time = 0
|
139 |
estimated_time_taken_state = 0
|
|
|
143 |
current_loop_page = 0
|
144 |
|
145 |
|
|
|
|
|
|
|
|
|
146 |
if not out_file_paths:
|
147 |
out_file_paths = []
|
148 |
|
|
|
155 |
else:
|
156 |
number_of_files = len(file_paths)
|
157 |
|
|
|
|
|
|
|
|
|
|
|
158 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
159 |
if latest_file_completed >= number_of_files:
|
160 |
|
|
|
240 |
|
241 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
242 |
|
243 |
+
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
|
244 |
+
prepared_pdf_image_paths,
|
245 |
+
language,
|
246 |
+
chosen_redact_entities,
|
247 |
+
chosen_redact_comprehend_entities,
|
248 |
+
in_allow_list_flat,
|
249 |
+
is_a_pdf,
|
250 |
+
page_min,
|
251 |
+
page_max,
|
252 |
+
in_redact_method,
|
253 |
+
handwrite_signature_checkbox,
|
254 |
+
"",
|
255 |
+
current_loop_page,
|
256 |
+
page_break_return,
|
257 |
+
prepared_pdf_image_paths,
|
258 |
+
annotations_all_pages,
|
259 |
+
all_line_level_ocr_results_df,
|
260 |
+
all_decision_process_table,
|
261 |
+
pymupdf_doc,
|
262 |
+
pii_identification_method)
|
263 |
|
264 |
# Save Textract request metadata (if exists)
|
265 |
if new_request_metadata:
|
|
|
277 |
# Analyse text-based pdf
|
278 |
print('Redacting file as text-based PDF')
|
279 |
|
280 |
+
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
281 |
+
prepared_pdf_image_paths,language,
|
282 |
+
chosen_redact_entities,
|
283 |
+
chosen_redact_comprehend_entities,
|
284 |
+
in_allow_list_flat,
|
285 |
+
page_min,
|
286 |
+
page_max,
|
287 |
+
"Simple text analysis - PDFs with selectable text",
|
288 |
+
current_loop_page,
|
289 |
+
page_break_return,
|
290 |
+
annotations_all_pages,
|
291 |
+
all_line_level_ocr_results_df,
|
292 |
+
all_decision_process_table,
|
293 |
+
pymupdf_doc,
|
294 |
+
pii_identification_method)
|
295 |
|
296 |
else:
|
297 |
out_message = "No redaction method selected"
|
|
|
318 |
pymupdf_doc.save(out_image_file_path)
|
319 |
|
320 |
out_file_paths.append(out_image_file_path)
|
321 |
+
|
322 |
if logging_file_paths:
|
323 |
log_files_output_paths.extend(logging_file_paths)
|
324 |
|
|
|
|
|
|
|
325 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
326 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
327 |
+
#log_files_output_paths.append(logs_output_file_name)
|
328 |
+
out_file_paths.append(logs_output_file_name)
|
329 |
|
330 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
331 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
332 |
+
#log_files_output_paths.append(all_text_output_file_name)
|
333 |
+
out_file_paths.append(all_text_output_file_name)
|
334 |
|
335 |
# Make a combined message for the file
|
336 |
if isinstance(out_message, list):
|
337 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
338 |
else: combined_out_message = out_message
|
339 |
|
340 |
+
toc = time.perf_counter()
|
341 |
+
time_taken = toc - tic
|
342 |
+
estimated_time_taken_state = estimated_time_taken_state + time_taken
|
343 |
+
|
344 |
out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
345 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
346 |
+
|
347 |
+
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
348 |
+
print("Estimated total processing time:", str(estimate_total_processing_time))
|
349 |
+
|
350 |
+
#out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
|
351 |
+
#combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
352 |
|
353 |
# Increase latest file completed count unless we are at the last file
|
354 |
# if latest_file_completed != len(file_paths):
|
|
|
389 |
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
390 |
else: combined_out_message = out_message
|
391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
# If textract requests made, write to logging file
|
394 |
if all_request_metadata:
|
|
|
424 |
rect_height = pymupdf_page.rect.height
|
425 |
rect_width = pymupdf_page.rect.width
|
426 |
|
|
|
|
|
|
|
|
|
427 |
# Adjust coordinates based on scaling factors
|
428 |
page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
|
429 |
page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
|
|
|
532 |
|
533 |
return new_file_path
|
534 |
|
535 |
+
def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
536 |
|
537 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
538 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
539 |
rect_height = page.rect.height
|
540 |
rect_width = page.rect.width
|
541 |
|
|
|
|
|
|
|
542 |
out_annotation_boxes = {}
|
543 |
all_image_annotation_boxes = []
|
544 |
image_path = ""
|
|
|
550 |
image_path = image
|
551 |
image = Image.open(image_path)
|
552 |
|
|
|
|
|
553 |
# Check if this is an object used in the Gradio Annotation component
|
554 |
if isinstance (annotations_on_page, dict):
|
555 |
annotations_on_page = annotations_on_page["boxes"]
|
|
|
556 |
|
557 |
for annot in annotations_on_page:
|
|
|
|
|
558 |
# Check if an Image recogniser result, or a Gradio annotation object
|
559 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
560 |
|
|
|
620 |
rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
|
621 |
|
622 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
|
|
623 |
page.add_redact_annot(rect_single_pixel_height)
|
624 |
|
625 |
# Set up drawing a black box over the whole rect
|
|
|
633 |
"boxes": all_image_annotation_boxes
|
634 |
}
|
635 |
|
|
|
|
|
636 |
page.apply_redactions(images=0, graphics=0)
|
637 |
page.clean_contents()
|
638 |
|
|
|
|
|
|
|
639 |
return page, out_annotation_boxes
|
640 |
|
641 |
def bounding_boxes_overlap(box1, box2):
|
|
|
682 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
683 |
|
684 |
# Calculate new dimensions for the merged box
|
|
|
|
|
|
|
|
|
685 |
reconstructed_bbox = CustomImageRecognizerResult(
|
686 |
bbox.entity_type,
|
687 |
bbox.start,
|
|
|
750 |
|
751 |
return merged_bboxes
|
752 |
|
753 |
+
def redact_image_pdf(file_path:str,
|
754 |
+
prepared_pdf_file_paths:List[str],
|
755 |
+
language:str,
|
756 |
+
chosen_redact_entities:List[str],
|
757 |
+
chosen_redact_comprehend_entities:List[str],
|
758 |
+
allow_list:List[str]=None,
|
759 |
+
is_a_pdf:bool=True,
|
760 |
+
page_min:int=0,
|
761 |
+
page_max:int=999,
|
762 |
+
analysis_type:str="Quick image analysis - typed text",
|
763 |
+
handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
|
764 |
+
request_metadata:str="", current_loop_page:int=0,
|
765 |
+
page_break_return:bool=False,
|
766 |
+
images=[],
|
767 |
+
annotations_all_pages:List=[],
|
768 |
+
all_line_level_ocr_results_df = pd.DataFrame(),
|
769 |
+
all_decision_process_table = pd.DataFrame(),
|
770 |
+
pymupdf_doc = [],
|
771 |
+
pii_identification_method:str="Local",
|
772 |
+
page_break_val:int=int(page_break_value),
|
773 |
+
logging_file_paths:List=[],
|
774 |
+
max_time:int=int(max_time_value),
|
775 |
+
progress=Progress(track_tqdm=True)):
|
776 |
|
777 |
'''
|
778 |
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
|
|
781 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
782 |
- language (str): The language of the text in the PDF.
|
783 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
|
784 |
+
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from the list allowed by the AWS Comprehend service.
|
785 |
- allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
|
786 |
- is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
|
787 |
- page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
|
|
|
789 |
- analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
|
790 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
|
791 |
- request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
|
|
|
792 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
793 |
+
- images (list, optional): List of image objects for each PDF page.
|
794 |
+
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
795 |
+
- all_line_level_ocr_results_df (pd.DataFrame(), optional): All line level OCR results for the document as a Pandas dataframe,
|
796 |
+
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
797 |
+
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
798 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
799 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
800 |
+
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
801 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
802 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
803 |
|
804 |
+
The function returns a fully or partially-redacted PDF document.
|
805 |
'''
|
806 |
file_name = get_file_path_end(file_path)
|
807 |
fill = (0, 0, 0) # Fill colour
|
|
|
940 |
|
941 |
# Step 2: Analyze text and identify PII
|
942 |
if chosen_redact_entities:
|
943 |
+
|
944 |
+
pii_identification_method= "AWS Comprehend" #"Local"
|
945 |
+
|
946 |
redaction_bboxes = image_analyser.analyze_text(
|
947 |
line_level_ocr_results,
|
948 |
line_level_ocr_results_with_children,
|
949 |
+
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
950 |
+
pii_identification_method = pii_identification_method,
|
951 |
language=language,
|
952 |
entities=chosen_redact_entities,
|
953 |
allow_list=allow_list,
|
954 |
+
score_threshold=score_threshold
|
955 |
+
)
|
956 |
+
|
957 |
+
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
958 |
+
# line_level_ocr_results_with_children,
|
959 |
+
# language,
|
960 |
+
# chosen_redact_entities,
|
961 |
+
# allow_list,
|
962 |
+
# score_threshold,
|
963 |
+
# pii_identification_method)
|
964 |
+
|
965 |
else:
|
966 |
redaction_bboxes = []
|
967 |
+
|
|
|
|
|
|
|
968 |
|
969 |
if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
970 |
elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
|
|
1099 |
return characters
|
1100 |
return []
|
1101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1102 |
def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1103 |
'''
|
1104 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
|
|
1295 |
|
1296 |
return analysed_bounding_boxes
|
1297 |
|
1298 |
+
def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
    '''
    Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.

    Parameters:
    - text_container: OCRResult whose .text attribute is analysed
    - language: language code passed to the analyser / Comprehend (e.g. "en")
    - chosen_redact_entities: entity types to detect with the local (Presidio/spacy) analyser
    - chosen_redact_comprehend_entities: entity types to keep from AWS Comprehend results
    - score_threshold: minimum confidence score for local analyser matches
    - allow_list: terms that must never be flagged as PII (may be None/empty)
    - pii_identification_method: "Local" (Presidio/spacy) or "AWS Comprehend"

    Returns a list of RecognizerResult objects (empty if no entities chosen,
    an unknown method is given, or nothing is found).
    '''

    analyser_results = []

    #text_to_analyse = initial_clean(text_container.text).strip()

    text_to_analyse = text_container.text

    if chosen_redact_entities:
        if pii_identification_method == "Local":
            analyser_results = nlp_analyser.analyze(text=text_to_analyse,
                                                    language=language,
                                                    entities=chosen_redact_entities,
                                                    score_threshold=score_threshold,
                                                    return_decision_process=True,
                                                    allow_list=allow_list)

        elif pii_identification_method == "AWS Comprehend":

            # Call the detect_pii_entities method
            response = comprehend_client.detect_pii_entities(
                Text=text_to_analyse,
                LanguageCode=language # Specify the language of the text
            )

            # Guard against allow_list being None (callers may pass no allow list)
            safe_allow_list = allow_list if allow_list else []

            for result in response["Entities"]:

                # Comprehend's EndOffset is exclusive (the character immediately
                # after the entity), so slice without +1 to get the exact match text
                result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]]

                if result_text not in safe_allow_list:
                    if result.get("Type") in chosen_redact_comprehend_entities:

                        recogniser_entity = recognizer_result_from_dict(result)

                        analyser_results.append(recogniser_entity)

        else:
            # Unknown pii_identification_method - return no results
            analyser_results = []
    else:
        analyser_results = []


    return analyser_results
|
1344 |
+
|
1345 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1346 |
decision_process_table = pd.DataFrame()
|
1347 |
|
|
|
1385 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
1386 |
language: str, # Language of the PDF content
|
1387 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
1388 |
+
chosen_redact_comprehend_entities: List[str],
|
1389 |
allow_list: List[str] = None, # Optional list of allowed entities
|
1390 |
page_min: int = 0, # Minimum page number to start redaction
|
1391 |
page_max: int = 999, # Maximum page number to end redaction
|
|
|
1396 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1397 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1398 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1399 |
+
pii_identification_method: str = "Local",
|
1400 |
page_break_val: int = int(page_break_value), # Value for page break
|
1401 |
+
max_time: int = int(max_time_value),
|
1402 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
1403 |
):
|
1404 |
+
|
1405 |
'''
|
1406 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1407 |
|
|
|
1410 |
- prepared_pdf_image_path: Path to the prepared PDF image for redaction
|
1411 |
- language: Language of the PDF content
|
1412 |
- chosen_redact_entities: List of entities to be redacted
|
1413 |
+
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
1414 |
- allow_list: Optional list of allowed entities
|
1415 |
- page_min: Minimum page number to start redaction
|
1416 |
- page_max: Maximum page number to end redaction
|
1417 |
- analysis_type: Type of analysis to perform
|
1418 |
- current_loop_page: Current page being processed in the loop
|
1419 |
- page_break_return: Flag to indicate if a page break should be returned
|
|
|
1420 |
- annotations_all_pages: List of annotations across all pages
|
1421 |
- all_line_level_ocr_results_df: DataFrame for OCR results
|
1422 |
- all_decision_process_table: DataFrame for decision process table
|
1423 |
- pymupdf_doc: List of PyMuPDF documents
|
1424 |
+
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1425 |
- page_break_val: Value for page break
|
1426 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1427 |
- progress: Progress tracking object
|
1428 |
'''
|
1429 |
|
|
|
1446 |
if current_loop_page == 0: page_loop_start = 0
|
1447 |
else: page_loop_start = current_loop_page
|
1448 |
|
|
|
1449 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1450 |
|
1451 |
#for page_no in range(0, number_of_pages):
|
|
|
1466 |
image_annotations = {"image": image, "boxes": []}
|
1467 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1468 |
|
|
|
|
|
|
|
|
|
1469 |
if page_min <= page_no < page_max:
|
1470 |
|
|
|
|
|
1471 |
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
1472 |
|
1473 |
page_analyser_results = []
|
|
|
1511 |
text_line_analyser_result = []
|
1512 |
text_line_bounding_boxes = []
|
1513 |
|
1514 |
+
# text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
|
1515 |
+
|
1516 |
+
#pii_identification_method="AWS Comprehend"#"Local"
|
1517 |
+
|
1518 |
+
if chosen_redact_entities:
|
1519 |
+
|
1520 |
+
text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
1521 |
+
|
1522 |
+
else:
|
1523 |
+
text_line_analyser_result = []
|
1524 |
|
1525 |
# Merge bounding boxes for the line if multiple found close together
|
1526 |
if text_line_analyser_result:
|
tools/presidio_analyzer_custom.py
CHANGED
@@ -5,7 +5,28 @@ from tqdm import tqdm
|
|
5 |
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def analyze_iterator_custom(
|
11 |
self,
|
|
|
5 |
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
+
def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
    """
    Create a RecognizerResult from an AWS Comprehend PII entity dictionary.

    :param data: a single entity from Comprehend's detect_pii_entities
        response, e.g. {
        "Type": "NAME",
        "BeginOffset": 24,
        "EndOffset": 32,
        "Score": 0.8
    }
    :return: RecognizerResult with no analysis explanation or recognition
        metadata attached. Missing keys yield None fields rather than raising.
    """

    # Map Comprehend's key names onto RecognizerResult's positional arguments;
    # .get() is used so a malformed entity produces None fields, not a KeyError
    entity_type = data.get("Type")
    start = data.get("BeginOffset")
    end = data.get("EndOffset")
    score = data.get("Score")
    analysis_explanation = None
    recognition_metadata = None

    return RecognizerResult(entity_type, start, end, score, analysis_explanation, recognition_metadata)
|
30 |
|
31 |
def analyze_iterator_custom(
|
32 |
self,
|