seanpedrickcase committed
Commit 6ea0852 · 1 Parent(s): e9c4101

Improved allow list, handwriting/signature identification, logging

app.py CHANGED
@@ -4,7 +4,7 @@ import socket
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
 from tools.aws_functions import upload_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
@@ -12,6 +12,7 @@ from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
+import pandas as pd
 
 from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
@@ -44,6 +45,8 @@ with app:
     first_loop_state = gr.State(True)
     second_loop_state = gr.State(False)
 
+    in_allow_list_state = gr.State(pd.DataFrame())
+
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
 
@@ -69,8 +72,8 @@ with app:
     with gr.Tab("PDFs/images"):
 
         with gr.Accordion("Redact document", open = True):
-            in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png'])
-            redact_btn = gr.Button("Redact document(s)", variant="primary")
+            in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
+            document_redact_btn = gr.Button("Redact document(s)", variant="primary")
 
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
@@ -128,6 +131,8 @@ with app:
             with gr.Row():
                 page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
                 page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
+            with gr.Row():
+                handwrite_signature_checkbox = gr.CheckboxGroup(choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
@@ -135,11 +140,16 @@ with app:
             in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
             with gr.Row():
                 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
-                in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+                #in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+            with gr.Row():
+                in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
+                gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
+            in_allow_list_text = gr.Textbox(label="Custom allow list load status")
             log_files_output = gr.File(label="Log file output", interactive=False)
 
-            # Invisible text box to hold the session hash/username just for logging purposes
-            session_hash_textbox = gr.Textbox(value="", visible=False)
+            # Invisible text box to hold the session hash/username and Textract request metadata just for logging purposes
+            session_hash_textbox = gr.Textbox(value="", visible=False)
+            textract_metadata_textbox = gr.Textbox(value="", visible=False)
 
     # AWS options - placeholder for possibility of storing data on s3
     # with gr.Tab(label="Advanced options"):
@@ -153,16 +163,19 @@ with app:
 
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
+
+    # If a custom allow list is uploaded
+    in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number],
-    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number], api_name="redact_doc")
+    document_redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
+    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
 
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number],
-    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
+    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
    then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
    # Tabular data redaction
@@ -197,8 +210,8 @@ with app:
 
    # Log processing time/token usage when making a query
    usage_callback = gr.CSVLogger()
-   usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number], usage_logs_folder)
-   estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number], None, preprocess=False).\
+   usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
+   estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
    then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # Launch the Gradio app
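
Note on the new allow-list flow: the in-app dataframe is replaced by a CSV upload (one term per row, first column, no header, case sensitive), held in in_allow_list_state as a pandas DataFrame. A minimal sketch of preparing such a file and of the flattening the redactor later applies (pandas assumed; allow_list.csv is a hypothetical file name):

import pandas as pd

# One allow-listed term per row, single unnamed column, case sensitive
pd.DataFrame(["Lambeth", "Lambeth 2030"]).to_csv("allow_list.csv", index=False, header=False)

# custom_regex_load reads it back with header=None, so the terms sit in column 0
allow_list_df = pd.read_csv("allow_list.csv", header=None)
allow_list_flat = allow_list_df[0].tolist()  # ['Lambeth', 'Lambeth 2030']
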
tools/aws_functions.py CHANGED
@@ -14,6 +14,10 @@ aws_var_default = "0"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
 
 
 
 
17
  if aws_var_val == "1":
18
  try:
19
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
@@ -22,7 +26,8 @@ if aws_var_val == "1":
22
  print(e)
23
 
24
  def get_assumed_role_info():
25
- sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
 
26
  response = sts.get_caller_identity()
27
 
28
  # Extract ARN of the assumed role
 
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
17
+ # Launch the Gradio app
18
+ AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
19
+ print(f'The value of AWS_REGION is {AWS_REGION}')
20
+
21
  if aws_var_val == "1":
22
  try:
23
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
 
26
  print(e)
27
 
28
  def get_assumed_role_info():
29
+ sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
30
+ sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
31
  response = sts.get_caller_identity()
32
 
33
  # Extract ARN of the assumed role
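
The STS endpoint is now derived from a configurable region instead of a hard-coded eu-west-2. A standalone sketch of the same pattern, including the ARN extraction this hunk truncates (boto3 assumed; the 'Arn' and 'Account' keys are standard in get_caller_identity responses):

import boto3

AWS_REGION = 'eu-west-2'  # normally read via get_or_create_env_var('AWS_REGION', ...)

def get_assumed_role_info():
    # Build the regional STS endpoint rather than hard-coding one region
    sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
    sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
    response = sts.get_caller_identity()
    return response['Arn'], response['Account']
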
tools/aws_textract.py CHANGED
@@ -7,6 +7,22 @@ import pikepdf
 from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 
+def extract_textract_metadata(response):
+    """Extracts metadata from an AWS Textract response."""
+
+    print("Document metadata:", response['DocumentMetadata'])
+
+    request_id = response['ResponseMetadata']['RequestId']
+    pages = response['DocumentMetadata']['Pages']
+    #number_of_pages = response['DocumentMetadata']['NumberOfPages']
+
+    return str({
+        'RequestId': request_id,
+        'Pages': pages
+        #,
+        #'NumberOfPages': number_of_pages
+    })
+
 def analyse_page_with_textract(pdf_page_bytes, json_file_path):
     '''
     Analyse page with AWS Textract
@@ -27,7 +43,8 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
     #response = client.detect_document_text(Document={'Bytes': image_bytes})
     response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
-    text_blocks = response['Blocks']
+    text_blocks = response['Blocks']
+    request_metadata = extract_textract_metadata(response)
 
     # Write the response to a JSON file
     with open(json_file_path, 'w') as json_file:
@@ -35,7 +52,7 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
 
     print("Response has been written to output:", json_file_path)
 
-    return text_blocks
+    return text_blocks, request_metadata
 
 
 def convert_pike_pdf_page_to_bytes(pdf, page_num):
@@ -66,10 +83,12 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
 
 def json_to_ocrresult(json_data, page_width, page_height):
     '''
-    Convert the json response from textract to the OCRResult format used elsewhere in the code.
+    Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
     '''
     all_ocr_results = []
     signature_or_handwriting_recogniser_results = []
+    signature_recogniser_results = []
+    handwriting_recogniser_results = []
     signatures = []
     handwriting = []
 
@@ -78,30 +97,40 @@ def json_to_ocrresult(json_data, page_width, page_height):
         is_signature = False
         is_handwriting = False
 
-        if (text_block['BlockType'] == 'WORD') | (text_block['BlockType'] == 'LINE'):
-            text = text_block['Text']
-
-            # Extract BoundingBox details
-            bbox = text_block["Geometry"]["BoundingBox"]
-            left = bbox["Left"]
-            top = bbox["Top"]
-            width = bbox["Width"]
-            height = bbox["Height"]
-
-            # Convert proportional coordinates to absolute coordinates
-            left_abs = int(left * page_width)
-            top_abs = int(top * page_height)
-            width_abs = int(width * page_width)
-            height_abs = int(height * page_height)
-
-            # Create OCRResult with absolute coordinates
-            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)
-
-            # If handwriting or signature, add to bounding box
-            confidence = text_block['Confidence']
-
-            if 'TextType' in text_block:
-                text_type = text_block["TextType"]
+        if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
+
+            if (text_block['BlockType'] == 'LINE'):
+
+                # If a line, pull out the text type and confidence from the child words and get text, bounding box
+
+                if 'Text' in text_block:
+                    text = text_block['Text']
+
+                if 'Relationships' in text_block:
+                    for relationship in text_block['Relationships']:
+                        if relationship['Type'] == 'CHILD':
+                            for child_id in relationship['Ids']:
+                                child_block = next((block for block in json_data if block['Id'] == child_id), None)
+                                if child_block and 'TextType' in child_block:
+                                    text_type = child_block['TextType']
+                                    confidence = text_block['Confidence']
+                                    break
+                            break
+
+                # Extract BoundingBox details
+                bbox = text_block["Geometry"]["BoundingBox"]
+                left = bbox["Left"]
+                top = bbox["Top"]
+                width = bbox["Width"]
+                height = bbox["Height"]
+
+                # Convert proportional coordinates to absolute coordinates
+                left_abs = int(left * page_width)
+                top_abs = int(top * page_height)
+                width_abs = int(width * page_width)
+                height_abs = int(height * page_height)
+
+                # If handwriting or signature, add to bounding box
 
                 if text_type == "HANDWRITING":
                     is_handwriting = True
@@ -110,42 +139,43 @@
                     recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
                     handwriting.append(recogniser_result)
                     print("Handwriting found:", handwriting[-1])
-
-            all_ocr_results.append(ocr_result)
 
-        elif (text_block['BlockType'] == 'SIGNATURE'):
-            text = "SIGNATURE"
-
-            # Extract BoundingBox details
-            bbox = text_block["Geometry"]["BoundingBox"]
-            left = bbox["Left"]
-            top = bbox["Top"]
-            width = bbox["Width"]
-            height = bbox["Height"]
-
-            # Convert proportional coordinates to absolute coordinates
-            left_abs = int(left * page_width)
-            top_abs = int(top * page_height)
-            width_abs = int(width * page_width)
-            height_abs = int(height * page_height)
-
-            # Create OCRResult with absolute coordinates
-            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)
-
-            is_signature = True
-            entity_name = "Signature"
-            word_end = len(entity_name)
-            recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
-            signatures.append(recogniser_result)
-            print("Signature found:", signatures[-1])
+            elif (text_block['BlockType'] == 'SIGNATURE'):
+                text = "SIGNATURE"
+
+                is_signature = True
+                entity_name = "SIGNATURE"
+                confidence = text_block['Confidence']
+                word_end = len(entity_name)
+
+                # Extract BoundingBox details
+                bbox = text_block["Geometry"]["BoundingBox"]
+                left = bbox["Left"]
+                top = bbox["Top"]
+                width = bbox["Width"]
+                height = bbox["Height"]
+
+                # Convert proportional coordinates to absolute coordinates
+                left_abs = int(left * page_width)
+                top_abs = int(top * page_height)
+                width_abs = int(width * page_width)
+                height_abs = int(height * page_height)
+
+                recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
+                signatures.append(recogniser_result)
+                print("Signature found:", signatures[-1])
 
+            # Create OCRResult with absolute coordinates
+            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)
             all_ocr_results.append(ocr_result)
 
-    is_signature_or_handwriting = is_signature | is_handwriting
+            is_signature_or_handwriting = is_signature | is_handwriting
 
-    # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
-    if is_signature_or_handwriting:
-        signature_or_handwriting_recogniser_results.append(recogniser_result)
+            # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
+            if is_signature_or_handwriting:
+                signature_or_handwriting_recogniser_results.append(recogniser_result)
+
+            if is_signature: signature_recogniser_results.append(recogniser_result)
+            if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
 
-    return all_ocr_results, signature_or_handwriting_recogniser_results
+    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results
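
The LINE handling above classifies a whole line by the TextType of its first CHILD word. A self-contained sketch of that traversal over a Textract-style block list (the sample blocks are illustrative, not a real response):

# Sample blocks mimicking Textract's documented LINE/WORD structure
blocks = [
    {'Id': 'line-1', 'BlockType': 'LINE', 'Text': 'Signed by John', 'Confidence': 99.0,
     'Relationships': [{'Type': 'CHILD', 'Ids': ['word-1', 'word-2', 'word-3']}]},
    {'Id': 'word-1', 'BlockType': 'WORD', 'Text': 'Signed', 'TextType': 'HANDWRITING'},
    {'Id': 'word-2', 'BlockType': 'WORD', 'Text': 'by', 'TextType': 'HANDWRITING'},
    {'Id': 'word-3', 'BlockType': 'WORD', 'Text': 'John', 'TextType': 'HANDWRITING'},
]

for block in blocks:
    if block['BlockType'] == 'LINE':
        for relationship in block.get('Relationships', []):
            if relationship['Type'] == 'CHILD':
                # Classify the line from its first child word's TextType
                child = next(b for b in blocks if b['Id'] == relationship['Ids'][0])
                print(block['Text'], '->', child.get('TextType', 'PRINTED'))
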
tools/custom_image_analyser_engine.py CHANGED
@@ -64,22 +64,40 @@ class CustomImageAnalyzerEngine:
         ocr_results: List[OCRResult],
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
-        # Combine all OCR text
-        full_text = ' '.join([result.text for result in ocr_results])
-
         # Define English as default language, if not specified
         if "language" not in text_analyzer_kwargs:
             text_analyzer_kwargs["language"] = "en"
 
-        analyzer_result = self.analyzer_engine.analyze(
-            text=full_text, **text_analyzer_kwargs
-        )
-
         allow_list = text_analyzer_kwargs.get('allow_list', [])
-
-        return self.map_analyzer_results_to_bounding_boxes(
-            analyzer_result, ocr_results, full_text, allow_list
-        )
+        combined_results = []
+
+        for ocr_result in ocr_results:
+            # Analyze each OCR result (line) individually
+            analyzer_result = self.analyzer_engine.analyze(
+                text=ocr_result.text, **text_analyzer_kwargs
+            )
+
+            for result in analyzer_result:
+                # Extract the relevant portion of text based on start and end
+                relevant_text = ocr_result.text[result.start:result.end]
+
+                # Create a new OCRResult with the relevant text and adjusted position
+                relevant_ocr_result = OCRResult(
+                    text=relevant_text,
+                    left=ocr_result.left + self.estimate_x_offset(ocr_result.text, result.start),
+                    top=ocr_result.top,
+                    width=self.estimate_width(ocr_result, result.start, result.end),
+                    height=ocr_result.height
+                )
+
+                # Map the analyzer results to bounding boxes for this line
+                line_results = self.map_analyzer_results_to_bounding_boxes(
+                    [result], [relevant_ocr_result], relevant_text, allow_list
+                )
+
+                combined_results.extend(line_results)
+
+        return combined_results
 
     @staticmethod
     def map_analyzer_results_to_bounding_boxes(
@@ -113,4 +131,58 @@ class CustomImageAnalyzerEngine:
 
             text_position = word_end + 1 # +1 for the space between words
 
         return pii_bboxes
+
+    @staticmethod
+    def estimate_x_offset(full_text: str, start: int) -> int:
+        # Estimate the x-offset based on character position
+        # This is a simple estimation and might need refinement for variable-width fonts
+        return int(start / len(full_text) * len(full_text))
+
+    @staticmethod
+    def estimate_width(ocr_result: OCRResult, start: int, end: int) -> int:
+        # Estimate the width of the relevant text portion
+        full_width = ocr_result.width
+        full_length = len(ocr_result.text)
+        return int((end - start) / full_length * full_width)
+
+# Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
+    # Sort OCR results by 'top' to ensure line order
+    ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))
+
+    combined_results = []
+    current_line = []
+    current_bbox = None
+
+    for result in ocr_results:
+        if not current_line:
+            # Start a new line
+            current_line.append(result)
+            current_bbox = result
+        else:
+            # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
+            last_result = current_line[-1]
+            if abs(result.top - last_result.top) <= y_threshold and \
+               (result.left - (last_result.left + last_result.width)) <= x_threshold:
+                # Update the bounding box to include the new word
+                new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
+                current_bbox = OCRResult(
+                    text=f"{current_bbox.text} {result.text}",
+                    left=current_bbox.left,
+                    top=current_bbox.top,
+                    width=new_right - current_bbox.left,
+                    height=max(current_bbox.height, result.height)
+                )
+                current_line.append(result)
+            else:
+                # Commit the current line and start a new one
+                combined_results.append(current_bbox)
+                current_line = [result]
+                current_bbox = result
+
+    # Append the last line
+    if current_bbox:
+        combined_results.append(current_bbox)
+
+    return combined_results
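
A quick usage sketch of the new word-to-line grouping; OCRResult here is a stand-in dataclass with the same fields the project version exposes:

from dataclasses import dataclass

@dataclass
class OCRResult:  # stand-in for tools.custom_image_analyser_engine.OCRResult
    text: str
    left: int
    top: int
    width: int
    height: int

words = [
    OCRResult('John', left=10, top=100, width=40, height=12),
    OCRResult('Smith', left=55, top=102, width=50, height=12),    # close in x and y: same line
    OCRResult('Lambeth', left=10, top=130, width=70, height=12),  # y gap > threshold: new line
]

lines = combine_ocr_results(words)
print([line.text for line in lines])  # ['John Smith', 'Lambeth']
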
tools/file_conversion.py CHANGED
@@ -3,6 +3,7 @@ from tools.helper_functions import get_file_path_end, output_folder, detect_file
 from PIL import Image
 import os
 import time
+import json
 from gradio import Progress
 from typing import List, Optional
 
@@ -174,6 +175,15 @@ def prepare_image_or_text_pdf(
     if file_extension in ['.jpg', '.jpeg', '.png']:
         in_redact_method = "Image analysis"
 
+    # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
+    if file_extension in ['.json']:
+        json_contents = json.load(file_path)
+        # Write the response to a JSON file
+        out_folder = output_folder + file_path
+        with open(file_path, 'w') as json_file:
+            json.dump(json_contents, out_folder, indent=4) # indent=4 makes the JSON file pretty-printed
+        continue
+
     #if file_path:
     #    file_path_without_ext = get_file_path_end(file_path)
     if not file_path:
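
As committed, the JSON branch hands json.load a path string and json.dump a folder string, so the pass-through would raise at runtime. A corrected sketch of the apparent intent, using file objects in both calls (the loop wrapper and output_folder value are assumptions standing in for the surrounding function):

import json
import os

output_folder = 'output/'  # assumed, mirroring tools/helper_functions

for file_path in ['example_textract.json']:  # stand-in for the uploaded-file loop
    file_extension = os.path.splitext(file_path)[1]
    if file_extension == '.json':
        with open(file_path, 'r') as json_file:
            json_contents = json.load(json_file)   # load from a file object, not a path
        out_path = output_folder + os.path.basename(file_path)
        with open(out_path, 'w') as json_file:
            json.dump(json_contents, json_file, indent=4)  # dump to the file object
        continue  # move on to the next uploaded file
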
tools/file_redaction.py CHANGED
@@ -16,7 +16,7 @@ from gradio import Progress
 
 from collections import defaultdict # For efficient grouping
 
-from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult
+from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
@@ -24,9 +24,11 @@ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
+    all_request_metadata = []
+    all_request_metadata_str = ""
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -75,12 +77,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
 
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata
 
     file_paths_loop = [file_paths[int(latest_file_completed)]]
 
-    if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list[0].tolist()
+        print("In allow list:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
 
     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
@@ -96,7 +101,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
 
         if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
             # Analyse and redact image-based pdf or image
@@ -104,7 +109,9 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
 
             print("Redacting file" + file_path_without_ext + "as an image-based file")
-            pdf_images, output_logs, logging_file_paths = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method)
+            pdf_images, output_logs, logging_file_paths, request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+
+            # Save file
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
             pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
@@ -114,12 +121,18 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 
             out_message.append("File '" + file_path_without_ext + "' successfully redacted")
 
+            # Save decision making process
             output_logs_str = str(output_logs)
             logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)
 
+            # Save Textract request metadata (if exists)
+            if request_metadata:
+                print("Request metadata:", all_request_metadata)
+                all_request_metadata.append(request_metadata)
+
             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed))
@@ -165,7 +178,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
 
 
     toc = time.perf_counter()
@@ -175,15 +188,33 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time
 
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
+    # If textract requests made, write to logging file
+    if all_request_metadata:
+        all_request_metadata_str = '\n'.join(all_request_metadata)
+
+        print("all_request_metadata_file_path")
+        all_request_metadata_file_path = output_folder + "textract_request_metadata.txt"
 
-def merge_img_bboxes(bboxes, handwriting_or_signature_boxes = [], horizontal_threshold=150, vertical_threshold=25):
+        with open(all_request_metadata_file_path, "w") as f:
+            f.write(all_request_metadata_str)
+        log_files_output_paths.append(all_request_metadata_file_path)
+
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+
+def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_recogniser_results = [], handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)
 
-    if handwriting_or_signature_boxes:
-        print("Handwriting or signature boxes exist at merge:", handwriting_or_signature_boxes)
-        bboxes.extend(handwriting_or_signature_boxes)
+    if signature_recogniser_results or handwriting_recogniser_results:
+
+        if "Redact all identified handwriting" in handwrite_signature_checkbox:
+            print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            bboxes.extend(handwriting_recogniser_results)
+
+        if "Redact all identified signatures" in handwrite_signature_checkbox:
+            print("Signature boxes exist at merge:", signature_recogniser_results)
+            bboxes.extend(signature_recogniser_results)
 
     # 1. Group by approximate vertical proximity
     for box in bboxes:
@@ -198,13 +229,18 @@ def merge_img_bboxes(bboxes, handwriting_or_signature_boxes = [], horizontal_thr
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                 #print("Merging a box")
                 # Calculate new dimensions for the merged box
-                print("Merged box:", merged_box)
+                #print("Merged box:", merged_box)
+                if merged_box.text == next_box.text:
+                    new_text = merged_box.text
+                else:
+                    new_text = merged_box.text + " " + next_box.text
+
                 new_left = min(merged_box.left, next_box.left)
                 new_top = min(merged_box.top, next_box.top)
                 new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
                 new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
-                merged_box = ImageRecognizerResult(
-                    merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height
+                merged_box = CustomImageRecognizerResult(
+                    merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
                 )
             else:
                 merged_bboxes.append(merged_box)
@@ -213,7 +249,7 @@ def merge_img_bboxes(bboxes, handwriting_or_signature_boxes = [], horizontal_thr
             merged_bboxes.append(merged_box)
     return merged_bboxes
 
-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Image analysis", progress=Progress(track_tqdm=True)):
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Image analysis", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=Progress(track_tqdm=True)):
     '''
     Take a path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
@@ -223,6 +259,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     fill = (0, 0, 0) # Fill colour
     decision_process_output_str = ""
     images = []
+    request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
     if not image_paths:
@@ -256,6 +293,12 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     print("Page range:", str(page_min + 1), "to", str(page_max))
 
     #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+
+    all_ocr_results = []
+    all_decision_process = []
+
+    if analysis_type == "Image analysis": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+    elif analysis_type == "AWS Textract": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 
     for n in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
@@ -277,6 +320,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 
         print("Redacting page", reported_page_number)
 
+
         # Assuming image_paths[i] is your PIL image object
         try:
             image = image_paths[0][i]#.copy()
@@ -286,45 +330,25 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             print(e)
             continue
 
-        # %%
-        # image_analyser = ImageAnalyzerEngine(nlp_analyser)
-        # engine = ImageRedactorEngine(image_analyser)
+        # Need image size to convert textract OCR outputs to the correct sizes
+        page_width, page_height = image.size
 
+        # Possibility to use different languages
         if language == 'en':
             ocr_lang = 'eng'
         else: ocr_lang = language
 
-        # bboxes = image_analyser.analyze(image,
-        #     ocr_kwargs={"lang": ocr_lang},
-        #     **{
-        #         "allow_list": allow_list,
-        #         "language": language,
-        #         "entities": chosen_redact_entities,
-        #         "score_threshold": score_threshold,
-        #         "return_decision_process":True,
-        #     })
-
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
         if analysis_type == "Image analysis":
+
             ocr_results = image_analyser.perform_ocr(image)
 
-            # Process all OCR text with bounding boxes
-            #print("OCR results:", ocr_results)
-            ocr_results_str = str(ocr_results)
-            ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_page_" + reported_page_number + ".txt"
-            with open(ocr_results_file_path, "w") as f:
-                f.write(ocr_results_str)
-            logging_file_paths.append(ocr_results_file_path)
-
+            # Combine OCR results
+            ocr_results = combine_ocr_results(ocr_results)
+
         # Import results from json and convert
         if analysis_type == "AWS Textract":
-
-            # Ensure image is a PIL Image object
-            # if isinstance(image, str):
-            #     image = Image.open(image)
-            # elif not isinstance(image, Image.Image):
-            #     print(f"Unexpected image type on page {i}: {type(image)}")
-            #     continue
 
             # Convert the image to bytes using an in-memory buffer
             image_buffer = io.BytesIO()
@@ -334,7 +358,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
            json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
 
            if not os.path.exists(json_file_path):
-               text_blocks = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
+               text_blocks, request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
               logging_file_paths.append(json_file_path)
            else:
               # Open the file and load the JSON data
@@ -343,19 +367,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                text_blocks = json.load(json_file)
                text_blocks = text_blocks['Blocks']
 
-            # Need image size to convert textract OCR outputs to the correct sizes
-            #print("Image size:", image.size)
-            page_width, page_height = image.size
-
-            ocr_results, handwriting_or_signature_boxes = json_to_ocrresult(text_blocks, page_width, page_height)
-
-            #print("OCR results:", ocr_results)
-            ocr_results_str = str(ocr_results)
-            textract_ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_page_" + reported_page_number + "_textract.txt"
-            with open(textract_ocr_results_file_path, "w") as f:
-                f.write(ocr_results_str)
-            logging_file_paths.append(textract_ocr_results_file_path)
+            ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results = json_to_ocrresult(text_blocks, page_width, page_height)
 
        # Step 2: Analyze text and identify PII
        bboxes = image_analyser.analyze_text(
@@ -364,21 +376,19 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
            entities=chosen_redact_entities,
            allow_list=allow_list,
            score_threshold=score_threshold,
        )
 
-        # Process the bboxes (PII entities)
-        if bboxes:
-            for bbox in bboxes:
-                print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
-            decision_process_output_str = str(bboxes)
-            print("Decision process:", decision_process_output_str)
-
        # Merge close bounding boxes
-        merged_bboxes = merge_img_bboxes(bboxes, handwriting_or_signature_boxes)
+        merged_bboxes = merge_img_bboxes(bboxes, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
 
-        #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
-        #from PIL import Image
-        #image_object = Image.open(image)
+        # Export the decision making process
+        if merged_bboxes:
+            for bbox in merged_bboxes:
+                print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
+
+            decision_process_output_str = "Page " + reported_page_number + ":\n" + str(merged_bboxes)
+            all_decision_process.append(decision_process_output_str)
 
        # 3. Draw the merged boxes
        draw = ImageDraw.Draw(image)
@@ -390,9 +400,20 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
            y1 = y0 + box.height
            draw.rectangle([x0, y0, x1, y1], fill=fill)
 
+        ocr_results_str = "Page:" + reported_page_number + "\n" + str(ocr_results)
+        all_ocr_results.append(ocr_results_str)
+
        images.append(image)
 
-    return images, decision_process_output_str, logging_file_paths
+    # Write OCR results as a log file
+    ocr_results_out = "\n".join(all_ocr_results)
+    with open(ocr_results_file_path, "w") as f:
+        f.write(ocr_results_out)
+    logging_file_paths.append(ocr_results_file_path)
+
+    all_decision_process_str = "\n".join(all_decision_process)
+
+    return images, all_decision_process_str, logging_file_paths, request_metadata
 
 def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
     if isinstance(text_container, LTTextContainer):
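
The handwriting/signature override arrives as a plain list of selected checkbox labels, so the gating in merge_img_bboxes reduces to membership tests. A minimal illustration with stand-in boxes (the real lists hold CustomImageRecognizerResult objects):

handwrite_signature_checkbox = ['Redact all identified handwriting']  # signatures unticked

bboxes = ['pii_box']                         # stand-ins for recogniser results
handwriting_recogniser_results = ['hw_box']
signature_recogniser_results = ['sig_box']

if 'Redact all identified handwriting' in handwrite_signature_checkbox:
    bboxes.extend(handwriting_recogniser_results)
if 'Redact all identified signatures' in handwrite_signature_checkbox:
    bboxes.extend(signature_recogniser_results)

print(bboxes)  # ['pii_box', 'hw_box'], signature boxes stay out of the redaction set
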
tools/helper_functions.py CHANGED
@@ -73,6 +73,31 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
+def custom_regex_load(in_file):
+    '''
+    When file is loaded, update the column dropdown choices and write to relevant data states.
+    '''
+
+    custom_regex = pd.DataFrame()
+
+    file_list = [string.name for string in in_file]
+
+    regex_file_names = [string for string in file_list if "csv" in string.lower()]
+    if regex_file_names:
+        regex_file_name = regex_file_names[0]
+        custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
+        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+
+        output_text = "Allow list file loaded."
+        print(output_text)
+    else:
+        error = "No allow list file provided."
+        print(error)
+        output_text = error
+        return error, custom_regex
+
+    return output_text, custom_regex
+
 def put_columns_in_df(in_file):
     new_choices = []
     concat_choices = []
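
custom_regex_load expects the wrappers Gradio's UploadButton passes to callbacks, each exposing a .name path; pandas (pd) is assumed imported at the top of this module. A stand-in call (SimpleNamespace is a hypothetical substitute for the Gradio file wrapper, and allow_list.csv must already exist):

from types import SimpleNamespace

uploaded = [SimpleNamespace(name='allow_list.csv')]

status, allow_list_df = custom_regex_load(uploaded)
print(status)                      # 'Allow list file loaded.'
print(allow_list_df[0].tolist())   # flat list of allow-listed terms
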
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -26,7 +26,7 @@ titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [tit
26
  # Custom postcode recogniser
27
 
28
  # Define the regex pattern in a Presidio `Pattern` object:
29
- ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
30
 
31
  # Define the recognizer with one or more patterns
32
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
 
26
  # Custom postcode recogniser
27
 
28
  # Define the regex pattern in a Presidio `Pattern` object:
29
+ ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\b([A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0AA)\b", score = 1)
30
 
31
  # Define the recognizer with one or more patterns
32
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])