seanpedrickcase committed on
Commit
8652429
·
1 Parent(s): 6ea0852

Optimised Textract and Tesseract workings

README.md CHANGED
@@ -9,9 +9,10 @@ pinned: false
9
  license: mit
10
  ---
11
 
12
- # Introduction
13
- Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
14
 
15
- WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
16
 
17
- Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
9
  license: mit
10
  ---
11
 
12
+ # Document redaction
 
13
 
14
+ Redact personal information from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Documents and images can be redacted with 'Quick' image analysis, which works well for typed text but not for handwriting or signatures. To redact those more complex elements, choose the 'Complex image analysis' option, which uses AWS Textract OCR (available on AWS deployments only; this service has a cost, so please reserve it for the more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and terms to exclude from redaction.
15
 
16
+ NOTE: In testing, the app identified only about 60% of the personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
17
+
18
+ This app accepts a maximum file size of 50MB. Please consider giving feedback on the quality of the results via the options that appear underneath the redact buttons; this will help to improve the app.
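The allow list mentioned above is a plain CSV file with one word or phrase per row (case sensitive); anything listed in it will not be redacted. A minimal sketch of building such a file with pandas (the file name and terms are illustrative, and whether a header row is expected depends on the app's `custom_regex_load` loader, which is not shown here):

```python
import pandas as pd

# One case-sensitive term per row, in a single column; these terms
# will be excluded from redaction when the file is imported.
allow_list = pd.DataFrame({"allow_list": ["Lambeth", "Lambeth 2030"]})
allow_list.to_csv("allow_list.csv", index=False, header=False)
```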
app.py CHANGED
@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
7
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
8
  from tools.aws_functions import upload_file_to_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
- from tools.file_conversion import prepare_image_or_text_pdf
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  #from tools.aws_functions import load_data_from_aws
@@ -37,6 +37,9 @@ app = gr.Blocks(theme = gr.themes.Base())
37
 
38
  with app:
39
 
40
  prepared_pdf_state = gr.State([])
41
  output_image_files_state = gr.State([])
42
  output_file_list_state = gr.State([])
@@ -56,23 +59,38 @@ with app:
56
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
57
  access_s3_logs_loc_state = gr.State(access_logs_folder)
58
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
59
- usage_s3_logs_loc_state = gr.State(usage_logs_folder)
60
 
61
  gr.Markdown(
62
  """
63
  # Document redaction
64
 
65
- Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction. If you are getting 0 redactions, it's possible that the text in the document is saved in image format instead of as selectable text. Select 'Image analysis' on the Settings page in this case.
66
 
67
- WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
68
 
69
- This app accepts a maximum file size of 10mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
70
  """)
71
 
 
72
  with gr.Tab("PDFs/images"):
73
-
74
  with gr.Accordion("Redact document", open = True):
75
- in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
 
 
76
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
77
 
78
  with gr.Row():
@@ -83,16 +101,14 @@ with app:
83
  with gr.Row():
84
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
85
 
 
86
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
87
  pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
88
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
89
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
90
-
91
- with gr.Row():
92
- s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
93
- # This keeps track of the time taken to redact files for logging purposes.
94
- estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
95
 
 
96
  with gr.Tab(label="Open text or Excel/csv files"):
97
  gr.Markdown(
98
  """
@@ -115,19 +131,21 @@ with app:
115
  text_output_file = gr.File(label="Output files")
116
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
117
 
 
118
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
119
  data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
120
  choices=["The results were good", "The results were not good"], visible=False)
121
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
122
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
123
 
 
124
  with gr.Tab(label="Redaction settings"):
125
  gr.Markdown(
126
  """
127
  Define redaction settings that affect both document and open text redaction.
128
  """)
129
  with gr.Accordion("Settings for documents", open = True):
130
- in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis", "AWS Textract"])
131
  with gr.Row():
132
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
133
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
@@ -140,53 +158,47 @@ with app:
140
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
141
  with gr.Row():
142
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
143
- #in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
144
  with gr.Row():
145
  in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
146
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
147
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
148
  log_files_output = gr.File(label="Log file output", interactive=False)
149
 
150
- # Invisible text box to hold the session hash/username and Textract request metadata just for logging purposes
151
- session_hash_textbox = gr.Textbox(value="", visible=False)
152
- textract_metadata_textbox = gr.Textbox(value="", visible=False)
153
-
154
- # AWS options - placeholder for possibility of storing data on s3
155
- # with gr.Tab(label="Advanced options"):
156
- # with gr.Accordion(label = "AWS data access", open = True):
157
- # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
158
- # with gr.Row():
159
- # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
160
- # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
161
-
162
- # aws_log_box = gr.Textbox(label="AWS data load status")
163
-
164
- # ### Loading AWS data ###
165
- # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
166
-
167
  # If a custom allow list is uploaded
168
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
169
 
170
- # Document redaction
171
- document_redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
172
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
173
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
174
 
175
  # If the output file count text box changes, keep going with redacting each document until done
176
- text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
177
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
178
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
179
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
180
 
181
- # Tabular data redaction
182
- in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
183
 
184
- tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_text")
185
 
186
  # If the output file count text box changes, keep going with redacting each data file until done
187
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
188
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
189
 
190
  # Get connection details on app load
191
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
192
 
@@ -198,8 +210,8 @@ with app:
198
 
199
  # User submitted feedback for pdf redactions
200
  pdf_callback = gr.CSVLogger()
201
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_logs_folder)
202
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
203
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
204
 
205
  # User submitted feedback for data redactions
@@ -210,8 +222,8 @@ with app:
210
 
211
  # Log processing time/token usage when making a query
212
  usage_callback = gr.CSVLogger()
213
- usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
214
- estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
215
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
216
 
217
  # Launch the Gradio app
@@ -222,4 +234,18 @@ if __name__ == "__main__":
222
  if os.environ['COGNITO_AUTH'] == "1":
223
  app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
224
  else:
225
- app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
7
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
8
  from tools.aws_functions import upload_file_to_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  #from tools.aws_functions import load_data_from_aws
 
37
 
38
  with app:
39
 
40
+ ###
41
+ # STATE VARIABLES
42
+ ###
43
  prepared_pdf_state = gr.State([])
44
  output_image_files_state = gr.State([])
45
  output_file_list_state = gr.State([])
 
59
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
60
  access_s3_logs_loc_state = gr.State(access_logs_folder)
61
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
62
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)
63
+
64
+ # Invisible elements effectively used as state variables
65
+ session_hash_textbox = gr.Textbox(value="", visible=False) # Holds the session hash/username for logging purposes.
66
+ textract_metadata_textbox = gr.Textbox(value="", visible=False)
67
+ doc_file_name_textbox = gr.Textbox(value="", visible=False)
68
+ data_file_name_textbox = gr.Textbox(value="", visible=False)
69
+ s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
70
+ estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
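The block above mixes two state-holding approaches, and the distinction matters for logging. A minimal sketch of the difference (component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    # gr.State holds an arbitrary per-session Python object (lists, dicts, ...),
    # but it is not a regular component that can be fed to logging callbacks.
    prepared_docs = gr.State([])

    # An invisible Textbox only holds a string, but it behaves like any other
    # component, so it can be passed to gr.CSVLogger.setup()/.flag() below.
    session_id = gr.Textbox(value="", visible=False)
```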
71
+
72
+
73
+ ###
74
+ # UI DESIGN
75
+ ###
76
 
77
  gr.Markdown(
78
  """
79
  # Document redaction
80
 
81
+ Redact personal information from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Documents and images can be redacted with 'Quick' image analysis, which works well for typed text but not for handwriting or signatures. To redact those more complex elements, choose the 'Complex image analysis' option, which uses AWS Textract OCR (available on AWS deployments only; this service has a cost, so please reserve it for the more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and terms to exclude from redaction.
82
 
83
+ NOTE: In testing, the app identified only about 60% of the personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
84
 
85
+ This app accepts a maximum file size of 50MB. Please consider giving feedback on the quality of the results via the options that appear underneath the redact buttons; this will help to improve the app.
86
  """)
87
 
88
+ # PDF / IMAGES TAB
89
  with gr.Tab("PDFs/images"):
 
90
  with gr.Accordion("Redact document", open = True):
91
+ in_doc_files = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
92
+ in_redaction_method = gr.Radio(label="Choose the document redaction method. Note that AWS Textract incurs a cost from use of AWS services.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - AWS Textract, handwriting/signatures"])
93
+ gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
94
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
95
 
96
  with gr.Row():
 
101
  with gr.Row():
102
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
103
 
104
+ # Feedback elements are invisible until revealed by redaction action
105
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
106
  pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
107
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
108
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
109
+
110
 
111
+ # TEXT / TABULAR DATA TAB
112
  with gr.Tab(label="Open text or Excel/csv files"):
113
  gr.Markdown(
114
  """
 
131
  text_output_file = gr.File(label="Output files")
132
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
133
 
134
+ # Feedback elements are invisible until revealed by redaction action
135
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
136
  data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
137
  choices=["The results were good", "The results were not good"], visible=False)
138
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
139
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
140
 
141
+ # SETTINGS TAB
142
  with gr.Tab(label="Redaction settings"):
143
  gr.Markdown(
144
  """
145
  Define redaction settings that affect both document and open text redaction.
146
  """)
147
  with gr.Accordion("Settings for documents", open = True):
148
+
149
  with gr.Row():
150
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
151
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
158
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
159
  with gr.Row():
160
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
161
+ # Upload 'Allow list' for terms not to be redacted
162
  with gr.Row():
163
  in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
164
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
165
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
166
  log_files_output = gr.File(label="Log file output", interactive=False)
167
 
 
168
  # If a custom allow list is uploaded
169
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
170
 
171
+ ###
172
+ # PDF/IMAGE REDACTION
173
+ ###
174
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox])
175
+
176
+ document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare_doc").\
177
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
178
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
179
 
180
  # If the output file count text box changes, keep going with redacting each document until done
181
+ text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
182
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
183
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
184
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
185
 
186
+ ###
187
+ # TABULAR DATA REDACTION
188
+ ###
189
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
190
+ then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_textbox])
191
 
192
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
193
 
194
  # If the output file count text box changes, keep going with redacting each data file until done
195
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
196
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
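Both redaction flows use the same looping idiom: the button click processes one file and updates a hidden counter, and the counter's `.change` event re-invokes the handler until every file is done. A minimal self-contained sketch of the pattern (`process_next` and the component names are illustrative, not the app's own functions):

```python
import gradio as gr

def process_next(files, done, summary):
    # Handle one file per call. Returning an unchanged counter value means
    # the .change event does not fire again, which ends the loop.
    if files and done < len(files):
        summary = summary + f"\nProcessed file {int(done) + 1} of {len(files)}"
        done = done + 1
    return done, summary

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    files_done = gr.Number(value=0, visible=False)
    summary = gr.Textbox(value="")
    redact_btn = gr.Button("Redact")

    # The first click starts the loop; each counter change continues it.
    redact_btn.click(process_next, [in_files, files_done, summary], [files_done, summary])
    files_done.change(process_next, [in_files, files_done, summary], [files_done, summary])

demo.launch()
```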
197
 
198
+ ###
199
+ # APP LOAD AND LOGGING
200
+ ###
201
+
202
  # Get connection details on app load
203
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
204
 
 
210
 
211
  # User submitted feedback for pdf redactions
212
  pdf_callback = gr.CSVLogger()
213
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_doc_files], feedback_logs_folder)
214
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_doc_files], None, preprocess=False).\
215
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
216
 
217
  # User submitted feedback for data redactions
 
222
 
223
  # Log processing time/token usage when making a query
224
  usage_callback = gr.CSVLogger()
225
+ usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
226
+ estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
227
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
228
 
229
  # Launch the Gradio app
 
234
  if os.environ['COGNITO_AUTH'] == "1":
235
  app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
236
  else:
237
+ app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
238
+
239
+
240
+ # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
241
+ # with gr.Tab(label="Advanced options"):
242
+ # with gr.Accordion(label = "AWS data access", open = True):
243
+ # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
244
+ # with gr.Row():
245
+ # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
246
+ # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
247
+
248
+ # aws_log_box = gr.Textbox(label="AWS data load status")
249
+
250
+ # ### Loading AWS data ###
251
+ # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
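The usage-logging wiring near the end pairs `gr.CSVLogger` with the invisible components: `setup()` binds the logger to a component list and a folder, and any change to `estimated_time_taken_number` flushes one CSV row per redaction run. A stripped-down sketch of the same idiom (folder and component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    session_hash = gr.Textbox(value="", visible=False)
    time_taken = gr.Number(value=0.0, visible=False)

    usage_logger = gr.CSVLogger()
    usage_logger.setup([session_hash, time_taken], "usage_logs")  # rows land in usage_logs/log.csv

    # preprocess=False hands the raw component values straight to the logger.
    time_taken.change(
        lambda *args: usage_logger.flag(list(args)),
        [session_hash, time_taken],
        None,
        preprocess=False,
    )
```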
tools/aws_textract.py CHANGED
@@ -44,7 +44,7 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
44
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
45
 
46
  text_blocks = response['Blocks']
47
- request_metadata = extract_textract_metadata(response)
48
 
49
  # Write the response to a JSON file
50
  with open(json_file_path, 'w') as json_file:
@@ -92,56 +92,75 @@ def json_to_ocrresult(json_data, page_width, page_height):
92
  signatures = []
93
  handwriting = []
94
 
95
  for text_block in json_data:
96
 
97
  is_signature = False
98
  is_handwriting = False
99
 
100
- if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
101
 
102
- if (text_block['BlockType'] == 'LINE'):
103
-
104
- # If a line, pull out the text type and confidence from the child words and get text, bounding box
105
 
106
- if 'Text' in text_block:
107
- text = text_block['Text']
 
 
109
  if 'Relationships' in text_block:
110
  for relationship in text_block['Relationships']:
111
  if relationship['Type'] == 'CHILD':
112
  for child_id in relationship['Ids']:
113
  child_block = next((block for block in json_data if block['Id'] == child_id), None)
114
- if child_block and 'TextType' in child_block:
115
- text_type = child_block['TextType']
116
- confidence = text_block['Confidence']
117
- break
118
- break
119
-
120
- # Extract BoundingBox details
121
- bbox = text_block["Geometry"]["BoundingBox"]
122
- left = bbox["Left"]
123
- top = bbox["Top"]
124
- width = bbox["Width"]
125
- height = bbox["Height"]
126
-
127
- # Convert proportional coordinates to absolute coordinates
128
- left_abs = int(left * page_width)
129
- top_abs = int(top * page_height)
130
- width_abs = int(width * page_width)
131
- height_abs = int(height * page_height)
132
 
133
- # If handwriting or signature, add to bounding box
134
 
135
- if text_type == "HANDWRITING":
136
- is_handwriting = True
137
- entity_name = "HANDWRITING"
138
- word_end = len(entity_name)
139
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
140
- handwriting.append(recogniser_result)
141
- print("Handwriting found:", handwriting[-1])
142
 
143
  elif (text_block['BlockType'] == 'SIGNATURE'):
144
- text = "SIGNATURE"
145
 
146
  is_signature = True
147
  entity_name = "SIGNATURE"
@@ -161,12 +180,25 @@ def json_to_ocrresult(json_data, page_width, page_height):
161
  width_abs = int(width * page_width)
162
  height_abs = int(height * page_height)
163
 
164
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
165
  signatures.append(recogniser_result)
166
  print("Signature found:", signatures[-1])
167
 
168
  # Create OCRResult with absolute coordinates
169
- ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)
170
  all_ocr_results.append(ocr_result)
171
 
172
  is_signature_or_handwriting = is_signature | is_handwriting
@@ -178,4 +210,4 @@ def json_to_ocrresult(json_data, page_width, page_height):
178
  if is_signature: signature_recogniser_results.append(recogniser_result)
179
  if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
180
 
181
- return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results
 
44
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
45
 
46
  text_blocks = response['Blocks']
47
+ request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
48
 
49
  # Write the response to a JSON file
50
  with open(json_file_path, 'w') as json_file:
 
92
  signatures = []
93
  handwriting = []
94
 
95
+ combined_results = {}
96
+
97
  for text_block in json_data:
98
 
99
  is_signature = False
100
  is_handwriting = False
101
 
102
+
103
 
104
+ if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
 
105
 
106
+ if text_block['BlockType'] == 'LINE':
107
+ # Extract text and bounding box for the line
108
+ line_text = text_block.get('Text', '')
109
+ line_bbox = text_block["Geometry"]["BoundingBox"]
110
+ line_left = int(line_bbox["Left"] * page_width)
111
+ line_top = int(line_bbox["Top"] * page_height)
112
+ line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
113
+ line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
114
 
115
+ words = []
116
  if 'Relationships' in text_block:
117
  for relationship in text_block['Relationships']:
118
  if relationship['Type'] == 'CHILD':
119
  for child_id in relationship['Ids']:
120
  child_block = next((block for block in json_data if block['Id'] == child_id), None)
121
+ if child_block and child_block['BlockType'] == 'WORD':
122
+ word_text = child_block.get('Text', '')
123
+ word_bbox = child_block["Geometry"]["BoundingBox"]
124
+ confidence = child_block.get('Confidence', 0) # default to 0 rather than '' so the score is always numeric
125
+ word_left = int(word_bbox["Left"] * page_width)
126
+ word_top = int(word_bbox["Top"] * page_height)
127
+ word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
128
+ word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
129
+
130
+ # Extract BoundingBox details
131
+ width = word_bbox["Width"]
132
+ height = word_bbox["Height"]
133
+
134
+ # Convert proportional coordinates to absolute coordinates
135
+ width_abs = int(width * page_width)
136
+ height_abs = int(height * page_height)
137
+
138
+ words.append({
139
+ 'text': word_text,
140
+ 'bounding_box': (word_left, word_top, word_right, word_bottom)
141
+ })
142
+ # Check for handwriting
143
+ text_type = child_block.get("TextType", '')
144
+
145
+ if text_type == "HANDWRITING":
146
+ is_handwriting = True
147
+ entity_name = "HANDWRITING"
148
+ word_end = len(entity_name)
149
+ recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=width_abs, height=height_abs)
150
+ handwriting.append(recogniser_result)
151
+ print("Handwriting found:", handwriting[-1])
152
+
153
+ combined_results[line_text] = {
154
+ 'bounding_box': (line_left, line_top, line_right, line_bottom),
155
+ 'words': words
156
+ }
157
 
 
158
 
159
+
160
+ # If the block is a signature, add a bounding box covering the whole block
161
 
162
  elif (text_block['BlockType'] == 'SIGNATURE'):
163
+ line_text = "SIGNATURE"
164
 
165
  is_signature = True
166
  entity_name = "SIGNATURE"
 
180
  width_abs = int(width * page_width)
181
  height_abs = int(height * page_height)
182
 
183
+ recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
184
  signatures.append(recogniser_result)
185
  print("Signature found:", signatures[-1])
186
 
187
+ # Extract BoundingBox details
188
+ bbox = text_block["Geometry"]["BoundingBox"]
189
+ left = bbox["Left"]
190
+ top = bbox["Top"]
191
+ width = bbox["Width"]
192
+ height = bbox["Height"]
193
+
194
+ # Convert proportional coordinates to absolute coordinates
195
+ left_abs = int(left * page_width)
196
+ top_abs = int(top * page_height)
197
+ width_abs = int(width * page_width)
198
+ height_abs = int(height * page_height)
199
+
200
  # Create OCRResult with absolute coordinates
201
+ ocr_result = OCRResult(line_text, left_abs, top_abs, width_abs, height_abs)
202
  all_ocr_results.append(ocr_result)
203
 
204
  is_signature_or_handwriting = is_signature | is_handwriting
 
210
  if is_signature: signature_recogniser_results.append(recogniser_result)
211
  if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
212
 
213
+ return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, combined_results
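For orientation, `json_to_ocrresult` consumes the `Blocks` list produced by a call like the one in `analyse_page_with_textract`, converts Textract's proportional bounding boxes into absolute pixels, and now also returns `combined_results`, which maps each line's text to its pixel bounding box plus the per-word boxes collected above. A minimal sketch of the Textract call and the coordinate conversion (file name and page size are illustrative):

```python
import boto3

client = boto3.client("textract")

with open("page.png", "rb") as f:
    response = client.analyze_document(
        Document={"Bytes": f.read()}, FeatureTypes=["SIGNATURES"]
    )

# Textract reports boxes as proportions of the page, so multiply by the
# page dimensions to get pixel coordinates.
page_width, page_height = 2480, 3508  # e.g. A4 scanned at 300 dpi
for block in response["Blocks"]:
    if block["BlockType"] == "LINE":
        bbox = block["Geometry"]["BoundingBox"]
        left, top = int(bbox["Left"] * page_width), int(bbox["Top"] * page_height)
        width, height = int(bbox["Width"] * page_width), int(bbox["Height"] * page_height)
        print(block.get("Text", ""), (left, top, width, height))
```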
tools/custom_image_analyser_engine.py CHANGED
@@ -1,9 +1,14 @@
1
  import pytesseract
2
- from PIL import Image
3
  import numpy as np
4
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
5
  from typing import List, Dict, Optional, Union, Tuple
6
  from dataclasses import dataclass
 
7
 
8
  @dataclass
9
  class OCRResult:
@@ -25,17 +30,399 @@ class CustomImageRecognizerResult:
25
  height: int
26
  text: str
27
 
  class CustomImageAnalyzerEngine:
29
  def __init__(
30
  self,
31
  analyzer_engine: Optional[AnalyzerEngine] = None,
32
- tesseract_config: Optional[str] = None
 
33
  ):
34
  if not analyzer_engine:
35
  analyzer_engine = AnalyzerEngine()
36
  self.analyzer_engine = analyzer_engine
37
  self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
38
 
39
  def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
40
  # Ensure image is a PIL Image
41
  if isinstance(image, str):
@@ -43,18 +430,30 @@ class CustomImageAnalyzerEngine:
43
  elif isinstance(image, np.ndarray):
44
  image = Image.fromarray(image)
45
 
46
- ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT, config=self.tesseract_config)
47
 
48
  # Filter out empty strings and low confidence results
49
- valid_indices = [i for i, text in enumerate(ocr_data['text']) if text.strip() and int(ocr_data['conf'][i]) > 0]
50
 
51
  return [
52
  OCRResult(
53
- text=ocr_data['text'][i],
54
- left=ocr_data['left'][i],
55
- top=ocr_data['top'][i],
56
- width=ocr_data['width'][i],
57
- height=ocr_data['height'][i]
58
  )
59
  for i in valid_indices
60
  ]
@@ -86,7 +485,7 @@ class CustomImageAnalyzerEngine:
86
  text=relevant_text,
87
  left=ocr_result.left + self.estimate_x_offset(ocr_result, result.start),
88
  top=ocr_result.top,
89
- width=self.estimate_width(ocr_result, result.start, result.end),
90
  height=ocr_result.height
91
  )
92
 
@@ -132,28 +531,160 @@ class CustomImageAnalyzerEngine:
132
  text_position = word_end + 1 # +1 for the space between words
133
 
134
  return pii_bboxes
 
135
 
136
  @staticmethod
137
  def estimate_x_offset(ocr_result: OCRResult, start: int) -> int:
138
  # Estimate the x-offset in pixels, proportional to the character position within the text
139
  # This is a simple estimation and might need refinement for variable-width fonts
140
  return int(start / len(ocr_result.text) * ocr_result.width) if ocr_result.text else 0
141
 
142
- @staticmethod
143
- def estimate_width(ocr_result: OCRResult, start: int, end: int) -> int:
144
- # Estimate the width of the relevant text portion
145
- full_width = ocr_result.width
146
- full_length = len(ocr_result.text)
147
- return int((end - start) / full_length * full_width)
148
 
149
  # Function to combine OCR results into line-level results
150
- def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
151
  # Sort OCR results by 'top' to ensure line order
152
  ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))
153
 
154
  combined_results = []
 
155
  current_line = []
156
  current_bbox = None
 
157
 
158
  for result in ocr_results:
159
  if not current_line:
@@ -178,11 +709,33 @@ def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
178
  else:
179
  # Commit the current line and start a new one
180
  combined_results.append(current_bbox)
181
  current_line = [result]
182
  current_bbox = result
183
 
184
  # Append the last line
185
  if current_bbox:
186
  combined_results.append(current_bbox)
187
 
188
- return combined_results
 
1
  import pytesseract
 
2
  import numpy as np
3
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
4
+ #from presidio_image_redactor import ImagePreprocessor
5
  from typing import List, Dict, Optional, Union, Tuple
6
  from dataclasses import dataclass
7
+ import cv2
8
+ import PIL
9
+ from PIL import ImageDraw, ImageFont, Image, ImageOps # ImageOps is needed for the PIL.ImageOps.invert call below
10
+ from typing import Optional, Tuple, Union
11
+ from copy import deepcopy
12
 
13
  @dataclass
14
  class OCRResult:
 
30
  height: int
31
  text: str
32
 
33
+ class ImagePreprocessor:
34
+ """ImagePreprocessor class.
35
+
36
+ Parent class for image preprocessing objects.
37
+ """
38
+
39
+ def __init__(self, use_greyscale: bool = True) -> None:
40
+ """Initialize the ImagePreprocessor class.
41
+
42
+ :param use_greyscale: Whether to convert the image to greyscale.
43
+ """
44
+ self.use_greyscale = use_greyscale
45
+
46
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
47
+ """Preprocess the image to be analyzed.
48
+
49
+ :param image: Loaded PIL image.
50
+
51
+ :return: The processed image and any metadata regarding the
52
+ preprocessing approach.
53
+ """
54
+ return image, {}
55
+
56
+ def convert_image_to_array(self, image: Image.Image) -> np.ndarray:
57
+ """Convert PIL image to numpy array.
58
+
59
+ :param image: Loaded PIL image.
61
+
62
+ :return: image pixels as a numpy array.
63
+
64
+ """
65
+
66
+ if isinstance(image, np.ndarray):
67
+ img = image
68
+ else:
69
+ if self.use_greyscale:
70
+ image = image.convert("L")
71
+ img = np.asarray(image)
72
+ return img
73
+
74
+ @staticmethod
75
+ def _get_bg_color(
76
+ image: Image.Image, is_greyscale: bool, invert: bool = False
77
+ ) -> Union[int, Tuple[int, int, int]]:
78
+ """Select most common color as background color.
79
+
80
+ :param image: Loaded PIL image.
81
+ :param is_greyscale: Whether the image is greyscale.
82
+ :param invert: TRUE if you want to get the inverse of the bg color.
83
+
84
+ :return: Background color.
85
+ """
86
+ # Invert colors if invert flag is True
87
+ if invert:
88
+ if image.mode == "RGBA":
89
+ # Handle transparency as needed
90
+ r, g, b, a = image.split()
91
+ rgb_image = Image.merge("RGB", (r, g, b))
92
+ inverted_image = PIL.ImageOps.invert(rgb_image)
93
+ r2, g2, b2 = inverted_image.split()
94
+
95
+ image = Image.merge("RGBA", (r2, g2, b2, a))
96
+
97
+ else:
98
+ image = PIL.ImageOps.invert(image)
99
+
100
+ # Get background color
101
+ if is_greyscale:
102
+ # Select most common color as color
103
+ bg_color = int(np.bincount(image.flatten()).argmax())
104
+ else:
105
+ # Reduce size of image to 1 pixel to get dominant color
106
+ tmp_image = image.copy()
107
+ tmp_image = tmp_image.resize((1, 1), resample=0)
108
+ bg_color = tmp_image.getpixel((0, 0))
109
+
110
+ return bg_color
111
+
112
+ @staticmethod
113
+ def _get_image_contrast(image: np.ndarray) -> Tuple[float, float]:
114
+ """Compute the contrast level and mean intensity of an image.
115
+
116
+ :param image: Input image pixels (as a numpy array).
117
+
118
+ :return: A tuple containing the contrast level and mean intensity of the image.
119
+ """
120
+ contrast = np.std(image)
121
+ mean_intensity = np.mean(image)
122
+ return contrast, mean_intensity
123
+
124
+ class BilateralFilter(ImagePreprocessor):
125
+ """BilateralFilter class.
126
+
127
+ The class applies bilateral filtering to an image and returns the filtered
128
+ image and metadata.
129
+ """
130
+
131
+ def __init__(
132
+ self, diameter: int = 3, sigma_color: int = 40, sigma_space: int = 40
133
+ ) -> None:
134
+ """Initialize the BilateralFilter class.
135
+
136
+ :param diameter: Diameter of each pixel neighborhood.
137
+ :param sigma_color: value of sigma in the color space.
138
+ :param sigma_space: value of sigma in the coordinate space.
139
+ """
140
+ super().__init__(use_greyscale=True)
141
+
142
+ self.diameter = diameter
143
+ self.sigma_color = sigma_color
144
+ self.sigma_space = sigma_space
145
+
146
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
147
+ """Preprocess the image to be analyzed.
148
+
149
+ :param image: Loaded PIL image.
150
+
151
+ :return: The processed image and metadata (diameter, sigma_color, sigma_space).
152
+ """
153
+ image = self.convert_image_to_array(image)
154
+
155
+ # Apply bilateral filtering
156
+ filtered_image = cv2.bilateralFilter(
157
+ image,
158
+ self.diameter,
159
+ self.sigma_color,
160
+ self.sigma_space,
161
+ )
162
+
163
+ metadata = {
164
+ "diameter": self.diameter,
165
+ "sigma_color": self.sigma_color,
166
+ "sigma_space": self.sigma_space,
167
+ }
168
+
169
+ return Image.fromarray(filtered_image), metadata
170
+
171
+
172
+ class SegmentedAdaptiveThreshold(ImagePreprocessor):
173
+ """SegmentedAdaptiveThreshold class.
174
+
175
+ The class applies adaptive thresholding to an image
176
+ and returns the thresholded image and metadata.
177
+ The parameters used to run the adaptive thresholding are selected based on
178
+ the contrast level of the image.
179
+ """
180
+
181
+ def __init__(
182
+ self,
183
+ block_size: int = 5,
184
+ contrast_threshold: int = 40,
185
+ c_low_contrast: int = 10,
186
+ c_high_contrast: int = 40,
187
+ bg_threshold: int = 122,
188
+ ) -> None:
189
+ """Initialize the SegmentedAdaptiveThreshold class.
190
+
191
+ :param block_size: Size of the neighborhood area for threshold calculation.
192
+ :param contrast_threshold: Threshold for low contrast images.
193
+ :param c_low_contrast: Constant added to the mean for low contrast images.
194
+ :param c_high_contrast: Constant added to the mean for high contrast images.
195
+ :param bg_threshold: Threshold for background color.
196
+ """
197
+
198
+ super().__init__(use_greyscale=True)
199
+ self.block_size = block_size
200
+ self.c_low_contrast = c_low_contrast
201
+ self.c_high_contrast = c_high_contrast
202
+ self.bg_threshold = bg_threshold
203
+ self.contrast_threshold = contrast_threshold
204
+
205
+ def preprocess_image(
206
+ self, image: Union[Image.Image, np.ndarray]
207
+ ) -> Tuple[Image.Image, dict]:
208
+ """Preprocess the image.
209
+
210
+ :param image: Loaded PIL image.
211
+
212
+ :return: The processed image and metadata (C, background_color, contrast).
213
+ """
214
+ if not isinstance(image, np.ndarray):
215
+ image = self.convert_image_to_array(image)
216
+
217
+ # Determine background color
218
+ background_color = self._get_bg_color(image, True)
219
+ contrast, _ = self._get_image_contrast(image)
220
+
221
+ c = (
222
+ self.c_low_contrast
223
+ if contrast <= self.contrast_threshold
224
+ else self.c_high_contrast
225
+ )
226
+
227
+ if background_color < self.bg_threshold:
228
+ adaptive_threshold_image = cv2.adaptiveThreshold(
229
+ image,
230
+ 255,
231
+ cv2.ADAPTIVE_THRESH_MEAN_C,
232
+ cv2.THRESH_BINARY_INV,
233
+ self.block_size,
234
+ -c,
235
+ )
236
+ else:
237
+ adaptive_threshold_image = cv2.adaptiveThreshold(
238
+ image,
239
+ 255,
240
+ cv2.ADAPTIVE_THRESH_MEAN_C,
241
+ cv2.THRESH_BINARY,
242
+ self.block_size,
243
+ c,
244
+ )
245
+
246
+ metadata = {"C": c, "background_color": background_color, "contrast": contrast}
247
+ return Image.fromarray(adaptive_threshold_image), metadata
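A quick standalone check of the thresholder above, assuming a local scanned page (the file name is illustrative). Low-contrast images get the smaller constant `c`, and a dark background switches to the inverted threshold:

```python
from PIL import Image

thresholder = SegmentedAdaptiveThreshold(block_size=5, contrast_threshold=40)
page = Image.open("scan.png")

binary_image, info = thresholder.preprocess_image(page)
print(info)  # e.g. {"C": 10, "background_color": 255, "contrast": 35.2}
```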
248
+
249
+
250
+
251
+
252
+ class ImageRescaling(ImagePreprocessor):
253
+ """ImageRescaling class. Rescales images based on their size."""
254
+
255
+ def __init__(
256
+ self,
257
+ small_size: int = 1048576,
258
+ large_size: int = 4000000,
259
+ factor: int = 2,
260
+ interpolation: int = cv2.INTER_AREA,
261
+ ) -> None:
262
+ """Initialize the ImageRescaling class.
263
+
264
+ :param small_size: Threshold for small image size.
265
+ :param large_size: Threshold for large image size.
266
+ :param factor: Scaling factor for resizing.
267
+ :param interpolation: Interpolation method for resizing.
268
+ """
269
+ super().__init__(use_greyscale=True)
270
+
271
+ self.small_size = small_size
272
+ self.large_size = large_size
273
+ self.factor = factor
274
+ self.interpolation = interpolation
275
+
276
+ def preprocess_image(self, image: np.ndarray) -> Tuple[Image.Image, dict]:
277
+ """Preprocess the image to be analyzed.
278
+
279
+ :param image: Image pixels as a numpy array (its .size, the total pixel count, is compared against the size thresholds).
280
+
281
+ :return: The processed image and metadata (scale_factor).
282
+ """
283
+
284
+ scale_factor = 1
285
+ if image.size < self.small_size:
286
+ scale_factor = self.factor
287
+ elif image.size > self.large_size:
288
+ scale_factor = 1 / self.factor
289
+
290
+ width = int(image.shape[1] * scale_factor)
291
+ height = int(image.shape[0] * scale_factor)
292
+ dimensions = (width, height)
293
+
294
+ # resize image
295
+ rescaled_image = cv2.resize(image, dimensions, interpolation=self.interpolation)
296
+ metadata = {"scale_factor": scale_factor}
297
+ return Image.fromarray(rescaled_image), metadata
298
+
299
+
300
+ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
301
+ """Class containing all logic to perform contrastive segmentation.
302
+
303
+ Contrastive segmentation is a preprocessing step that aims to enhance the
304
+ text in an image by increasing the contrast between the text and the
305
+ background. The parameters used to run the preprocessing are selected based
306
+ on the contrast level of the image.
307
+ """
308
+
309
+ def __init__(
310
+ self,
311
+ bilateral_filter: Optional[BilateralFilter] = None,
312
+ adaptive_threshold: Optional[SegmentedAdaptiveThreshold] = None,
313
+ image_rescaling: Optional[ImageRescaling] = None,
314
+ low_contrast_threshold: int = 40,
315
+ ) -> None:
316
+ """Initialize the class.
317
+
318
+ :param bilateral_filter: Optional BilateralFilter instance.
319
+ :param adaptive_threshold: Optional AdaptiveThreshold instance.
320
+ :param image_rescaling: Optional ImageRescaling instance.
321
+ :param low_contrast_threshold: Threshold for low contrast images.
322
+ """
323
+
324
+ super().__init__(use_greyscale=True)
325
+ if not bilateral_filter:
326
+ self.bilateral_filter = BilateralFilter()
327
+ else:
328
+ self.bilateral_filter = bilateral_filter
329
+
330
+ if not adaptive_threshold:
331
+ self.adaptive_threshold = SegmentedAdaptiveThreshold()
332
+ else:
333
+ self.adaptive_threshold = adaptive_threshold
334
+
335
+ if not image_rescaling:
336
+ self.image_rescaling = ImageRescaling()
337
+ else:
338
+ self.image_rescaling = image_rescaling
339
+
340
+ self.low_contrast_threshold = low_contrast_threshold
341
+
342
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
343
+ """Preprocess the image to be analyzed.
344
+
345
+ :param image: Loaded PIL image.
346
+
347
+ :return: The processed image and metadata (background color, scale percentage,
348
+ contrast level, and C value).
349
+ """
350
+ image = self.convert_image_to_array(image)
351
+
352
+ # Apply bilateral filtering
353
+ filtered_image, _ = self.bilateral_filter.preprocess_image(image)
354
+
355
+ # Convert to grayscale
356
+ pil_filtered_image = Image.fromarray(np.uint8(filtered_image))
357
+ pil_grayscale_image = pil_filtered_image.convert("L")
358
+ grayscale_image = np.asarray(pil_grayscale_image)
359
+
360
+ # Improve contrast
361
+ adjusted_image, _, adjusted_contrast = self._improve_contrast(grayscale_image)
362
+
363
+ # Adaptive Thresholding
364
+ adaptive_threshold_image, _ = self.adaptive_threshold.preprocess_image(
365
+ adjusted_image
366
+ )
367
+ # Increase contrast
368
+ _, threshold_image = cv2.threshold(
369
+ np.asarray(adaptive_threshold_image),
370
+ 0,
371
+ 255,
372
+ cv2.THRESH_BINARY | cv2.THRESH_OTSU,
373
+ )
374
+
375
+ # Rescale image
376
+ rescaled_image, scale_metadata = self.image_rescaling.preprocess_image(
377
+ threshold_image
378
+ )
379
+
380
+ return rescaled_image, scale_metadata
381
+
382
+ def _improve_contrast(self, image: np.ndarray) -> Tuple[np.ndarray, str, str]:
383
+ """Improve the contrast of an image based on its initial contrast level.
384
+
385
+ :param image: Input image.
386
+
387
+ :return: A tuple containing the improved image, the initial contrast level,
388
+ and the adjusted contrast level.
389
+ """
390
+ contrast, mean_intensity = self._get_image_contrast(image)
391
+
392
+ if contrast <= self.low_contrast_threshold:
393
+ alpha = 1.5
394
+ beta = -mean_intensity * alpha
395
+ adjusted_image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
396
+ adjusted_contrast, _ = self._get_image_contrast(adjusted_image)
397
+ else:
398
+ adjusted_image = image
399
+ adjusted_contrast = contrast
400
+ return adjusted_image, contrast, adjusted_contrast
401
+
402
  class CustomImageAnalyzerEngine:
403
  def __init__(
404
  self,
405
  analyzer_engine: Optional[AnalyzerEngine] = None,
406
+ tesseract_config: Optional[str] = None,
407
+ image_preprocessor: Optional[ImagePreprocessor] = None
408
  ):
409
  if not analyzer_engine:
410
  analyzer_engine = AnalyzerEngine()
411
  self.analyzer_engine = analyzer_engine
412
  self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
413
 
414
+ if not image_preprocessor:
415
+ # image_preprocessor = ImagePreprocessor(
416
+ # c_low_contrast=10,
417
+ # c_high_contrast=20,
418
+ # contrast_threshold=0.5,
419
+ # bg_threshold=128,
420
+ # block_size=11
421
+ # )
422
+ image_preprocessor = ContrastSegmentedImageEnhancer()
423
+ print(image_preprocessor)
424
+ self.image_preprocessor = image_preprocessor
425
+
426
  def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
427
  # Ensure image is a PIL Image
428
  if isinstance(image, str):
 
430
  elif isinstance(image, np.ndarray):
431
  image = Image.fromarray(image)
432
 
433
+ image_processed, preprocessing_metadata = self.image_preprocessor.preprocess_image(image)
434
+
435
+ #print("pre-processing metadata:", preprocessing_metadata)
436
+ #image_processed.save("image_processed.png")
437
+
438
+ ocr_data = pytesseract.image_to_data(image_processed, output_type=pytesseract.Output.DICT, config=self.tesseract_config)
439
+
440
+ if preprocessing_metadata and ("scale_factor" in preprocessing_metadata):
441
+ ocr_result = self._scale_bbox_results(
442
+ ocr_data, preprocessing_metadata["scale_factor"]
443
+ )
+ else:
+ ocr_result = ocr_data # no rescaling was applied, so use the raw Tesseract output
444
+
445
+ ocr_result = self.remove_space_boxes(ocr_result)
446
 
447
  # Filter out empty strings and low confidence results
448
+ valid_indices = [i for i, text in enumerate(ocr_result['text']) if text.strip() and int(ocr_result['conf'][i]) > 0]
449
 
450
  return [
451
  OCRResult(
452
+ text=ocr_result['text'][i],
453
+ left=ocr_result['left'][i],
454
+ top=ocr_result['top'][i],
455
+ width=ocr_result['width'][i],
456
+ height=ocr_result['height'][i]
457
  )
458
  for i in valid_indices
459
  ]
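Putting the new `perform_ocr` path together: the page is enhanced, Tesseract reads the enhanced copy, and the resulting boxes are divided by `scale_factor` to land back on the original image. A minimal usage sketch, assuming the classes above are importable and a local page image (file name illustrative):

```python
from PIL import Image
import pytesseract

enhancer = ContrastSegmentedImageEnhancer()
page = Image.open("page.png")

# Bilateral filter -> contrast adjustment -> adaptive threshold -> rescale.
processed, metadata = enhancer.preprocess_image(page)

ocr_data = pytesseract.image_to_data(
    processed, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 11"
)

# The boxes were measured on the rescaled copy; undo the scaling.
scale = metadata.get("scale_factor", 1)
lefts = [int(x / scale) for x in ocr_data["left"]]
```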
 
485
  text=relevant_text,
486
  left=ocr_result.left + self.estimate_x_offset(ocr_result, result.start),
487
  top=ocr_result.top,
488
+ width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
489
  height=ocr_result.height
490
  )
491
 
 
531
  text_position = word_end + 1 # +1 for the space between words
532
 
533
  return pii_bboxes
534
+
535
+ @staticmethod
536
+ def remove_space_boxes(ocr_result: dict) -> dict:
537
+ """Remove OCR bboxes that are for spaces.
538
+
539
+ :param ocr_result: OCR results (raw or thresholded).
540
+ :return: OCR results with empty words removed.
541
+ """
542
+ # Get indices of items with no text
543
+ idx = list()
544
+ for i, text in enumerate(ocr_result["text"]):
545
+ is_not_space = text.isspace() is False
546
+ if text != "" and is_not_space:
547
+ idx.append(i)
548
+
549
+ # Only retain items with text
550
+ filtered_ocr_result = {}
551
+ for key in list(ocr_result.keys()):
552
+ filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
553
+
554
+ return filtered_ocr_result
555
+
556
+ @staticmethod
557
+ def _scale_bbox_results(
558
+ ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
559
+ ) -> Dict[str, List[Union[int, str]]]:
560
+ """Scale down the bounding box results based on a scale percentage.
561
+
562
+ :param ocr_result: OCR results (raw).
563
+ :param scale_factor: Scale factor applied to the image during preprocessing.
564
+
565
+ :return: OCR results (scaled).
566
+ """
567
+ scaled_results = deepcopy(ocr_result)
568
+ coordinate_keys = ["left", "top"]
569
+ dimension_keys = ["width", "height"]
570
+
571
+ for coord_key in coordinate_keys:
572
+ scaled_results[coord_key] = [
573
+ int(np.ceil((x) / (scale_factor))) for x in scaled_results[coord_key]
574
+ ]
575
+
576
+ for dim_key in dimension_keys:
577
+ scaled_results[dim_key] = [
578
+ max(1, int(np.ceil(x / (scale_factor))))
579
+ for x in scaled_results[dim_key]
580
+ ]
581
+ return scaled_results
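
The preprocessor may resize a page before OCR runs, and `_scale_bbox_results` then divides every reported coordinate back down by the recorded `scale_factor` so the boxes land on the original image. A minimal sketch of that round trip, with an invented two-word OCR dict and an assumed 2x upscale:

    import numpy as np
    from copy import deepcopy

    # Pretend Tesseract ran on an image the preprocessor had doubled in size
    ocr_data = {"text": ["Name:", "Jane"], "conf": ["96", "91"],
                "left": [100, 220], "top": [50, 50], "width": [110, 90], "height": [24, 24]}
    scale_factor = 2.0  # assumption: metadata recorded a 2x resize

    scaled = deepcopy(ocr_data)
    for key in ["left", "top"]:
        scaled[key] = [int(np.ceil(x / scale_factor)) for x in scaled[key]]
    for key in ["width", "height"]:
        scaled[key] = [max(1, int(np.ceil(x / scale_factor))) for x in scaled[key]]

    print(scaled["left"], scaled["width"])  # [50, 110] [55, 45] - back in page coordinates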

     @staticmethod
     def estimate_x_offset(full_text: str, start: int) -> int:
         # Estimate the x-offset based on character position
         # This is a simple estimation and might need refinement for variable-width fonts
         return int(start / len(full_text) * len(full_text))
+
+    def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+        # Extract the relevant text portion
+        relevant_text = ocr_result.text[start:end]
+
+        # If the relevant text is the same as the full text, return the full width
+        if relevant_text == ocr_result.text:
+            return ocr_result.width
+
+        # Estimate width based on the proportion of the relevant text length to the total text length
+        total_text_length = len(ocr_result.text)
+        relevant_text_length = len(relevant_text)
+
+        if total_text_length == 0:
+            return 0  # Avoid division by zero
+
+        # Proportion of the relevant text to the total text
+        proportion = relevant_text_length / total_text_length
+
+        # Estimate the width based on the proportion
+        estimated_width = int(proportion * ocr_result.width)
+
+        return estimated_width
+
+
+    # def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+    #     # Extract the relevant text portion
+    #     relevant_text = ocr_result.text[start:end]
+
+    #     # Check if the relevant text is the entire text of the OCR result
+    #     if relevant_text == ocr_result.text:
+    #         return ocr_result.width
+
+    #     # Estimate the font size based on the height of the bounding box
+    #     estimated_font_size = ocr_result.height + 4
+
+    #     # Create a blank image with enough width to measure the text
+    #     dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
+    #     draw = ImageDraw.Draw(dummy_image)
+
+    #     # Specify the font and size
+    #     try:
+    #         font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
+    #     except IOError:
+    #         font = ImageFont.load_default() # Fallback to default font if the specified font is not found
+
+    #     # Draw the relevant text on the image
+    #     draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
+
+    #     # Save the image for debugging purposes
+    #     dummy_image.save("debug_image.png")
+
+    #     # Use pytesseract to get the bounding box of the relevant text
+    #     bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
+
+    #     # Print the bbox for debugging
+    #     print("Bounding box:", bbox)
+
+    #     # Calculate the width from the bounding box
+    #     if bbox:
+    #         try:
+    #             # Initialize min_left and max_right with extreme values
+    #             min_left = float('inf')
+    #             max_right = float('-inf')
+
+    #             # Split the bbox string into lines
+    #             bbox_lines = bbox.splitlines()
+
+    #             for line in bbox_lines:
+    #                 parts = line.split()
+    #                 if len(parts) == 6:
+    #                     _, left, _, right, _, _ = parts
+    #                     left = int(left)
+    #                     right = int(right)
+    #                     min_left = min(min_left, left)
+    #                     max_right = max(max_right, right)
+
+    #             width = max_right - min_left
+    #         except ValueError as e:
+    #             print("Error parsing bounding box:", e)
+    #             width = 0
+    #     else:
+    #         width = 0
+
+    #     print("Estimated width:", width)
+
+    #     return width
+

 # Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold=20, y_threshold=3):
     # Sort OCR results by 'top' to ensure line order
     ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))

     combined_results = []
+    new_format_results = {}
     current_line = []
     current_bbox = None
+    line_counter = 1

     for result in ocr_results:
         if not current_line:

         else:
             # Commit the current line and start a new one
             combined_results.append(current_bbox)
+            new_format_results[current_bbox.text] = {  # f"combined_text_{line_counter}"
+                'bounding_box': (current_bbox.left, current_bbox.top,
+                                 current_bbox.left + current_bbox.width,
+                                 current_bbox.top + current_bbox.height),
+                'words': [{'text': word.text,
+                           'bounding_box': (word.left, word.top,
+                                            word.left + word.width,
+                                            word.top + word.height)}
+                          for word in current_line]
+            }
+            line_counter += 1
             current_line = [result]
             current_bbox = result

     # Append the last line
     if current_bbox:
         combined_results.append(current_bbox)
+        new_format_results[current_bbox.text] = {  # f"combined_text_{line_counter}"
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+
+    return combined_results, new_format_results
 
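One caveat worth noting: `new_format_results` is keyed by the combined line text (a `combined_text_{line_counter}` key is left as a comment), so two identical lines on the same page will overwrite one another in the dictionary. A hypothetical illustration of the returned shape for a one-line page, with invented coordinates:

    # combined_results would hold one line-level OCRResult covering "Name: Jane";
    # new_format_results maps that line's text to its line box plus per-word boxes:
    new_format_results = {
        'Name: Jane': {
            'bounding_box': (100, 50, 300, 74),   # (left, top, right, bottom)
            'words': [{'text': 'Name:', 'bounding_box': (100, 50, 210, 74)},
                      {'text': 'Jane', 'bounding_box': (220, 50, 300, 74)}],
        },
    }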
 
tools/data_anonymise.py CHANGED
@@ -195,7 +195,9 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     df_dict = df.to_dict(orient="list")

     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
@@ -371,7 +373,9 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos


     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     anon_df = pd.DataFrame()
     #out_file_paths = []
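The rationale appears to be that the allow list now arrives already flat (for example, a single column read from the custom allow-list CSV), so the old list-of-lists flattening would instead iterate each string and explode it into characters. A quick, runnable illustration of the failure mode the new guard avoids (the names are made up):

    in_allow_list = ["Jane Doe", "ACME Ltd"]          # already flat, as now passed in

    old_flatten = [item for sublist in in_allow_list for item in sublist]
    print(old_flatten[:5])     # ['J', 'a', 'n', 'e', ' '] - iterating strings yields characters

    in_allow_list_flat = in_allow_list if in_allow_list else []
    print(in_allow_list_flat)  # ['Jane Doe', 'ACME Ltd']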
tools/file_conversion.py CHANGED
@@ -98,7 +98,33 @@ def process_file(file_path):

     return img_object

-def prepare_image_or_text_pdf(
+def get_input_file_names(file_input):
+    '''
+    Get list of input files to report to logs.
+    '''
+
+    all_relevant_files = []
+
+    for file in file_input:
+        file_path = file.name
+        print(file_path)
+        file_path_without_ext = get_file_path_end(file_path)
+
+        #print("file:", file_path)
+
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        # Check if the file is an image type
+        if file_extension in ['.jpg', '.jpeg', '.png', '.xlsx', '.csv', '.parquet']:
+            all_relevant_files.append(file_path_without_ext)
+
+    all_relevant_files_str = ", ".join(all_relevant_files)
+
+    print("all_relevant_files_str:", all_relevant_files_str)
+
+    return all_relevant_files_str
+
+def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
     in_allow_list: Optional[List[List[str]]] = None,
@@ -159,6 +185,8 @@ def prepare_image_or_text_pdf(

     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

+    progress(0.1, desc='Preparing file')
+
     file_paths_loop = [file_paths[int(latest_file_completed)]]
     #print("file_paths_loop:", str(file_paths_loop))

@@ -173,7 +201,7 @@ def prepare_image_or_text_pdf(

         # Check if the file is an image type
         if file_extension in ['.jpg', '.jpeg', '.png']:
-            in_redact_method = "Image analysis"
+            in_redact_method = "Quick image analysis - typed text"

         # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
         if file_extension in ['.json']:
@@ -191,7 +219,7 @@ def prepare_image_or_text_pdf(
             print(out_message)
             return out_message, out_file_paths

-        if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
             # Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -201,7 +229,7 @@ def prepare_image_or_text_pdf(
             out_file_path = process_file(file_path)
             #print("Out file path at image conversion step:", out_file_path)

-        elif in_redact_method == "Text analysis":
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":
             if is_pdf(file_path) == False:
                 out_message = "Please upload a PDF file for text analysis."
                 print(out_message)
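`get_input_file_names` expects the objects Gradio's file input produces (anything exposing the temporary file path as `.name`). A minimal sketch of how it behaves, using stand-in objects instead of real uploads and assuming `get_file_path_end` returns the file name without its extension; note that '.pdf' is absent from the extension list, so PDF names would not be reported:

    from types import SimpleNamespace
    from tools.file_conversion import get_input_file_names

    # Stand-ins for Gradio upload objects, which expose the temp file path as .name
    uploads = [SimpleNamespace(name="/tmp/scan_page1.png"),
               SimpleNamespace(name="/tmp/contacts.csv"),
               SimpleNamespace(name="/tmp/report.pdf")]   # hypothetical file names

    print(get_input_file_names(uploads))  # expected: "scan_page1, contacts"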
tools/file_redaction.py CHANGED
@@ -4,10 +4,10 @@ import json
 import io
 import os
 from PIL import Image, ImageChops, ImageDraw
-from typing import List
+from typing import List, Dict
 import pandas as pd

-from presidio_image_redactor.entities import ImageRecognizerResult
+#from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
 from pikepdf import Pdf, Dictionary, Name
@@ -20,15 +20,38 @@ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRRes
 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult

-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=gr.Progress(track_tqdm=True)):
+def sum_numbers_before_seconds(string:str):
+    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+    Args:
+        string: The input string.
+
+    Returns:
+        The sum of all numbers before 'seconds' in the string.
+    """
+
+    # Extract numbers before 'seconds' using regular expression
+    numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
+
+    # Extract the numbers from the matches
+    numbers = [float(num.split()[0]) for num in numbers]
+
+    # Sum up the extracted numbers
+    sum_of_numbers = round(sum(numbers),1)
+
+    return sum_of_numbers
+
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
+    '''
+    Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
+    '''

     tic = time.perf_counter()
-    all_request_metadata = []
-    all_request_metadata_str = ""
+    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []

     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -48,36 +71,164 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
     if latest_file_completed >= len(file_paths):
         print("Last file reached")
-        # Set to a very high number so as not to mess with subsequent file processing by the user
+        # Set to a very high number so as not to mix up with subsequent file processing by the user
         latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
         #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."

-    def sum_numbers_before_seconds(string):
-        """Extracts numbers that precede the word 'seconds' from a string and adds them up.
-
-        Args:
-            string: The input string.
-
-        Returns:
-            The sum of all numbers before 'seconds' in the string.
-        """
-
-        # Extract numbers before 'seconds' using regular expression
-        numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
-
-        # Extract the numbers from the matches
-        numbers = [float(num.split()[0]) for num in numbers]
-
-        # Sum up the extracted numbers
-        sum_of_numbers = round(sum(numbers),1)
-
-        return sum_of_numbers
+        estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
+        print("Estimated total processing time:", str(estimate_total_processing_time))
+
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
+
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
+
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list[0].tolist()
+        print("In allow list:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
+        file_path = file.name
+
+        if file_path:
+            file_path_without_ext = get_file_path_end(file_path)
+            is_a_pdf = is_pdf(file_path) == True
+            if is_a_pdf == False:
+                # If user has not submitted a pdf, assume it's an image
+                print("File is not a pdf, assuming that image analysis needs to be used.")
+                in_redact_method = "Quick image analysis - typed text"
+        else:
+            out_message = "No file selected"
+            print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+            #Analyse and redact image-based pdf or image
+            if is_pdf_or_image(file_path) == False:
+                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+
+            print("Redacting file " + file_path_without_ext + " as an image-based file")
+            pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+
+            # Save file
+            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+
+            out_file_paths.append(out_image_file_path)
+            if logging_file_paths:
+                log_files_output_paths.extend(logging_file_paths)
+
+            out_message.append("File '" + file_path_without_ext + "' successfully redacted")
+
+            # Save decision making process
+            output_logs_str = str(output_logs)
+            logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
+            with open(logs_output_file_name, "w") as f:
+                f.write(output_logs_str)
+            log_files_output_paths.append(logs_output_file_name)
+
+            # Save Textract request metadata (if exists)
+            if new_request_metadata:
+                print("Request metadata:", new_request_metadata)
+                all_request_metadata.append(new_request_metadata)
+
+            # Increase latest file completed count unless we are at the last file
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
+
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+
+            if is_pdf(file_path) == False:
+                return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
+
+            # Analyse text-based pdf
+            print('Redacting file as text-based PDF')
+import time
+import re
+import json
+import io
+import os
+from PIL import Image, ImageChops, ImageDraw
+from typing import List, Dict
+import pandas as pd
+
+#from presidio_image_redactor.entities import ImageRecognizerResult
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
+from pikepdf import Pdf, Dictionary, Name
+import gradio as gr
+from gradio import Progress
+
+from collections import defaultdict  # For efficient grouping
+
+from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
+from tools.file_conversion import process_file
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
+from tools.data_anonymise import generate_decision_process_output
+from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
+
+def sum_numbers_before_seconds(string:str):
+    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+    Args:
+        string: The input string.
+
+    Returns:
+        The sum of all numbers before 'seconds' in the string.
+    """
+
+    # Extract numbers before 'seconds' using regular expression
+    numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
+
+    # Extract the numbers from the matches
+    numbers = [float(num.split()[0]) for num in numbers]
+
+    # Sum up the extracted numbers
+    sum_of_numbers = round(sum(numbers),1)
+
+    return sum_of_numbers
+
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
+    '''
+    Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
+    '''
+
+    tic = time.perf_counter()
+    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        #out_message = []
+        out_file_paths = []
+
+    # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached")
+        # Set to a very high number so as not to mix up with subsequent file processing by the user
+        latest_file_completed = 99
+        final_out_message = '\n'.join(out_message)
+        #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."

     estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
     print("Estimated total processing time:", str(estimate_total_processing_time))

-    return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str

     file_paths_loop = [file_paths[int(latest_file_completed)]]

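Because Gradio passes state between calls most easily as strings, `choose_and_run_redactor` now receives the accumulated Textract request metadata as one newline-joined string, splits it on entry, appends any new entries, and joins it again on return. A minimal sketch of that round trip, with invented metadata values:

    all_request_metadata_str = "req-id-0001\nreq-id-0002"   # carried over from the previous call

    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
    all_request_metadata.append("req-id-0003")              # e.g. returned by redact_image_pdf

    all_request_metadata_str = '\n'.join(all_request_metadata)
    print(all_request_metadata_str)                         # three lines, ready for the next call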
@@ -87,7 +238,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     else:
         in_allow_list_flat = []

-
     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name

@@ -97,19 +247,20 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             if is_a_pdf == False:
                 # If user has not submitted a pdf, assume it's an image
                 print("File is not a pdf, assuming that image analysis needs to be used.")
-                in_redact_method = "Image analysis"
+                in_redact_method = "Quick image analysis - typed text"
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-        if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
-            # Analyse and redact image-based pdf or image
-            # if is_pdf_or_image(file_path) == False:
-            #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+            #Analyse and redact image-based pdf or image
+            if is_pdf_or_image(file_path) == False:
+                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-            print("Redacting file" + file_path_without_ext + "as an image-based file")
-            pdf_images, output_logs, logging_file_paths, request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+            print("Redacting file " + file_path_without_ext + " as an image-based file")
+            pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

             # Save file
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
@@ -128,30 +279,29 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            # Save Textract request metadata (if exists)
-            if request_metadata:
-                print("Request metadata:", all_request_metadata)
-                all_request_metadata.append(request_metadata)
+            # Save Textract request metadata (if exists)
+            if new_request_metadata:
+                print("Request metadata:", new_request_metadata)
+                all_request_metadata.append(new_request_metadata)

             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed))
                 latest_file_completed += 1

-        elif in_redact_method == "Text analysis":
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":

             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Text analysis")
+            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)

             # Convert message
             convert_message="Converting PDF to image-based PDF to embed redactions."
-            #progress(0.8, desc=convert_message)
             print(convert_message)

             # Convert document to image-based document to 'embed' redactions
@@ -164,10 +314,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
-
-            #out_file_paths.append(out_text_file_path)
             out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
             out_message.append(out_message_new)

@@ -178,8 +324,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
-
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

     toc = time.perf_counter()
     out_time = f"in {toc - tic:0.1f} seconds."

@@ -188,48 +333,105 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time

-    # If textract requests made, write to logging file
+    # If textract requests made, write to logging file
     if all_request_metadata:
         all_request_metadata_str = '\n'.join(all_request_metadata)

-        print("all_request_metadata_file_path")
-        all_request_metadata_file_path = output_folder + "textract_request_metadata.txt"
+        all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"

         with open(all_request_metadata_file_path, "w") as f:
             f.write(all_request_metadata_str)
-        log_files_output_paths.append(all_request_metadata_file_path)
+
+        # Add the request metadata to the log outputs if not there already
+        if all_request_metadata_file_path not in log_files_output_paths:
+            log_files_output_paths.append(all_request_metadata_file_path)

     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_recogniser_results = [], handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
+
+def bounding_boxes_overlap(box1, box2):
+    """Check if two bounding boxes overlap."""
+    return (box1[0] < box2[2] and box2[0] < box1[2] and
+            box1[1] < box2[3] and box2[1] < box1[3])
+
+def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

+    # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
-
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
             print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)
-

         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:", handwriting_recogniser_results)
+            print("Signature boxes exist at merge:", signature_recogniser_results)
             bboxes.extend(signature_recogniser_results)

-    # 1. Group by approximate vertical proximity
-    for box in bboxes:
+    # Reconstruct bounding boxes for substrings of interest
+    reconstructed_bboxes = []
+    for bbox in bboxes:
+        bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
+        for line_text, line_info in combined_results.items():
+            line_box = line_info['bounding_box']
+            if bounding_boxes_overlap(bbox_box, line_box):
+                if bbox.text in line_text:
+                    start_char = line_text.index(bbox.text)
+                    end_char = start_char + len(bbox.text)
+
+                    relevant_words = []
+                    current_char = 0
+                    for word in line_info['words']:
+                        word_end = current_char + len(word['text'])
+                        if current_char <= start_char < word_end or current_char < end_char <= word_end:
+                            relevant_words.append(word)
+                        if word_end >= end_char:
+                            break
+                        current_char = word_end
+                        if not word['text'].endswith(' '):
+                            current_char += 1  # +1 for the space if the word doesn't already end with one
+
+                    if relevant_words:
+                        print("Relevant words:", relevant_words)
+                        left = min(word['bounding_box'][0] for word in relevant_words)
+                        top = min(word['bounding_box'][1] for word in relevant_words)
+                        right = max(word['bounding_box'][2] for word in relevant_words)
+                        bottom = max(word['bounding_box'][3] for word in relevant_words)
+
+                        # Combine the text of the relevant words
+                        combined_text = " ".join(word['text'] for word in relevant_words)
+
+                        reconstructed_bbox = CustomImageRecognizerResult(
+                            bbox.entity_type,
+                            bbox.start,
+                            bbox.end,
+                            bbox.score,
+                            left,
+                            top,
+                            right - left,  # width
+                            bottom - top,  # height
+                            combined_text
+                        )
+                        reconstructed_bboxes.append(reconstructed_bbox)
+                        break
+        else:
+            # If the bbox text is not found in any line in combined_results, keep the original bbox
+            reconstructed_bboxes.append(bbox)
+
+    # Group reconstructed bboxes by approximate vertical proximity
+    for box in reconstructed_bboxes:
         grouped_bboxes[round(box.top / vertical_threshold)].append(box)

-    # 2. Merge within each group
+    # Merge within each group
     for _, group in grouped_bboxes.items():
         group.sort(key=lambda box: box.left)

         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
-                #print("Merging a box")
                 # Calculate new dimensions for the merged box
-                #print("Merged box:", merged_box)
                 if merged_box.text == next_box.text:
                     new_text = merged_box.text
                 else:
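The grouping step buckets boxes by `round(box.top / vertical_threshold)` and then merges boxes within a bucket whose horizontal gap is at most `horizontal_threshold`. A toy illustration of that bucketing arithmetic using the defaults from the signature above (the coordinates are invented):

    vertical_threshold = 25

    tops = [48, 52, 148]
    print([round(t / vertical_threshold) for t in tops])  # [2, 2, 6] - the first two share a bucket

    # Within a bucket, two boxes merge when the gap between them is small enough:
    merged_right = 100 + 80                  # merged_box.left + merged_box.width
    next_left = 250
    print(next_left - merged_right <= 150)   # True with the default horizontal_threshold of 150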
@@ -247,9 +449,10 @@ def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_reco
             merged_box = next_box

     merged_bboxes.append(merged_box)
+
     return merged_bboxes

-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Image analysis", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=Progress(track_tqdm=True)):
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
     '''
     Take a path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
@@ -259,7 +462,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     fill = (0, 0, 0) # Fill colour
     decision_process_output_str = ""
     images = []
-    request_metadata = {}
+    #request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

     if not image_paths:
@@ -297,11 +500,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     all_ocr_results = []
     all_decision_process = []

-    if analysis_type == "Image analysis": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-    elif analysis_type == "AWS Textract": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+    elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"

     for n in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
+        signature_recogniser_results = []
+        handwriting_recogniser_results = []

         try:
             image = image_paths[0][n]#.copy()
@@ -339,17 +544,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             else: ocr_lang = language

             # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
-            if analysis_type == "Image analysis":
+            if analysis_type == "Quick image analysis - typed text":

                 ocr_results = image_analyser.perform_ocr(image)

                 # Combine OCR results
-                ocr_results = combine_ocr_results(ocr_results)
+                ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
+
+                # Save decision making process
+                ocr_results_with_children_str = str(ocr_results_with_children)
+                logs_output_file_name = output_folder + "ocr_with_children.txt"
+                with open(logs_output_file_name, "w") as f:
+                    f.write(ocr_results_with_children_str)

             # Import results from json and convert
-            if analysis_type == "AWS Textract":
-
+            if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":

                 # Convert the image to bytes using an in-memory buffer
                 image_buffer = io.BytesIO()
                 image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
@@ -358,8 +568,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"

                 if not os.path.exists(json_file_path):
-                    text_blocks, request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
+                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
+                    request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # Open the file and load the JSON data
                     print("Found existing Textract json results file for this page.")
@@ -367,7 +578,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                     text_blocks = json.load(json_file)
                     text_blocks = text_blocks['Blocks']

-                ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results = json_to_ocrresult(text_blocks, page_width, page_height)
+                ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
+
+                # Save decision making process
+                ocr_results_with_children_str = str(ocr_results_with_children)
+                logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
+                with open(logs_output_file_name, "w") as f:
+                    f.write(ocr_results_with_children_str)

             # Step 2: Analyze text and identify PII
             bboxes = image_analyser.analyze_text(
@@ -376,10 +593,18 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 entities=chosen_redact_entities,
                 allow_list=allow_list,
                 score_threshold=score_threshold,
-            )
+            )
+
+            if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+            elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+
+            # Save decision making process
+            bboxes_str = str(bboxes)
+            with open(interim_results_file_path, "w") as f:
+                f.write(bboxes_str)

             # Merge close bounding boxes
-            merged_bboxes = merge_img_bboxes(bboxes, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
+            merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

             # Export the decision making process
             if merged_bboxes:
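The Textract branch above caches each page's response as `<file>_page_<n>_textract.json`, so a re-run redacts from the saved blocks instead of issuing a new (billable) request. A minimal sketch of the caching pattern, with the analyse call passed in as a parameter rather than imported, since this is an illustration and not the app's own helper:

    import json, os

    def get_text_blocks(pdf_page_as_bytes, json_file_path, analyse_page_with_textract):
        # Only call the paid Textract API when no cached response exists for this page
        if not os.path.exists(json_file_path):
            text_blocks, metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path)
            return text_blocks
        print("Found existing Textract json results file for this page.")
        with open(json_file_path) as json_file:
            return json.load(json_file)['Blocks']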
@@ -434,82 +659,19 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
     return [], []

 # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
-# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
-#     '''
-#     Merge identified bounding boxes containing PII that are very close to one another
-#     '''
-#     analyzed_bounding_boxes = []
-#     if len(analyzer_results) > 0 and len(characters) > 0:
-#         merged_bounding_boxes = []
-#         current_box = None
-#         current_y = None
-
-#         for i, result in enumerate(analyzer_results):
-#             print("Considering result", str(i))
-#             for char in characters[result.start : result.end]:
-#                 if isinstance(char, LTChar):
-#                     char_box = list(char.bbox)
-#                     # Add vertical padding to the top of the box
-#                     char_box[3] += vertical_padding
-
-#                     if current_y is None or current_box is None:
-#                         current_box = char_box
-#                         current_y = char_box[1]
-#                     else:
-#                         vertical_diff_bboxes = abs(char_box[1] - current_y)
-#                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
-
-#                         if (
-#                             vertical_diff_bboxes <= 5
-#                             and horizontal_diff_bboxes <= combine_pixel_dist
-#                         ):
-#                             current_box[2] = char_box[2]  # Extend the current box horizontally
-#                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
-#                         else:
-#                             merged_bounding_boxes.append(
-#                                 {"boundingBox": current_box, "result": result})
-
-#                             # Reset current_box and current_y after appending
-#                             current_box = char_box
-#                             current_y = char_box[1]
-
-#             # After finishing with the current result, add the last box for this result
-#             if current_box:
-#                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
-#                 current_box = None
-#                 current_y = None  # Reset for the next result
-
-#         if not merged_bounding_boxes:
-#             analyzed_bounding_boxes.extend(
-#                 {"boundingBox": char.bbox, "result": result}
-#                 for result in analyzer_results
-#                 for char in characters[result.start:result.end]
-#                 if isinstance(char, LTChar)
-#             )
-#         else:
-#             analyzed_bounding_boxes.extend(merged_bounding_boxes)
-
-#     print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
-
-#     return analyzed_bounding_boxes
-
-def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
     '''
-    Merge identified bounding boxes containing PII or signatures that are very close to one another.
+    Merge identified bounding boxes containing PII that are very close to one another
     '''
     analyzed_bounding_boxes = []
-    merged_bounding_boxes = []
-    current_box = None
-    current_y = None
-
-    # Handle PII and text bounding boxes first
     if len(analyzer_results) > 0 and len(characters) > 0:
-        for i, result in enumerate(analyzer_results):
-            #print("Considering result", str(i))
-            #print("Result:", result)
-            #print("Characters:", characters)
+        merged_bounding_boxes = []
+        current_box = None
+        current_y = None

-            for char in characters[result.start: result.end]:
+        for i, result in enumerate(analyzer_results):
+            print("Considering result", str(i))
+            for char in characters[result.start : result.end]:
                 if isinstance(char, LTChar):
                     char_box = list(char.bbox)
                     # Add vertical padding to the top of the box
@@ -535,58 +697,121 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, verti
                         # Reset current_box and current_y after appending
                         current_box = char_box
                         current_y = char_box[1]
-
+
             # After finishing with the current result, add the last box for this result
             if current_box:
                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
                 current_box = None
                 current_y = None # Reset for the next result

-    # Handle signature bounding boxes (without specific characters)
-    if signature_bounding_boxes is not None:
-        for sig_box in signature_bounding_boxes:
-            sig_box = list(sig_box) # Ensure it's a list to modify the values
-            if current_y is None or current_box is None:
-                current_box = sig_box
-                current_y = sig_box[1]
-            else:
-                vertical_diff_bboxes = abs(sig_box[1] - current_y)
-                horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
-
-                if (
-                    vertical_diff_bboxes <= 5
-                    and horizontal_diff_bboxes <= combine_pixel_dist
-                ):
-                    current_box[2] = sig_box[2] # Extend the current box horizontally
-                    current_box[3] = max(current_box[3], sig_box[3]) # Ensure the top is the highest
-                else:
-                    merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-
-                    # Reset current_box and current_y after appending
-                    current_box = sig_box
-                    current_y = sig_box[1]
-
-        # Add the last bounding box for the signature
-        if current_box:
-            merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-            current_box = None
-            current_y = None
-
-    # If no bounding boxes were merged, add individual character bounding boxes
-    if not merged_bounding_boxes:
-        analyzed_bounding_boxes.extend(
-            {"boundingBox": char.bbox, "result": result}
-            for result in analyzer_results
-            for char in characters[result.start:result.end]
-            if isinstance(char, LTChar)
-        )
-    else:
-        analyzed_bounding_boxes.extend(merged_bounding_boxes)
+        if not merged_bounding_boxes:
+            analyzed_bounding_boxes.extend(
+                {"boundingBox": char.bbox, "result": result}
+                for result in analyzer_results
+                for char in characters[result.start:result.end]
+                if isinstance(char, LTChar)
+            )
+        else:
+            analyzed_bounding_boxes.extend(merged_bounding_boxes)

-    #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+    print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)

     return analyzed_bounding_boxes

+# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+#     '''
+#     Merge identified bounding boxes containing PII or signatures that are very close to one another.
+#     '''
+#     analyzed_bounding_boxes = []
+#     merged_bounding_boxes = []
+#     current_box = None
+#     current_y = None
+
+#     # Handle PII and text bounding boxes first
+#     if len(analyzer_results) > 0 and len(characters) > 0:
+#         for i, result in enumerate(analyzer_results):
+#             #print("Considering result", str(i))
+#             #print("Result:", result)
+#             #print("Characters:", characters)
+
+#             for char in characters[result.start: result.end]:
+#                 if isinstance(char, LTChar):
+#                     char_box = list(char.bbox)
+#                     # Add vertical padding to the top of the box
+#                     char_box[3] += vertical_padding
+
+#                     if current_y is None or current_box is None:
+#                         current_box = char_box
+#                         current_y = char_box[1]
+#                     else:
+#                         vertical_diff_bboxes = abs(char_box[1] - current_y)
+#                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+
+#                         if (
+#                             vertical_diff_bboxes <= 5
+#                             and horizontal_diff_bboxes <= combine_pixel_dist
+#                         ):
+#                             current_box[2] = char_box[2] # Extend the current box horizontally

 def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
     decision_process_table = pd.DataFrame()

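The now-active `merge_bounding_boxes` extends a box while consecutive `LTChar` boxes stay within 5 points vertically and `combine_pixel_dist` points horizontally. A toy run of that rule on three invented character boxes in pdfminer-style (x0, y0, x1, y1) coordinates:

    combine_pixel_dist = 20
    char_boxes = [[10, 100, 18, 112], [19, 100, 27, 112], [120, 100, 128, 112]]

    merged, current = [], list(char_boxes[0])
    for box in char_boxes[1:]:
        if abs(box[1] - current[1]) <= 5 and abs(box[0] - current[2]) <= combine_pixel_dist:
            current[2] = box[2]                    # extend horizontally
            current[3] = max(current[3], box[3])   # keep the tallest top
        else:
            merged.append(current)
            current = list(box)
    merged.append(current)

    print(merged)  # [[10, 100, 27, 112], [120, 100, 128, 112]] - two spans, not three characters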
@@ -625,14 +850,14 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
         annotations_on_page.append(annotation)
     return annotations_on_page

-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Text analysis", progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
     decision_process_table_all_pages = []

-    combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one

     pdf = Pdf.open(filename)
     page_num = 0
@@ -674,7 +899,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     text_container_analyzed_bounding_boxes = []
     characters = []

-    if analysis_type == "Text analysis":
         for i, text_container in enumerate(page_layout):

             text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
@@ -686,11 +911,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
             page_analyzer_results.extend(text_container_analyzer_results)

-            # Merge bounding boxes if very close together
-            text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
-
-            page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
-            page_analyzer_results.extend(text_container_analyzer_results)

         decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

4
  import io
5
  import os
6
  from PIL import Image, ImageChops, ImageDraw
7
+ from typing import List, Dict
8
  import pandas as pd
9
 
10
+ #from presidio_image_redactor.entities import ImageRecognizerResult
11
  from pdfminer.high_level import extract_pages
12
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
13
  from pikepdf import Pdf, Dictionary, Name
 
20
  from tools.file_conversion import process_file
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
22
  from tools.helper_functions import get_file_path_end, output_folder
23
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
24
  from tools.data_anonymise import generate_decision_process_output
25
  from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
26
 
27
+ def sum_numbers_before_seconds(string:str):
28
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
29
+
30
+ Args:
31
+ string: The input string.
32
+
33
+ Returns:
34
+ The sum of all numbers before 'seconds' in the string.
35
+ """
36
+
37
+ # Extract numbers before 'seconds' using regular expression
38
+ numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
39
+
40
+ # Extract the numbers from the matches
41
+ numbers = [float(num.split()[0]) for num in numbers]
42
+
43
+ # Sum up the extracted numbers
44
+ sum_of_numbers = round(sum(numbers),1)
45
+
46
+ return sum_of_numbers
47
+
48
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
49
+ '''
50
+ Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
51
+ '''
52
 
53
  tic = time.perf_counter()
54
+ all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 
55
 
56
  # If this is the first time around, set variables to 0/blank
57
  if first_loop_state==True:
 
71
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
72
  if latest_file_completed >= len(file_paths):
73
  print("Last file reached")
74
+ # Set to a very high number so as not to mix up with subsequent file processing by the user
75
  latest_file_completed = 99
76
  final_out_message = '\n'.join(out_message)
77
  #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
78
+
79
+ estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
80
+ print("Estimated total processing time:", str(estimate_total_processing_time))
81
+
82
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
83
+
84
+ file_paths_loop = [file_paths[int(latest_file_completed)]]
85
+
86
+ if not in_allow_list.empty:
87
+ in_allow_list_flat = in_allow_list[0].tolist()
88
+ print("In allow list:", in_allow_list_flat)
89
+ else:
90
+ in_allow_list_flat = []
91
+
92
+ for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
93
+ file_path = file.name
94
+
95
+ if file_path:
96
+ file_path_without_ext = get_file_path_end(file_path)
97
+ is_a_pdf = is_pdf(file_path) == True
98
+ if is_a_pdf == False:
99
+ # If user has not submitted a pdf, assume it's an image
100
+ print("File is not a pdf, assuming that image analysis needs to be used.")
101
+ in_redact_method = "Quick image analysis - typed text"
102
+ else:
103
+ out_message = "No file selected"
104
+ print(out_message)
105
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
106
+
107
+ if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
108
+ #Analyse and redact image-based pdf or image
109
+ if is_pdf_or_image(file_path) == False:
110
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
111
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
112
+
113
+ print("Redacting file " + file_path_without_ext + " as an image-based file")
114
+ pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
115
+
116
+ # Save file
117
+ out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
118
+ pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
119
+
120
+ out_file_paths.append(out_image_file_path)
121
+ if logging_file_paths:
122
+ log_files_output_paths.extend(logging_file_paths)
123
+
124
+ out_message.append("File '" + file_path_without_ext + "' successfully redacted")
125
+
126
+ # Save decision making process
127
+ output_logs_str = str(output_logs)
128
+ logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
129
+ with open(logs_output_file_name, "w") as f:
130
+ f.write(output_logs_str)
131
+ log_files_output_paths.append(logs_output_file_name)
132
+
133
+ # Save Textract request metadata (if exists)
134
+ if new_request_metadata:
135
+ print("Request metadata:", new_request_metadata)
136
+ all_request_metadata.append(new_request_metadata)
137
+
138
+ # Increase latest file completed count unless we are at the last file
139
+ if latest_file_completed != len(file_paths):
140
+ print("Completed file number:", str(latest_file_completed))
141
+ latest_file_completed += 1
142
+
143
+ elif in_redact_method == "Simple text analysis - PDFs with selectable text":
144
+
145
+ if is_pdf(file_path) == False:
146
+ return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
147
+
148
+ # Analyse text-based pdf
149
+ print('Redacting file as text-based PDF')
150
+ import time
151
+ import re
152
+ import json
153
+ import io
154
+ import os
155
+ from PIL import Image, ImageChops, ImageDraw
156
+ from typing import List, Dict
157
+ import pandas as pd
158
 
159
+ #from presidio_image_redactor.entities import ImageRecognizerResult
160
+ from pdfminer.high_level import extract_pages
161
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
162
+ from pikepdf import Pdf, Dictionary, Name
163
+ import gradio as gr
164
+ from gradio import Progress
165
 
166
+ from collections import defaultdict # For efficient grouping
 
167
 
168
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
169
+ from tools.file_conversion import process_file
170
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
171
+ from tools.helper_functions import get_file_path_end, output_folder
172
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
173
+ from tools.data_anonymise import generate_decision_process_output
174
+ from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
175
 
176
+ def sum_numbers_before_seconds(string:str):
177
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
178
 
179
+ Args:
180
+ string: The input string.
181
 
182
+ Returns:
183
+ The sum of all numbers before 'seconds' in the string.
184
+ """
185
 
186
+ # Extract numbers before 'seconds' using regular expression
187
+ numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
188
 
189
+ # Extract the numbers from the matches
190
+ numbers = [float(num.split()[0]) for num in numbers]
191
+
192
+ # Sum up the extracted numbers
193
+ sum_of_numbers = round(sum(numbers),1)
194
+
195
+ return sum_of_numbers
196
+
197
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
198
+ '''
199
+ Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
200
+ '''
201
+
202
+ tic = time.perf_counter()
203
+ all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
204
+
205
+ # If this is the first time around, set variables to 0/blank
206
+ if first_loop_state==True:
207
+ latest_file_completed = 0
208
+ #out_message = []
209
+ out_file_paths = []
210
+
211
+ # If out message is string or out_file_paths are blank, change to a list so it can be appended to
212
+ if isinstance(out_message, str):
213
+ out_message = [out_message]
214
+
215
+ if not out_file_paths:
216
+ out_file_paths = []
217
+
218
+ latest_file_completed = int(latest_file_completed)
219
+
220
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
221
+ if latest_file_completed >= len(file_paths):
222
+ print("Last file reached")
223
+ # Set to a very high number so as not to mix up with subsequent file processing by the user
224
+ latest_file_completed = 99
225
+ final_out_message = '\n'.join(out_message)
226
+ #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
227
+
228
  estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
229
  print("Estimated total processing time:", str(estimate_total_processing_time))
230
 
231
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
232
 
233
  file_paths_loop = [file_paths[int(latest_file_completed)]]
234
 
 
238
  else:
239
  in_allow_list_flat = []
240
 
 
241
  for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
242
  file_path = file.name
243
 
 
247
  if is_a_pdf == False:
248
  # If user has not submitted a pdf, assume it's an image
249
  print("File is not a pdf, assuming that image analysis needs to be used.")
250
+ in_redact_method = "Quick image analysis - typed text"
251
  else:
252
  out_message = "No file selected"
253
  print(out_message)
254
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
255
 
256
+ if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
257
+ #Analyse and redact image-based pdf or image
258
+ if is_pdf_or_image(file_path) == False:
259
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
260
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
261
 
262
+ print("Redacting file " + file_path_without_ext + " as an image-based file")
263
+ pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
264
 
265
  # Save file
266
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
 
279
  f.write(output_logs_str)
280
  log_files_output_paths.append(logs_output_file_name)
281
 
282
+ # Save Textract request metadata (if exists)
283
+ if new_request_metadata:
284
+ print("Request metadata:", new_request_metadata)
285
+ all_request_metadata.append(new_request_metadata)
286
 
287
  # Increase latest file completed count unless we are at the last file
288
  if latest_file_completed != len(file_paths):
289
  print("Completed file number:", str(latest_file_completed))
290
  latest_file_completed += 1
291
 
292
+ elif in_redact_method == "Simple text analysis - PDFs with selectable text":
293
 
294
  if is_pdf(file_path) == False:
295
  return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
296
 
297
  # Analyse text-based pdf
298
  print('Redacting file as text-based PDF')
299
+ pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
300
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
301
  pdf_text.save(out_text_file_path)
302
 
303
  # Convert message
304
  convert_message="Converting PDF to image-based PDF to embed redactions."
 
305
  print(convert_message)
306
 
307
  # Convert document to image-based document to 'embed' redactions
 
314
  f.write(output_logs_str)
315
  log_files_output_paths.append(logs_output_file_name)
316
 
 
 
 
 
317
  out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
318
  out_message.append(out_message_new)
319
 
 
324
  else:
325
  out_message = "No redaction method selected"
326
  print(out_message)
327
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
 
328
 
329
  toc = time.perf_counter()
330
  out_time = f"in {toc - tic:0.1f} seconds."
 
333
  out_message_out = '\n'.join(out_message)
334
  out_message_out = out_message_out + " " + out_time
335
 
336
+ # If textract requests made, write to logging file
337
  if all_request_metadata:
338
  all_request_metadata_str = '\n'.join(all_request_metadata)
339
 
340
+ all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
 
341
 
342
  with open(all_request_metadata_file_path, "w") as f:
343
  f.write(all_request_metadata_str)
344
+
345
+ # Add the request metadata to the log outputs if not there already
346
+ if all_request_metadata_file_path not in log_files_output_paths:
347
+ log_files_output_paths.append(all_request_metadata_file_path)
348
+
349
 
350
  return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
351
 
352
+
353
+
354
+ def bounding_boxes_overlap(box1, box2):
355
+ """Check if two bounding boxes overlap."""
356
+ return (box1[0] < box2[2] and box2[0] < box1[2] and
357
+ box1[1] < box2[3] and box2[1] < box1[3])
358
+
359
+ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
360
  merged_bboxes = []
361
  grouped_bboxes = defaultdict(list)
362
 
363
+ # Process signature and handwriting results
364
  if signature_recogniser_results or handwriting_recogniser_results:
 
365
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
366
  print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
367
  bboxes.extend(handwriting_recogniser_results)
 
368
 
369
  if "Redact all identified signatures" in handwrite_signature_checkbox:
370
+ print("Signature boxes exist at merge:", signature_recogniser_results)
371
  bboxes.extend(signature_recogniser_results)
372
 
373
+ # Reconstruct bounding boxes for substrings of interest
374
+ reconstructed_bboxes = []
375
+ for bbox in bboxes:
376
+ bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
377
+ for line_text, line_info in combined_results.items():
378
+ line_box = line_info['bounding_box']
379
+ if bounding_boxes_overlap(bbox_box, line_box):
380
+ if bbox.text in line_text:
381
+ start_char = line_text.index(bbox.text)
382
+ end_char = start_char + len(bbox.text)
383
+
384
+ relevant_words = []
385
+ current_char = 0
386
+ for word in line_info['words']:
387
+ word_end = current_char + len(word['text'])
388
+ if current_char <= start_char < word_end or current_char < end_char <= word_end:
389
+ relevant_words.append(word)
390
+ if word_end >= end_char:
391
+ break
392
+ current_char = word_end # +1 for space
393
+ if not word['text'].endswith(' '):
394
+ current_char += 1 # +1 for space if the word doesn't already end with a space
395
+
396
+ if relevant_words:
397
+ print("Relevant words:", relevant_words)
398
+ left = min(word['bounding_box'][0] for word in relevant_words)
399
+ top = min(word['bounding_box'][1] for word in relevant_words)
400
+ right = max(word['bounding_box'][2] for word in relevant_words)
401
+ bottom = max(word['bounding_box'][3] for word in relevant_words)
402
+
403
+ # Combine the text of the relevant words
404
+ combined_text = " ".join(word['text'] for word in relevant_words)
405
+
406
+ reconstructed_bbox = CustomImageRecognizerResult(
407
+ bbox.entity_type,
408
+ bbox.start,
409
+ bbox.end,
410
+ bbox.score,
411
+ left,
412
+ top,
413
+ right - left, # width
414
+ bottom - top, # height
415
+ combined_text
416
+ )
417
+ reconstructed_bboxes.append(reconstructed_bbox)
418
+ break
419
+ else:
420
+ # If the bbox text is not found in any line in combined_results, keep the original bbox
421
+ reconstructed_bboxes.append(bbox)
422
+
423
+ # Group reconstructed bboxes by approximate vertical proximity
424
+ for box in reconstructed_bboxes:
425
  grouped_bboxes[round(box.top / vertical_threshold)].append(box)
426
 
427
+ # Merge within each group
428
  for _, group in grouped_bboxes.items():
429
  group.sort(key=lambda box: box.left)
430
 
431
  merged_box = group[0]
432
  for next_box in group[1:]:
433
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
 
434
  # Calculate new dimensions for the merged box
 
435
  if merged_box.text == next_box.text:
436
  new_text = merged_box.text
437
  else:
 
449
  merged_box = next_box
450
 
451
  merged_bboxes.append(merged_box)
452
+
453
  return merged_bboxes
454
 
455
+ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
456
  '''
457
  Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
458
  '''
 
462
  fill = (0, 0, 0) # Fill colour
463
  decision_process_output_str = ""
464
  images = []
465
+ #request_metadata = {}
466
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
467
 
468
  if not image_paths:
 
500
  all_ocr_results = []
501
  all_decision_process = []
502
 
503
+ if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
504
+ elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
505
 
506
  for n in range(0, number_of_pages):
507
  handwriting_or_signature_boxes = []
508
+ signature_recogniser_results = []
509
+ handwriting_recogniser_results = []
510
 
511
  try:
512
  image = image_paths[0][n]#.copy()
 
544
  else: ocr_lang = language
545
 
546
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
547
+ if analysis_type == "Quick image analysis - typed text":
548
 
549
  ocr_results = image_analyser.perform_ocr(image)
550
 
551
  # Combine OCR results
552
+ ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
553
+
554
+ # Save decision making process
555
+ ocr_results_with_children_str = str(ocr_results_with_children)
556
+ logs_output_file_name = output_folder + "ocr_with_children.txt"
557
+ with open(logs_output_file_name, "w") as f:
558
+ f.write(ocr_results_with_children_str)
559
 
560
  # Import results from json and convert
561
+ if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":
562
 
 
563
  # Convert the image to bytes using an in-memory buffer
564
  image_buffer = io.BytesIO()
565
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
 
568
  json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
569
 
570
  if not os.path.exists(json_file_path):
571
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
572
  logging_file_paths.append(json_file_path)
573
+ request_metadata = request_metadata + "\n" + new_request_metadata
574
  else:
575
  # Open the file and load the JSON data
576
  print("Found existing Textract json results file for this page.")
 
578
  text_blocks = json.load(json_file)
579
  text_blocks = text_blocks['Blocks']
580
 
581
+ ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
582
+
583
+ # Save decision making process
584
+ ocr_results_with_children_str = str(ocr_results_with_children)
585
+ logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
586
+ with open(logs_output_file_name, "w") as f:
587
+ f.write(ocr_results_with_children_str)
588
 
      # Step 2: Analyze text and identify PII
      bboxes = image_analyser.analyze_text(

          entities=chosen_redact_entities,
          allow_list=allow_list,
          score_threshold=score_threshold,
+     )
+
+     if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+     elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+
+     # Save the interim analyser bounding boxes so the decision-making process can be reviewed
+     bboxes_str = str(bboxes)
+     with open(interim_results_file_path, "w") as f:
+         f.write(bboxes_str)
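analyze_text here is the app's custom engine, but the underlying PII detection is Presidio (the same library the custom recognisers below are registered with). A hedged sketch of the equivalent direct call, with the text and entity list purely illustrative:

    # Hedged sketch: detecting PII spans in OCR'd text with Presidio directly
    from presidio_analyzer import AnalyzerEngine

    analyzer = AnalyzerEngine()
    results = analyzer.analyze(
        text="Dr Jane Bloggs lives in London.",  # stand-in for OCR output
        language="en",
        entities=["PERSON", "LOCATION"],
        score_threshold=0.001,  # the same very low threshold this module uses
    )
    for r in results:
        print(r.entity_type, r.start, r.end, r.score)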
 
      # Merge close bounding boxes
+     merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

      # Export the decision-making process
      if merged_bboxes:

      return [], []

  # Inside the loop where analyzer_results are processed, merge bounding boxes that sit right next to each other:
+ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
      '''
+     Merge identified bounding boxes containing PII that are very close to one another.
      '''
      analyzed_bounding_boxes = []
      if len(analyzer_results) > 0 and len(characters) > 0:
+         merged_bounding_boxes = []
+         current_box = None
+         current_y = None

+         for i, result in enumerate(analyzer_results):
+             print("Considering result", str(i))
+             for char in characters[result.start : result.end]:
                  if isinstance(char, LTChar):
                      char_box = list(char.bbox)
                      # Add vertical padding to the top of the box

                              # Reset current_box and current_y after appending
                              current_box = char_box
                              current_y = char_box[1]
+
              # After finishing with the current result, add the last box for this result
              if current_box:
                  merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
                  current_box = None
                  current_y = None # Reset for the next result

+         if not merged_bounding_boxes:
+             analyzed_bounding_boxes.extend(
+                 {"boundingBox": char.bbox, "result": result}
+                 for result in analyzer_results
+                 for char in characters[result.start:result.end]
+                 if isinstance(char, LTChar)
+             )
+         else:
+             analyzed_bounding_boxes.extend(merged_bounding_boxes)

+     print("analyzed_bounding_boxes:\n\n", analyzed_bounding_boxes)

      return analyzed_bounding_boxes
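To make the merge criterion concrete: two boxes join the same run when they sit on roughly the same baseline and the horizontal gap between them is small. A self-contained illustration of that test, with thresholds mirroring the ones used in this function:

    def should_merge(current_box, next_box, combine_pixel_dist, max_vertical_diff=5):
        # Boxes are (x0, y0, x1, y1); y0 approximates the text baseline
        vertical_diff = abs(next_box[1] - current_box[1])
        horizontal_diff = abs(next_box[0] - current_box[2])  # gap from the previous box's right edge
        return vertical_diff <= max_vertical_diff and horizontal_diff <= combine_pixel_dist

    # Two boxes 10px apart on the same line merge when combine_pixel_dist is 200:
    print(should_merge((0, 100, 50, 112), (60, 100, 110, 112), combine_pixel_dist=200))  # True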
+ # def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+ #     '''
+ #     Merge identified bounding boxes containing PII or signatures that are very close to one another.
+ #     '''
+ #     analyzed_bounding_boxes = []
+ #     merged_bounding_boxes = []
+ #     current_box = None
+ #     current_y = None
+
+ #     # Handle PII and text bounding boxes first
+ #     if len(analyzer_results) > 0 and len(characters) > 0:
+ #         for i, result in enumerate(analyzer_results):
+ #             #print("Considering result", str(i))
+ #             #print("Result:", result)
+ #             #print("Characters:", characters)
+
+ #             for char in characters[result.start: result.end]:
+ #                 if isinstance(char, LTChar):
+ #                     char_box = list(char.bbox)
+ #                     # Add vertical padding to the top of the box
+ #                     char_box[3] += vertical_padding
+
+ #                     if current_y is None or current_box is None:
+ #                         current_box = char_box
+ #                         current_y = char_box[1]
+ #                     else:
+ #                         vertical_diff_bboxes = abs(char_box[1] - current_y)
+ #                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+
+ #                         if (
+ #                             vertical_diff_bboxes <= 5
+ #                             and horizontal_diff_bboxes <= combine_pixel_dist
+ #                         ):
+ #                             current_box[2] = char_box[2]  # Extend the current box horizontally
+ #                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
+ #                         else:
+ #                             merged_bounding_boxes.append(
+ #                                 {"boundingBox": current_box, "result": result})
+
+ #                             # Reset current_box and current_y after appending
+ #                             current_box = char_box
+ #                             current_y = char_box[1]
+
+ #             # After finishing with the current result, add the last box for this result
+ #             if current_box:
+ #                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
+ #                 current_box = None
+ #                 current_y = None  # Reset for the next result
+
+ #     # Handle signature bounding boxes (without specific characters)
+ #     if signature_bounding_boxes is not None:
+ #         for sig_box in signature_bounding_boxes:
+ #             sig_box = list(sig_box)  # Ensure it's a list to modify the values
+ #             if current_y is None or current_box is None:
+ #                 current_box = sig_box
+ #                 current_y = sig_box[1]
+ #             else:
+ #                 vertical_diff_bboxes = abs(sig_box[1] - current_y)
+ #                 horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
+
+ #                 if (
+ #                     vertical_diff_bboxes <= 5
+ #                     and horizontal_diff_bboxes <= combine_pixel_dist
+ #                 ):
+ #                     current_box[2] = sig_box[2]  # Extend the current box horizontally
+ #                     current_box[3] = max(current_box[3], sig_box[3])  # Ensure the top is the highest
+ #                 else:
+ #                     merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+
+ #                     # Reset current_box and current_y after appending
+ #                     current_box = sig_box
+ #                     current_y = sig_box[1]
+
+ #         # Add the last bounding box for the signature
+ #         if current_box:
+ #             merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+ #             current_box = None
+ #             current_y = None
+
+ #     # If no bounding boxes were merged, add individual character bounding boxes
+ #     if not merged_bounding_boxes:
+ #         analyzed_bounding_boxes.extend(
+ #             {"boundingBox": char.bbox, "result": result}
+ #             for result in analyzer_results
+ #             for char in characters[result.start:result.end]
+ #             if isinstance(char, LTChar)
+ #         )
+ #     else:
+ #         analyzed_bounding_boxes.extend(merged_bounding_boxes)
+
+ #     #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+
+ #     return analyzed_bounding_boxes
+
  def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
      decision_process_table = pd.DataFrame()

      annotations_on_page.append(annotation)
      return annotations_on_page
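The annotation objects appended above are built elsewhere in this file; as a hedged sketch only (not the repo's actual helper), a black rectangle annotation over a merged bounding box can be constructed with pikepdf roughly like this:

    from pikepdf import Dictionary, Name, Array

    def make_redaction_annotation(bbox):
        # bbox = [x0, y0, x1, y1] in PDF points; a hypothetical helper
        return Dictionary(
            Type=Name("/Annot"),
            Subtype=Name("/Square"),  # drawn as a filled rectangle over the PII
            Rect=Array(bbox),
            IC=Array([0, 0, 0]),      # black interior colour
            CA=1,                     # fully opaque
        )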
 
+ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
      '''
      Redact chosen entities from a pdf that is made up of multiple pages that are not images.
      '''
      annotations_all_pages = []
      decision_process_table_all_pages = []

+     combine_pixel_dist = 200 # PII bounding boxes whose horizontal gap is at or under this many pixels are combined into one

      pdf = Pdf.open(filename)
      page_num = 0
 
      text_container_analyzed_bounding_boxes = []
      characters = []

+     if analysis_type == "Simple text analysis - PDFs with selectable text":
          for i, text_container in enumerate(page_layout):

              text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
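analyze_text_container operates on pdfminer.six layout objects, whose LTChar children supply the per-character boxes used later for merging. A hedged sketch of collecting those characters for one page (the helper name and structure are illustrative, not the repo's code):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer, LTChar

    def page_characters(filename: str, page_index: int = 0):
        for i, page_layout in enumerate(extract_pages(filename)):
            if i != page_index:
                continue
            chars = []
            for container in page_layout:
                if isinstance(container, LTTextContainer):
                    for line in container:       # text lines within the container
                        for obj in line:         # characters (and LTAnno spacers) within the line
                            if isinstance(obj, LTChar):
                                chars.append(obj)  # each LTChar carries .bbox and .get_text()
            return chars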
 
          page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
          page_analyzer_results.extend(text_container_analyzer_results)

      decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -18,7 +18,7 @@ score_threshold = 0.001
  # Custom title recogniser
  import re
  titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
- titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
+ titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
  titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
  titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
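This fix removes the stray spaces the old pattern baked into every alternative (e.g. \bMr \b, which only matched a title followed by a space and then a word character) and renames the street_type loop variable, apparently left over from an address recogniser. A quick check of the corrected pattern:

    import re

    titles_list = ["Mr", "Mr.", "Mrs", "Dr"]
    titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
    # titles_regex is now: \bMr\b|\bMr\.\b|\bMrs\b|\bDr\b
    print(re.findall(titles_regex, "Dr Jones met Mrs Smith."))  # ['Dr', 'Mrs']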