seanpedrickcase committed on
Commit e2aae24
1 Parent(s): e69ae00

Only shows AWS options when AWS functions are enabled. Previous review files can now be uploaded to continue a review later. Some review debugging.

app.py CHANGED
@@ -9,11 +9,11 @@ import pandas as pd
from datetime import datetime
from gradio_image_annotation import image_annotator

- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars, load_in_default_allow_list
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
- from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -45,12 +45,6 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'

- text_ocr_option = "Simple text analysis - PDFs with selectable text"
- tesseract_ocr_option = "Quick image analysis - typed text"
- textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
-
- local_pii_detector = "Local"
- aws_pii_detector = "AWS Comprehend"

if RUN_AWS_FUNCTIONS == "1":
    default_ocr_val = textract_option
@@ -104,7 +98,8 @@ with app:
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)

- doc_file_name_textbox = gr.Textbox(label = "doc_file_name_textbox", value="", visible=False)
+ doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+ doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_file_name_no_extension_textbox", value="", visible=False)
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)

@@ -127,6 +122,9 @@ with app:
zoom_true_bool = gr.State(True)
zoom_false_bool = gr.State(False)

+ clear_all_page_redactions = gr.State(True)
+ prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
+

###
# UI DESIGN
@@ -145,8 +143,12 @@ with app:
with gr.Tab("PDFs/images"):
with gr.Accordion("Redact document", open = True):
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
- in_redaction_method = gr.Radio(label="Choose text extract method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+ if RUN_AWS_FUNCTIONS == "1":
+     in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
+     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+ else:
+     in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
+     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)

gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
@@ -178,6 +180,8 @@ with app:
with gr.Row():
annotate_zoom_in = gr.Button("Zoom in")
annotate_zoom_out = gr.Button("Zoom out")
+ with gr.Row():
+     clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")

annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")

@@ -199,7 +203,8 @@ with app:
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)

- output_review_files = gr.File(label="Review output files")
+ output_review_files = gr.File(label="Review output files", file_count='multiple')
+ upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")

# TEXT / TABULAR DATA TAB
with gr.Tab(label="Open text or Excel/csv files"):
@@ -231,6 +236,8 @@ with app:
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)

+
+
# SETTINGS TAB
with gr.Tab(label="Redaction settings"):
gr.Markdown(
@@ -272,10 +279,10 @@ with app:
###
# PDF/IMAGE REDACTION
###
- in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])

document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
- then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
+ then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
@@ -283,10 +290,10 @@ with app:
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

# If a file has been completed, the function will continue onto the next document
- latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+ latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
# latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
# then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -299,12 +306,12 @@ with app:
# Page controls at top
annotate_current_page.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

# Zoom in and out on annotator
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -313,20 +320,28 @@ with app:
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])

- annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+ clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
- annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)

# Page controls at bottom
annotate_current_page_bottom.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+ # Upload previous files for modifying redactions
+ upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

###
# TABULAR DATA REDACTION
@@ -364,8 +379,8 @@ with app:

# User submitted feedback for pdf redactions
pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], feedback_logs_folder)
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], None, preprocess=False).\
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])

# User submitted feedback for data redactions
@@ -376,8 +391,8 @@ with app:

# Log processing time/token usage when making a query
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
- usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# Launch the Gradio app
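Two small Gradio patterns in this diff are worth pulling out: the AWS-specific choices are built inside an if RUN_AWS_FUNCTIONS == "1": branch, and prepare_for_review_bool is an invisible gr.Checkbox used as a constant input to event handlers. A minimal sketch of both patterns, with illustrative component names rather than the app's:

    import os
    import gradio as gr

    # Assumption: the deployment signals AWS availability through an env var.
    RUN_AWS_FUNCTIONS = os.environ.get("RUN_AWS_FUNCTIONS", "0")

    with gr.Blocks() as demo:
        if RUN_AWS_FUNCTIONS == "1":
            # Paid AWS options are only offered when the deployment enables them.
            method = gr.Radio(choices=["Local", "AWS Textract"], value="AWS Textract")
        else:
            # Keep the component so the event wiring stays identical, but hide it.
            method = gr.Radio(choices=["Local"], value="Local", visible=False)

        # An invisible checkbox acts as a constant boolean input for handlers.
        prepare_for_review = gr.Checkbox(value=True, visible=False)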
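The new resume-review flow is three chained events: read the uploaded file names, re-run prepare_image_or_pdf with the prepare_for_review flag set, then redraw the annotator. A stripped-down sketch of the same .then() chaining; the handler bodies here are placeholders, not the app's functions:

    import gradio as gr

    def load_names(files):
        # Depending on the Gradio version, each item is a file path string or a tempfile wrapper.
        return ", ".join(getattr(f, "name", str(f)) for f in files)

    def prepare(files, for_review):
        # In the app this step reloads the PDF pages and the saved ...redactions.json.
        return f"Prepared {len(files)} file(s) (review mode: {for_review})"

    with gr.Blocks() as demo:
        review_files = gr.File(label="Review output files", file_count="multiple")
        for_review = gr.Checkbox(value=True, visible=False)
        names = gr.Textbox()
        status = gr.Textbox()
        resume_btn = gr.Button("Resume review")

        # Each .then() step only runs once the previous handler has returned.
        resume_btn.click(load_names, inputs=[review_files], outputs=[names]).\
            then(prepare, inputs=[review_files, for_review], outputs=[status])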
tools/aws_functions.py CHANGED
@@ -10,17 +10,13 @@ PandasDataFrame = Type[pd.DataFrame]
# Get AWS credentials
bucket_name=""

- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
print(f'The value of AWS_REGION is {AWS_REGION}')

- try:
-     comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
- except Exception as e:
-     print(e)
-     comprehend_client = ""
+

def get_assumed_role_info():
    sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
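get_or_create_env_var is imported from tools.helper_functions rather than defined in this file; a plausible sketch of what such a helper might do (an assumption about its behaviour, not the actual implementation):

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Return the existing value, or set and return the default so later
        # reads within the same process see a consistent value.
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value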
tools/aws_textract.py CHANGED
@@ -23,15 +23,16 @@ def extract_textract_metadata(response):
    #'NumberOfPages': number_of_pages
    })

- def analyse_page_with_textract(pdf_page_bytes, page_no):
+ def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
    '''
    Analyse page with AWS Textract
    '''
-     try:
-         client = boto3.client('textract')
-     except:
-         print("Cannot connect to AWS Textract")
-         return [], "" # Return an empty list and an empty string
+     if client == "":
+         try:
+             client = boto3.client('textract')
+         except:
+             print("Cannot connect to AWS Textract")
+             return [], "" # Return an empty list and an empty string

    print("Analysing page with AWS Textract")
tools/custom_image_analyser_engine.py CHANGED
@@ -11,7 +11,6 @@ from PIL import ImageDraw, ImageFont, Image
from typing import Optional, Tuple, Union
from copy import deepcopy
from tools.helper_functions import clean_unicode_text
- from tools.aws_functions import comprehend_client
from tools.presidio_analyzer_custom import recognizer_result_from_dict
from tools.load_spacy_model_custom_recognisers import custom_entities
#import string # Import string to get a list of common punctuation characters
@@ -464,7 +463,8 @@ class CustomImageAnalyzerEngine:
        line_level_ocr_results: List[OCRResult],
        ocr_results_with_children: Dict[str, Dict],
        chosen_redact_comprehend_entities:List[str],
-         pii_identification_method:str="Local",
+         pii_identification_method:str="Local",
+         comprehend_client="",
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:
        # Define English as default language, if not specified
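Dropping the module-level comprehend_client import means the analyser no longer tries to reach AWS at import time; a client is handed in only when AWS Comprehend is actually selected. A hedged sketch of how an injected client is typically used for PII detection (the engine's internal call is not shown in this diff):

    import boto3

    comprehend_client = boto3.client('comprehend', region_name='eu-west-2')

    # detect_pii_entities returns character offsets that can be mapped back
    # onto OCR bounding boxes by the image analyser.
    response = comprehend_client.detect_pii_entities(Text="Call me on 07700 900000", LanguageCode='en')
    for entity in response['Entities']:
        print(entity['Type'], entity['Score'], entity['BeginOffset'], entity['EndOffset'])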
tools/file_conversion.py CHANGED
@@ -1,9 +1,10 @@
from pdf2image import convert_from_path, pdfinfo_from_path
- from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
+ from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
+ import re
import gradio as gr
import time
import json
@@ -96,8 +97,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi):

    return images

-
- # %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
+ # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
def process_file(file_path):
    # Get the file extension
    file_extension = os.path.splitext(file_path)[1].lower()
@@ -127,11 +127,15 @@ def get_input_file_names(file_input):
    '''

    all_relevant_files = []
+     file_name_with_extension = ""
+     full_file_name = ""

    #print("file_input:", file_input)

    if isinstance(file_input, str):
        file_input_list = [file_input]
+     else:
+         file_input_list = file_input

    for file in file_input_list:
        if isinstance(file, str):
@@ -141,21 +145,19 @@ def get_input_file_names(file_input):

        file_path_without_ext = get_file_path_end(file_path)

-         #print("file:", file_path)
-
        file_extension = os.path.splitext(file_path)[1].lower()

-         file_name_with_extension = file_path_without_ext + file_extension
-
        # Check if the file is an image type
        if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
            all_relevant_files.append(file_path_without_ext)
+             file_name_with_extension = file_path_without_ext + file_extension
+             full_file_name = file_path

    all_relevant_files_str = ", ".join(all_relevant_files)

-     #print("all_relevant_files_str:", all_relevant_files_str)
+     print("all_relevant_files_str:", all_relevant_files_str)

-     return all_relevant_files_str, file_name_with_extension
+     return all_relevant_files_str, file_name_with_extension, full_file_name

def prepare_image_or_pdf(
    file_paths: List[str],
@@ -166,6 +168,8 @@ def prepare_image_or_pdf(
    first_loop_state: bool = False,
    number_of_pages:int = 1,
    current_loop_page_number:int=0,
+     all_annotations_object:List = [],
+     prepare_for_review:bool = False,
    progress: Progress = Progress(track_tqdm=True)
) -> tuple[List[str], List[str]]:
    """
@@ -182,7 +186,10 @@ def prepare_image_or_pdf(
        out_message (List[str]): List to store output messages.
        first_loop_state (bool): Flag indicating if this is the first iteration.
        number_of_pages (int): integer indicating the number of pages in the document
+         all_annotations_object (List of annotation objects): All annotations for the current document
+         prepare_for_review (bool): Is this preparation step preparing pdfs and json files to review current redactions?
        progress (Progress): Progress tracker for the operation.
+

    Returns:
        tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
@@ -194,7 +201,8 @@ def prepare_image_or_pdf(
    if first_loop_state==True:
        print("first_loop_state is True")
        latest_file_completed = 0
-         out_message = []
+         out_message = []
+         all_annotations_object = []
    else:
        print("Now attempting file:", str(latest_file_completed))

@@ -222,7 +230,7 @@ def prepare_image_or_pdf(
    else:
        file_path_number = len(file_paths)

-     print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
+     #print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
    print("Number of file paths:", file_path_number)
    print("Latest_file_completed:", latest_file_completed)

@@ -235,7 +243,7 @@ def prepare_image_or_pdf(
        final_out_message = '\n'.join(out_message)
    else:
        final_out_message = out_message
-     return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+     return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

    #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

@@ -245,13 +253,16 @@ def prepare_image_or_pdf(
        file_paths_list = [file_paths]
        file_paths_loop = file_paths_list
    else:
-         file_paths_list = file_paths
-         file_paths_loop = [file_paths_list[int(latest_file_completed)]]
-
-
-     #print("file_paths_loop:", str(file_paths_loop))
+         if prepare_for_review == False:
+             file_paths_list = file_paths
+             file_paths_loop = [file_paths_list[int(latest_file_completed)]]
+         else:
+             file_paths_list = file_paths
+             file_paths_loop = file_paths
+             # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
+             file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

-     #for file in progress.tqdm(file_paths, desc="Preparing files"):
+     # Loop through files to load in
    for file in file_paths_loop:
        if isinstance(file, str):
            file_path = file
@@ -259,50 +270,87 @@ def prepare_image_or_pdf(
            file_path = file.name
        file_path_without_ext = get_file_path_end(file_path)

-         #print("file:", file_path)
+         if not file_path:
+             out_message = "Please select a file."
+             print(out_message)
+             return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

        file_extension = os.path.splitext(file_path)[1].lower()

        # Check if the file is an image type
        if file_extension in ['.jpg', '.jpeg', '.png']:
-             in_redact_method = "Quick image analysis - typed text"
+             in_redact_method = tesseract_ocr_option

-         # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-         if file_extension in ['.json']:
-             json_contents = json.load(file_path)
-             # Write the response to a JSON file
-             out_folder = output_folder + file_path
-             with open(file_path, 'w') as json_file:
-                 json.dump(json_contents, out_folder, indent=4) # indent=4 makes the JSON file pretty-printed
-             continue
-
-         #if file_path:
-         #    file_path_without_ext = get_file_path_end(file_path)
-         if not file_path:
-             out_message = "No file selected"
-             print(out_message)
-             return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
-
-         if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
-             # Analyse and redact image-based pdf or image
+         # If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
+         if file_path.endswith(".json"):
+
+             if prepare_for_review == True:
+                 if isinstance(file_path, str):
+                     with open(file_path, 'r') as json_file:
+                         all_annotations_object = json.load(json_file)
+                 else:
+                     # Assuming file_path is a NamedString or similar
+                     all_annotations_object = json.loads(file_path) # Use loads for string content
+
+                 # Get list of page numbers
+                 image_file_paths_pages = [
+                     int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
+                     for s in image_file_paths
+                     if re.search(r'_(\d+)\.png$', os.path.basename(s))
+                 ]
+                 image_file_paths_pages = [int(i) for i in image_file_paths_pages]
+
+                 # If PDF pages have been converted to image files, replace the current image paths in the json with these
+                 if image_file_paths:
+                     for i, annotation in enumerate(all_annotations_object):
+                         annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+
+                         # Check if the annotation page number exists in the image file paths pages
+                         if annotation_page_number in image_file_paths_pages:
+                             # Set the correct image page directly since we know it's in the list
+                             correct_image_page = annotation_page_number
+                             annotation["image"] = image_file_paths[correct_image_page]
+                         else:
+                             print("Page not found.")
+
+                 #print("all_annotations_object:", all_annotations_object)
+
+                 # Write the response to a JSON file in the output folder
+                 out_folder = output_folder + file_path_without_ext + file_extension
+                 with open(out_folder, 'w') as json_file:
+                     json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                 continue
+
+             else:
+                 # If the file name ends with textract.json, assume this is a textract response object. Save it to the output folder so it can be found later during redaction, and go to the next file.
+                 json_contents = json.load(file_path)
+                 # Write the response to a JSON file in the output folder
+                 out_folder = output_folder + file_path_without_ext + file_extension
+                 with open(out_folder, 'w') as json_file:
+                     json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                 continue
+
+         # Convert pdf/image file to correct format for redaction
+         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
            if is_pdf_or_image(file_path) == False:
                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                print(out_message)
-                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

            converted_file_path = process_file(file_path)
            image_file_path = converted_file_path
-             #print("Out file path at image conversion step:", converted_file_path)

-         elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+         elif in_redact_method == text_ocr_option:
            if is_pdf(file_path) == False:
                out_message = "Please upload a PDF file for text analysis."
                print(out_message)
-                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

            converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
            image_file_path = process_file(file_path)
-

        converted_file_paths.append(converted_file_path)
        image_file_paths.extend(image_file_path)
@@ -310,7 +358,7 @@ def prepare_image_or_pdf(
        # If a pdf, load as a pymupdf document
        if is_pdf(file_path):
            pymupdf_doc = pymupdf.open(file_path)
-             #print("pymupdf_doc:", pymupdf_doc)
+
        elif is_pdf_or_image(file_path): # Alternatively, if it's an image
            # Convert image to a pymupdf document
            pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -318,9 +366,7 @@ def prepare_image_or_pdf(
            rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
            page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
            page.insert_image(rect, filename=file_path) # Insert the image into the page
-             # Ensure to save the document after processing
-             #pymupdf_doc.save(output_path) # Uncomment and specify output_path if needed
-             #pymupdf_doc.close() # Close the PDF document
+

        toc = time.perf_counter()
        out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -332,9 +378,8 @@ def prepare_image_or_pdf(

    number_of_pages = len(image_file_paths)

-     print("At end of prepare_image_or_pdf function - current_loop_page_number:", current_loop_page_number)
-
-     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+
+     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
    file_path_without_ext = get_file_path_end(in_file_path)
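One detail worth noting in the annotation-reload code above: it indexes image_file_paths directly with the page number parsed from the annotation's file name, which assumes list position and page number coincide. An equivalent sketch using an explicit page-to-path lookup (the function name is illustrative, not part of the commit):

    import os
    import re

    def remap_annotation_images(all_annotations_object, image_file_paths):
        # Index each freshly converted page image (e.g. "doc_3.png") by its page number.
        page_to_path = {}
        for path in image_file_paths:
            match = re.search(r'_(\d+)\.png$', os.path.basename(path))
            if match:
                page_to_path[int(match.group(1))] = path

        # Point each annotation at the new image for the same page.
        for annotation in all_annotations_object:
            match = re.search(r'_(\d+)\.png$', os.path.basename(annotation["image"]))
            if match and int(match.group(1)) in page_to_path:
                annotation["image"] = page_to_path[int(match.group(1))]
            else:
                print("Page not found.")

        return all_annotations_object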
tools/file_redaction.py CHANGED
@@ -8,7 +8,6 @@ import boto3
8
  from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
10
  ImageFile.LOAD_TRUNCATED_IMAGES = True
11
-
12
  from typing import List, Dict, Tuple
13
  import pandas as pd
14
 
@@ -19,32 +18,27 @@ from pikepdf import Pdf, Dictionary, Name
19
  import pymupdf
20
  from pymupdf import Rect
21
  from fitz import Document, Page
22
-
23
  import gradio as gr
24
  from gradio import Progress
25
  from collections import defaultdict # For efficient grouping
26
 
27
  from presidio_analyzer import RecognizerResult
28
-
29
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30
  from tools.file_conversion import process_file, image_dpi
31
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
32
- from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34
- # from tools.data_anonymise import generate_decision_process_output
35
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
36
- from tools.aws_functions import comprehend_client
37
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
38
 
39
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
40
-
41
  page_break_value = get_or_create_env_var('page_break_value', '500')
42
  print(f'The value of page_break_value is {page_break_value}')
43
 
44
  max_time_value = get_or_create_env_var('max_time_value', '105')
45
  print(f'The value of max_time_value is {max_time_value}')
46
 
47
-
48
  def sum_numbers_before_seconds(string:str):
49
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
50
 
@@ -192,8 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
192
  else:
193
  in_allow_list_flat = []
194
 
195
- progress(0.5, desc="Redacting file")
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  if isinstance(file_paths, str):
199
  file_paths_list = [file_paths]
@@ -217,28 +236,21 @@ def choose_and_run_redactor(file_paths:List[str],
217
  if is_a_pdf == False:
218
  # If user has not submitted a pdf, assume it's an image
219
  print("File is not a pdf, assuming that image analysis needs to be used.")
220
- in_redact_method = "Quick image analysis - typed text"
221
  else:
222
  out_message = "No file selected"
223
  print(out_message)
224
 
225
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
226
 
227
- if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
228
 
229
- if in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
230
- # Try accessing Textract through boto3
231
- try:
232
- boto3.client('textract')
233
- except:
234
- out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
235
- print(out_message)
236
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
237
 
238
  #Analyse and redact image-based pdf or image
239
  if is_pdf_or_image(file_path) == False:
240
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
241
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
242
 
243
  print("Redacting file " + file_path_without_ext + " as an image-based file")
244
 
@@ -262,14 +274,16 @@ def choose_and_run_redactor(file_paths:List[str],
262
  all_decision_process_table,
263
  pymupdf_doc,
264
  pii_identification_method,
265
- comprehend_query_number)
266
 
267
  # Save Textract request metadata (if exists)
268
  if new_request_metadata:
269
  print("Request metadata:", new_request_metadata)
270
  all_request_metadata.append(new_request_metadata)
271
 
272
- elif in_redact_method == "Simple text analysis - PDFs with selectable text":
273
 
274
  logging_file_paths = ""
275
 
@@ -287,7 +301,7 @@ def choose_and_run_redactor(file_paths:List[str],
287
  in_allow_list_flat,
288
  page_min,
289
  page_max,
290
- "Simple text analysis - PDFs with selectable text",
291
  current_loop_page,
292
  page_break_return,
293
  annotations_all_pages,
@@ -295,7 +309,8 @@ def choose_and_run_redactor(file_paths:List[str],
295
  all_decision_process_table,
296
  pymupdf_doc,
297
  pii_identification_method,
298
- comprehend_query_number)
 
299
 
300
  else:
301
  out_message = "No redaction method selected"
@@ -328,14 +343,21 @@ def choose_and_run_redactor(file_paths:List[str],
328
 
329
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
330
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
331
- #log_files_output_paths.append(logs_output_file_name)
332
  out_file_paths.append(logs_output_file_name)
333
 
334
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
335
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
336
- #log_files_output_paths.append(all_text_output_file_name)
337
  out_file_paths.append(all_text_output_file_name)
338
 
339
  # Make a combined message for the file
340
  if isinstance(out_message, list):
341
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
@@ -351,38 +373,6 @@ def choose_and_run_redactor(file_paths:List[str],
351
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
352
  print("Estimated total processing time:", str(estimate_total_processing_time))
353
 
354
- #out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
355
- #combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
356
-
357
- # Increase latest file completed count unless we are at the last file
358
- # if latest_file_completed != len(file_paths):
359
- # print("Completed file number:", str(latest_file_completed), "more files to do")
360
-
361
- # if current_loop_page >= number_of_pages:
362
-
363
- # print("Current page loop", current_loop_page, "is greater than or equal to number of pages:", number_of_pages)
364
- # latest_file_completed += 1
365
-
366
- # # Set to 999 to be a big number not to interrupt processing of large files by user
367
- # current_loop_page = 999
368
-
369
- # out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
370
- # pymupdf_doc.save(out_text_file_path)
371
- # out_file_paths.append(out_text_file_path)
372
-
373
- # # Write logs to file
374
- # decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
375
- # all_decision_process_table.to_csv(decision_logs_output_file_name)
376
- # log_files_output_paths.append(decision_logs_output_file_name)
377
-
378
- # all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
379
- # all_line_level_ocr_results_df.to_csv(all_text_output_file_name)
380
- # log_files_output_paths.append(all_text_output_file_name)
381
-
382
- # out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
383
-
384
- # if isinstance(out_message, list):
385
- # out_message.append(out_message_new) # Ensure out_message is a list of strings
386
  else:
387
  toc = time.perf_counter()
388
  time_taken = toc - tic
@@ -501,27 +491,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
501
 
502
  return x1, new_y1, x2, new_y2
503
 
504
- # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
505
- # '''
506
- # Converts coordinates from pymupdf format to image coordinates.
507
- # '''
508
-
509
- # rect_height = pymupdf_page.rect.height
510
- # rect_width = pymupdf_page.rect.width
511
-
512
- # image_page_width, image_page_height = image.size
513
-
514
- # # Calculate scaling factors between pymupdf and PIL image
515
- # scale_width = image_page_width / rect_width
516
- # scale_height = image_page_height / rect_height
517
-
518
- # x1_image = x1 * scale_width
519
- # y1_image = ((rect_height - y2) * scale_height)
520
- # x2_image = x2 * scale_width
521
- # y2_image = ((rect_height - y1) * scale_height)
522
-
523
- # return x1_image, y1_image, x2_image, y2_image
524
-
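The commented-out conversion deleted above already spells out the core arithmetic; restated as a cleaned-up sketch (the replacement that follows may refine edge cases not visible in this hunk):

def convert_pymupdf_to_image_coords_sketch(pymupdf_page, x1, y1, x2, y2, image):
    # Scale factors between the PDF page rectangle and the rendered image
    scale_width = image.size[0] / pymupdf_page.rect.width
    scale_height = image.size[1] / pymupdf_page.rect.height
    rect_height = pymupdf_page.rect.height

    # x scales directly; y is flipped because the two coordinate systems
    # place their origins on opposite edges of the page
    return (x1 * scale_width,
            (rect_height - y2) * scale_height,
            x2 * scale_width,
            (rect_height - y1) * scale_height)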
525
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
526
  '''
527
  Converts coordinates from pymupdf format to image coordinates,
@@ -625,10 +594,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
625
  # Should already be in correct format if img_annotator_box is an input
626
  if isinstance(annot, dict):
627
  img_annotation_box = annot
628
- #try:
629
- # img_annotation_box["label"] = annot["label"]
630
- #except:
631
- # img_annotation_box["label"] = "Redaction"
632
 
633
  x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
634
 
@@ -823,7 +788,7 @@ def redact_image_pdf(file_path:str,
823
  is_a_pdf:bool=True,
824
  page_min:int=0,
825
  page_max:int=999,
826
- analysis_type:str="Quick image analysis - typed text",
827
  handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
828
  request_metadata:str="", current_loop_page:int=0,
829
  page_break_return:bool=False,
@@ -834,6 +799,8 @@ def redact_image_pdf(file_path:str,
834
  pymupdf_doc = [],
835
  pii_identification_method:str="Local",
836
  comprehend_query_number:int=0,
837
  page_break_val:int=int(page_break_value),
838
  logging_file_paths:List=[],
839
  max_time:int=int(max_time_value),
@@ -851,7 +818,7 @@ def redact_image_pdf(file_path:str,
851
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
852
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
853
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
854
- - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
855
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
856
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
857
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
@@ -862,6 +829,8 @@ def redact_image_pdf(file_path:str,
862
  - pymupdf_doc (List, optional): The document as a PyMuPDF object.
863
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
864
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
865
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
866
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
867
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -874,7 +843,15 @@ def redact_image_pdf(file_path:str,
874
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
875
  comprehend_query_number_new = 0
876
 
877
- #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
878
 
879
  tic = time.perf_counter()
880
 
@@ -897,8 +874,8 @@ def redact_image_pdf(file_path:str,
897
  print("Page range:", str(page_min + 1), "to", str(page_max))
898
  #print("Current_loop_page:", current_loop_page)
899
 
900
- if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
901
- elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
902
 
903
  if current_loop_page == 0: page_loop_start = 0
904
  else: page_loop_start = current_loop_page
@@ -942,7 +919,7 @@ def redact_image_pdf(file_path:str,
942
  else: ocr_lang = language
943
 
944
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
945
- if analysis_type == "Quick image analysis - typed text":
946
 
947
  word_level_ocr_results = image_analyser.perform_ocr(image)
948
 
@@ -951,7 +928,7 @@ def redact_image_pdf(file_path:str,
951
 
952
 
953
  # Import results from json and convert
954
- if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
955
 
956
  # Convert the image to bytes using an in-memory buffer
957
  image_buffer = io.BytesIO()
@@ -962,7 +939,7 @@ def redact_image_pdf(file_path:str,
962
  json_file_path = output_folder + file_name + "_textract.json"
963
 
964
  if not os.path.exists(json_file_path):
965
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
966
  logging_file_paths.append(json_file_path)
967
  request_metadata = request_metadata + "\n" + new_request_metadata
968
 
@@ -1010,7 +987,8 @@ def redact_image_pdf(file_path:str,
1010
  line_level_ocr_results,
1011
  line_level_ocr_results_with_children,
1012
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1013
- pii_identification_method = pii_identification_method,
 
1014
  language=language,
1015
  entities=chosen_redact_entities,
1016
  allow_list=allow_list,
@@ -1018,21 +996,13 @@ def redact_image_pdf(file_path:str,
1018
  )
1019
 
1020
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1021
-
1022
- # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
1023
- # line_level_ocr_results_with_children,
1024
- # language,
1025
- # chosen_redact_entities,
1026
- # allow_list,
1027
- # score_threshold,
1028
- # pii_identification_method)
1029
 
1030
  else:
1031
  redaction_bboxes = []
1032
 
1033
 
1034
- if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1035
- elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1036
 
1037
  # Save decision making process
1038
  bboxes_str = str(redaction_bboxes)
@@ -1409,7 +1379,7 @@ def redact_text_pdf(
1409
  allow_list: List[str] = None, # Optional list of allowed entities
1410
  page_min: int = 0, # Minimum page number to start redaction
1411
  page_max: int = 999, # Maximum page number to end redaction
1412
- analysis_type: str = "Simple text analysis - PDFs with selectable text", # Type of analysis to perform
1413
  current_loop_page: int = 0, # Current page being processed in the loop
1414
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1415
  annotations_all_pages: List = [], # List of annotations across all pages
@@ -1418,6 +1388,7 @@ def redact_text_pdf(
1418
  pymupdf_doc: List = [], # List of PyMuPDF documents
1419
  pii_identification_method: str = "Local",
1420
  comprehend_query_number:int = 0,
 
1421
  page_break_val: int = int(page_break_value), # Value for page break
1422
  max_time: int = int(max_time_value),
1423
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1443,12 +1414,18 @@ def redact_text_pdf(
1443
  - all_decision_process_table: DataFrame for decision process table
1444
  - pymupdf_doc: List of PyMuPDF documents
1445
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1446
- - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
1447
  - page_break_val: Value for page break
1448
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1449
  - progress: Progress tracking object
1450
  '''
1451
 
 
1452
  tic = time.perf_counter()
1453
 
1454
  # Open with Pikepdf to get text lines
@@ -1500,7 +1477,7 @@ def redact_text_pdf(
1500
  decision_process_table_on_page = pd.DataFrame()
1501
  page_text_outputs = pd.DataFrame()
1502
 
1503
- if analysis_type == "Simple text analysis - PDFs with selectable text":
1504
  for n, text_container in enumerate(page_layout):
1505
 
1506
  text_container_analyser_results = []
 
8
  from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
10
  ImageFile.LOAD_TRUNCATED_IMAGES = True
 
11
  from typing import List, Dict, Tuple
12
  import pandas as pd
13
 
 
18
  import pymupdf
19
  from pymupdf import Rect
20
  from fitz import Document, Page
 
21
  import gradio as gr
22
  from gradio import Progress
23
  from collections import defaultdict # For efficient grouping
24
 
25
  from presidio_analyzer import RecognizerResult
26
+ from tools.aws_functions import RUN_AWS_FUNCTIONS
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file, image_dpi
29
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
30
+ from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
31
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 
32
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 
33
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
34
 
35
  # Number of pages to loop through before breaking. Currently set very high, as the functions break on elapsed time (e.g. every 105 seconds) rather than on the number of pages redacted.
 
36
  page_break_value = get_or_create_env_var('page_break_value', '500')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
  max_time_value = get_or_create_env_var('max_time_value', '105')
40
  print(f'The value of max_time_value is {max_time_value}')
41
 
 
42
  def sum_numbers_before_seconds(string:str):
43
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
44
 
 
186
  else:
187
  in_allow_list_flat = []
188
 
 
189
 
190
+ # Try to connect to AWS services only if the RUN_AWS_FUNCTIONS environment variable is "1"
191
+ if pii_identification_method == "AWS Comprehend":
192
+ print("Trying to connect to AWS Comprehend service")
193
+ if RUN_AWS_FUNCTIONS == "1":
194
+ comprehend_client = boto3.client('comprehend')
195
+ else:
196
+ comprehend_client = ""
197
+ out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
198
+ print(out_message)
199
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
200
+ else:
201
+ comprehend_client = ""
202
+
203
+ if in_redact_method == textract_option:
204
+ print("Trying to connect to AWS Comprehend service")
205
+ if RUN_AWS_FUNCTIONS == "1":
206
+ textract_client = boto3.client('textract')
207
+ else:
208
+ textract_client = ""
209
+ out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
210
+ print(out_message)
211
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
212
+ else:
213
+ textract_client = ""
214
+
215
+ progress(0.5, desc="Redacting file")
216
 
217
  if isinstance(file_paths, str):
218
  file_paths_list = [file_paths]
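The two branches above repeat one pattern: build a boto3 client only when RUN_AWS_FUNCTIONS is "1", and otherwise fall back to an empty-string sentinel that downstream functions check. A hedged sketch of that pattern as a shared helper (make_aws_client is hypothetical, not in the source):

import boto3

def make_aws_client(service_name: str, run_aws_functions: str):
    # Returns a real client only when AWS functions are enabled;
    # "" is the sentinel later tested with e.g. comprehend_client == ""
    if run_aws_functions == "1":
        return boto3.client(service_name)
    return ""

# comprehend_client = make_aws_client("comprehend", RUN_AWS_FUNCTIONS)
# textract_client = make_aws_client("textract", RUN_AWS_FUNCTIONS)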
 
236
  if is_a_pdf == False:
237
  # If user has not submitted a pdf, assume it's an image
238
  print("File is not a pdf, assuming that image analysis needs to be used.")
239
+ in_redact_method = tesseract_ocr_option
240
  else:
241
  out_message = "No file selected"
242
  print(out_message)
243
 
244
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
245
 
246
+ if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
247
 
248
+
249
 
250
  # Analyse and redact image-based PDF or image
251
  if is_pdf_or_image(file_path) == False:
252
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
253
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
254
 
255
  print("Redacting file " + file_path_without_ext + " as an image-based file")
256
 
 
274
  all_decision_process_table,
275
  pymupdf_doc,
276
  pii_identification_method,
277
+ comprehend_query_number,
278
+ comprehend_client,
279
+ textract_client)
280
 
281
  # Save Textract request metadata (if exists)
282
  if new_request_metadata:
283
  print("Request metadata:", new_request_metadata)
284
  all_request_metadata.append(new_request_metadata)
285
 
286
+ elif in_redact_method == text_ocr_option:
287
 
288
  logging_file_paths = ""
289
 
 
301
  in_allow_list_flat,
302
  page_min,
303
  page_max,
304
+ text_ocr_option,
305
  current_loop_page,
306
  page_break_return,
307
  annotations_all_pages,
 
309
  all_decision_process_table,
310
  pymupdf_doc,
311
  pii_identification_method,
312
+ comprehend_query_number,
313
+ comprehend_client)
314
 
315
  else:
316
  out_message = "No redaction method selected"
 
343
 
344
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
345
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
 
346
  out_file_paths.append(logs_output_file_name)
347
 
348
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
349
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
 
350
  out_file_paths.append(all_text_output_file_name)
351
 
352
+ # Save the gradio_annotation_boxes to a JSON file
353
+ try:
354
+ out_annotation_file_path = out_image_file_path + '_redactions.json'
355
+ with open(out_annotation_file_path, 'w') as f:
356
+ json.dump(annotations_all_pages, f)
357
+ out_file_paths.append(out_annotation_file_path)
358
+ except Exception as e:
359
+ print("Could not save annotations to json file.")
360
+
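The saved _redactions.json is what the commit lets users upload later to resume a review. Each entry pairs a page image with its boxes; a sketch of reloading one (the path and the exact field layout are illustrative of the gradio_image_annotation format, not guaranteed):

import json

with open("output/example_redactions.json") as f:  # hypothetical path
    annotations_all_pages = json.load(f)

# Expected shape per page, roughly:
# {"image": "output/page_1.png",
#  "boxes": [{"xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40, "label": "Redaction"}]}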
361
  # Make a combined message for the file
362
  if isinstance(out_message, list):
363
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
 
373
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
374
  print("Estimated total processing time:", str(estimate_total_processing_time))
375
 
376
  else:
377
  toc = time.perf_counter()
378
  time_taken = toc - tic
 
491
 
492
  return x1, new_y1, x2, new_y2
493
 
494
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
495
  '''
496
  Converts coordinates from pymupdf format to image coordinates,
 
594
  # Should already be in correct format if img_annotator_box is an input
595
  if isinstance(annot, dict):
596
  img_annotation_box = annot
597
 
598
  x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
599
 
 
788
  is_a_pdf:bool=True,
789
  page_min:int=0,
790
  page_max:int=999,
791
+ analysis_type:str=tesseract_ocr_option,
792
  handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
793
  request_metadata:str="", current_loop_page:int=0,
794
  page_break_return:bool=False,
 
799
  pymupdf_doc = [],
800
  pii_identification_method:str="Local",
801
  comprehend_query_number:int=0,
802
+ comprehend_client="",
803
+ textract_client="",
804
  page_break_val:int=int(page_break_value),
805
  logging_file_paths:List=[],
806
  max_time:int=int(max_time_value),
 
818
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
819
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
820
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
821
+ - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
822
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
823
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
824
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
 
829
  - pymupdf_doc (List, optional): The document as a PyMuPDF object.
830
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
831
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
832
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
833
+ - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
834
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
835
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
836
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
843
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
844
  comprehend_query_number_new = 0
845
 
846
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
847
+ print("Connection to AWS Comprehend service unsuccessful.")
848
+
849
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
850
+
851
+ if analysis_type == textract_option and textract_client == "":
852
+ print("Connection to AWS Textract service unsuccessful.")
853
+
854
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
855
 
856
  tic = time.perf_counter()
857
 
 
874
  print("Page range:", str(page_min + 1), "to", str(page_max))
875
  #print("Current_loop_page:", current_loop_page)
876
 
877
+ if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
878
+ elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
879
 
880
  if current_loop_page == 0: page_loop_start = 0
881
  else: page_loop_start = current_loop_page
 
919
  else: ocr_lang = language
920
 
921
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
922
+ if analysis_type == tesseract_ocr_option:
923
 
924
  word_level_ocr_results = image_analyser.perform_ocr(image)
925
 
 
928
 
929
 
930
  # Import results from json and convert
931
+ if analysis_type == textract_option:
932
 
933
  # Convert the image to bytes using an in-memory buffer
934
  image_buffer = io.BytesIO()
 
939
  json_file_path = output_folder + file_name + "_textract.json"
940
 
941
  if not os.path.exists(json_file_path):
942
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client) # Analyse page with Textract
943
  logging_file_paths.append(json_file_path)
944
  request_metadata = request_metadata + "\n" + new_request_metadata
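analyse_page_with_textract now receives the client explicitly. A sketch of what such a wrapper plausibly does with it, using the synchronous Textract API (the real helper in tools/aws_textract may use analyze_document and richer metadata instead):

def analyse_page_with_textract_sketch(pdf_page_as_bytes, page_no, textract_client):
    # detect_document_text accepts a single page image as raw bytes
    response = textract_client.detect_document_text(Document={"Bytes": pdf_page_as_bytes})
    text_blocks = response["Blocks"]
    request_metadata = f"page {page_no}: {response['ResponseMetadata'].get('RequestId', '')}"
    return text_blocks, request_metadata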
945
 
 
987
  line_level_ocr_results,
988
  line_level_ocr_results_with_children,
989
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
990
+ pii_identification_method = pii_identification_method,
991
+ comprehend_client=comprehend_client,
992
  language=language,
993
  entities=chosen_redact_entities,
994
  allow_list=allow_list,
 
996
  )
997
 
998
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
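Each increment of comprehend_query_number corresponds to a PII query of this general shape (comprehend_client comes from the setup earlier; the sample text is made up):

line_text = "Contact jane.doe@example.com for details."  # hypothetical OCR line
response = comprehend_client.detect_pii_entities(Text=line_text, LanguageCode="en")
for entity in response["Entities"]:
    # Each entity reports a Type (e.g. "EMAIL"), a confidence Score, and character offsets
    print(entity["Type"], entity["Score"], entity["BeginOffset"], entity["EndOffset"])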
999
 
1000
  else:
1001
  redaction_bboxes = []
1002
 
1003
 
1004
+ if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1005
+ elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1006
 
1007
  # Save decision making process
1008
  bboxes_str = str(redaction_bboxes)
 
1379
  allow_list: List[str] = None, # Optional list of allowed entities
1380
  page_min: int = 0, # Minimum page number to start redaction
1381
  page_max: int = 999, # Maximum page number to end redaction
1382
+ analysis_type: str = text_ocr_option, # Type of analysis to perform
1383
  current_loop_page: int = 0, # Current page being processed in the loop
1384
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1385
  annotations_all_pages: List = [], # List of annotations across all pages
 
1388
  pymupdf_doc: List = [], # List of PyMuPDF documents
1389
  pii_identification_method: str = "Local",
1390
  comprehend_query_number:int = 0,
1391
+ comprehend_client="",
1392
  page_break_val: int = int(page_break_value), # Value for page break
1393
  max_time: int = int(max_time_value),
1394
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1414
  - all_decision_process_table: DataFrame for decision process table
1415
  - pymupdf_doc: List of PyMuPDF documents
1416
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1417
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1418
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1419
  - page_break_val: Value for page break
1420
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1421
  - progress: Progress tracking object
1422
  '''
1423
 
1424
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1425
+ print("Connection to AWS Comprehend service not found.")
1426
+
1427
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1428
+
1429
  tic = time.perf_counter()
1430
 
1431
  # Open with Pikepdf to get text lines
 
1477
  decision_process_table_on_page = pd.DataFrame()
1478
  page_text_outputs = pd.DataFrame()
1479
 
1480
+ if analysis_type == text_ocr_option:
1481
  for n, text_container in enumerate(page_layout):
1482
 
1483
  text_container_analyser_results = []
tools/helper_functions.py CHANGED
@@ -29,6 +29,16 @@ def get_or_create_env_var(var_name, default_value):
29
 
30
  return value
31
 
32
  # Retrieving or setting output folder
33
  env_var_name = 'GRADIO_OUTPUT_FOLDER'
34
  default_value = 'output/'
 
29
 
30
  return value
31
 
32
+
33
+ # Names for options labels
34
+ text_ocr_option = "Simple text analysis - docs with selectable text"
35
+ tesseract_ocr_option = "OCR analysis for documents without selectable text - best for typed text"
36
+ textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
37
+
38
+ local_pii_detector = "Local"
39
+ aws_pii_detector = "AWS Comprehend"
40
+
41
+
42
  # Retrieving or setting output folder
43
  env_var_name = 'GRADIO_OUTPUT_FOLDER'
44
  default_value = 'output/'
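For context, get_or_create_env_var (whose signature and return appear at the top of this file) most plausibly reads the variable and falls back to the default, along these lines (a sketch, not the verbatim body):

import os

def get_or_create_env_var(var_name, default_value):
    # Use the existing environment value if present, otherwise set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value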
tools/redaction_review.py CHANGED
@@ -47,24 +47,32 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
47
 
48
  return current_zoom_level, annotate_current_page
49
 
50
-
51
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
52
- # print("\nImage annotator object:", image_annotator_object)
53
 
54
  zoom_str = str(zoom) + '%'
55
 
56
  if not image_annotator_object:
57
- return image_annotator(
58
  label="Modify redaction boxes",
59
  #label_list=["Redaction"],
60
  #label_colors=[(0, 0, 0)],
61
  show_label=False,
62
- sources=["upload"],
63
  show_clear_button=False,
64
  show_share_button=False,
65
  show_remove_button=False,
66
- interactive=False
67
- ), gr.Number(label = "Page (press enter to change)", value=1, precision=0)
68
 
69
  if page_num is None:
70
  page_num = 0
@@ -72,8 +80,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
72
  # Check bounding values for current page and page max
73
  if page_num > 0:
74
  page_num_reported = page_num
75
- #page_num = page_num - 1
76
  elif page_num == 0: page_num_reported = 1
 
77
  else:
78
  page_num = 0
79
  page_num_reported = 1
@@ -83,7 +92,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
83
  if page_num_reported > page_max_reported:
84
  page_num_reported = page_max_reported
85
 
86
- out_image_annotator = image_annotator(value = image_annotator_object[page_num_reported - 1],
87
  boxes_alpha=0.1,
88
  box_thickness=1,
89
  #label_list=["Redaction"],
@@ -104,30 +115,26 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
104
 
105
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
106
 
107
- return out_image_annotator, number_reported, number_reported
108
 
109
- def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
110
  '''
111
  Overwrite current image annotations with modifications
112
  '''
113
-
114
  if not current_page:
115
  current_page = 1
116
 
117
  #If no previous page or is 0, i.e. first time run, then rewrite current page
118
- if not previous_page:
119
- previous_page = current_page
120
- #return all_image_annotations, current_page, current_page
121
-
122
- #print("all_image_annotations before:",all_image_annotations)
123
 
124
  image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
125
 
126
- #print("image_annotated:", image_annotated)
127
-
128
- all_image_annotations[previous_page - 1] = image_annotated
129
-
130
- #print("all_image_annotations after:",all_image_annotations)
131
 
132
  return all_image_annotations, current_page, current_page
133
 
@@ -178,7 +185,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
178
 
179
  draw.rectangle(coords, fill=fill)
180
 
181
- image.save(output_folder + file_base + "_redacted_mod.png")
182
 
183
  doc = [image]
184
 
@@ -213,13 +220,13 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
213
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
214
 
215
  #try:
216
- out_pdf_file_path = output_folder + file_base + "_redacted_mod.pdf"
217
  unredacted_doc.save(out_pdf_file_path)
218
  output_files.append(out_pdf_file_path)
219
 
220
  # Save the gradio_annotation_boxes to a JSON file
221
  try:
222
- out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
223
  with open(out_annotation_file_path, 'w') as f:
224
  json.dump(all_image_annotations, f)
225
  output_files.append(out_annotation_file_path)
 
47
 
48
  return current_zoom_level, annotate_current_page
49
 
 
50
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
51
+ '''
52
+ Update a gradio_image_annotation object with new annotation data
53
+ '''
54
 
55
  zoom_str = str(zoom) + '%'
56
 
57
  if not image_annotator_object:
58
+ out_image_annotator = image_annotator(
59
  label="Modify redaction boxes",
60
  #label_list=["Redaction"],
61
  #label_colors=[(0, 0, 0)],
62
+ height=zoom_str,
63
+ width=zoom_str,
64
  show_label=False,
65
+ sources=None,
66
  show_clear_button=False,
67
  show_share_button=False,
68
  show_remove_button=False,
69
+ interactive=False)
70
+
71
+ number_reported = gr.Number(label = "Page (press enter to change)", value=1, precision=0)
72
+
73
+ return out_image_annotator, number_reported, number_reported
74
+
75
+ print("page_num at start of update_annotator function:", page_num)
76
 
77
  if page_num is None:
78
  page_num = 0
 
80
  # Check bounding values for current page and page max
81
  if page_num > 0:
82
  page_num_reported = page_num
83
+
84
  elif page_num == 0: page_num_reported = 1
85
+
86
  else:
87
  page_num = 0
88
  page_num_reported = 1
 
92
  if page_num_reported > page_max_reported:
93
  page_num_reported = page_max_reported
94
 
95
+
96
+ out_image_annotator = image_annotator(
97
+ value = image_annotator_object[page_num_reported - 1],
98
  boxes_alpha=0.1,
99
  box_thickness=1,
100
  #label_list=["Redaction"],
 
115
 
116
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
117
 
118
+ return out_image_annotator, number_reported, number_reported, page_num_reported
119
 
120
+ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
121
  '''
122
  Overwrite current image annotations with modifications
123
  '''
124
+
125
  if not current_page:
126
  current_page = 1
127
 
128
  #If no previous page or is 0, i.e. first time run, then rewrite current page
129
+ #if not previous_page:
130
+ # previous_page = current_page
131
 
132
  image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
133
 
134
+ if clear_all == False:
135
+ all_image_annotations[previous_page - 1] = image_annotated
136
+ else:
137
+ all_image_annotations[previous_page - 1]["boxes"] = []
 
138
 
139
  return all_image_annotations, current_page, current_page
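The new clear_all flag lets one handler serve both "apply my edits" and "clear this page". A sketch of wiring the clearing path in Gradio (component names are hypothetical; gr.State(True) simply pins clear_all to True for this event):

clear_btn.click(
    modify_existing_page_redactions,
    inputs=[annotator, current_page_num, previous_page_num, all_annotations_state, gr.State(True)],
    outputs=[all_annotations_state, current_page_num, previous_page_num],
)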
140
 
 
185
 
186
  draw.rectangle(coords, fill=fill)
187
 
188
+ image.save(output_folder + file_base + "_redacted.png")
189
 
190
  doc = [image]
191
 
 
220
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
221
 
222
  #try:
223
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
224
  unredacted_doc.save(out_pdf_file_path)
225
  output_files.append(out_pdf_file_path)
226
 
227
  # Save the gradio_annotation_boxes to a JSON file
228
  try:
229
+ out_annotation_file_path = output_folder + file_base + '_redactions.json'
230
  with open(out_annotation_file_path, 'w') as f:
231
  json.dump(all_image_annotations, f)
232
  output_files.append(out_annotation_file_path)