Commit
·
8235bbb
1
Parent(s):
f0f9378
Improved logging
Browse files- README.md +4 -2
- app.py +45 -47
- tools/custom_image_analyser_engine.py +4 -1
- tools/file_redaction.py +43 -29
- tools/helper_functions.py +54 -55
README.md
CHANGED
@@ -11,13 +11,15 @@ license: agpl-3.0
|
|
11 |
|
12 |
# Document redaction
|
13 |
|
14 |
-
Redact
|
|
|
|
|
15 |
|
16 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
17 |
|
18 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
19 |
|
20 |
-
This app accepts a maximum file size of
|
21 |
|
22 |
# USER GUIDE
|
23 |
|
|
|
11 |
|
12 |
# Document redaction
|
13 |
|
14 |
+
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
15 |
+
|
16 |
+
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
17 |
|
18 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
19 |
|
20 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
21 |
|
22 |
+
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
23 |
|
24 |
# USER GUIDE
|
25 |
|
app.py
CHANGED
@@ -4,20 +4,20 @@ import socket
|
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
|
|
|
|
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
14 |
from tools.data_anonymise import anonymise_data_files
|
15 |
from tools.auth import authenticate_user
|
16 |
-
#from tools.aws_functions import load_data_from_aws
|
17 |
-
import gradio as gr
|
18 |
-
import pandas as pd
|
19 |
|
20 |
-
|
21 |
today_rev = datetime.now().strftime("%Y%m%d")
|
22 |
|
23 |
add_folder_to_path("tesseract/")
|
@@ -36,12 +36,10 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
|
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
39 |
-
|
40 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
41 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
42 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
43 |
|
44 |
-
|
45 |
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
46 |
tesseract_ocr_option = "Quick image analysis - typed text"
|
47 |
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
@@ -70,10 +68,6 @@ with app:
|
|
70 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
71 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
72 |
|
73 |
-
def reset_state_vars():
|
74 |
-
return [], [], pd.DataFrame(), pd.DataFrame()
|
75 |
-
|
76 |
-
|
77 |
in_allow_list_state = gr.State(pd.DataFrame())
|
78 |
|
79 |
session_hash_state = gr.State()
|
@@ -88,25 +82,32 @@ with app:
|
|
88 |
output_image_files_state = gr.State([])
|
89 |
output_file_list_state = gr.State([])
|
90 |
text_output_file_list_state = gr.State([])
|
91 |
-
log_files_output_list_state = gr.State([])
|
92 |
|
|
|
93 |
# Logging state
|
94 |
-
|
|
|
|
|
95 |
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
96 |
-
access_logs_state = gr.State(access_logs_folder +
|
97 |
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
98 |
-
usage_logs_state = gr.State(usage_logs_folder +
|
99 |
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
-
# Invisible elements effectively used as state variables
|
102 |
-
session_hash_textbox = gr.Textbox(value="", visible=False) # Invisible text box to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
103 |
-
textract_metadata_textbox = gr.Textbox(value="", visible=False)
|
104 |
-
doc_file_name_textbox = gr.Textbox(value="", visible=False)
|
105 |
-
doc_file_name_with_extension_textbox = gr.Textbox(value="", visible=False)
|
106 |
-
data_file_name_textbox = gr.Textbox(value="", visible=False)
|
107 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
108 |
-
estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
109 |
-
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
110 |
|
111 |
|
112 |
###
|
@@ -114,17 +115,17 @@ with app:
|
|
114 |
###
|
115 |
|
116 |
gr.Markdown(
|
117 |
-
"""
|
118 |
-
# Document redaction
|
119 |
|
120 |
-
Redact
|
|
|
|
|
121 |
|
122 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
123 |
|
124 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
125 |
|
126 |
-
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
127 |
-
""")
|
128 |
|
129 |
# PDF / IMAGES TAB
|
130 |
with gr.Tab("PDFs/images"):
|
@@ -148,7 +149,7 @@ with app:
|
|
148 |
|
149 |
# Feedback elements are invisible until revealed by redaction action
|
150 |
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
151 |
-
pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
|
152 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
153 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
154 |
|
@@ -226,9 +227,6 @@ with app:
|
|
226 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
227 |
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
233 |
with gr.Row():
|
234 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
@@ -257,15 +255,15 @@ with app:
|
|
257 |
###
|
258 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
|
259 |
|
260 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
261 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
262 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
263 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
264 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
265 |
|
266 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
267 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
268 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
269 |
|
270 |
# If a file has been completed, the function will continue onto the next document
|
271 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
|
@@ -321,27 +319,27 @@ with app:
|
|
321 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
322 |
|
323 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
324 |
-
access_callback = gr.CSVLogger()
|
325 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
326 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
327 |
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
328 |
|
329 |
# User submitted feedback for pdf redactions
|
330 |
-
pdf_callback = gr.CSVLogger()
|
331 |
-
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text,
|
332 |
-
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text,
|
333 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
334 |
|
335 |
# User submitted feedback for data redactions
|
336 |
-
data_callback = gr.CSVLogger()
|
337 |
-
data_callback.setup([data_feedback_radio, data_further_details_text,
|
338 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text,
|
339 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
340 |
|
341 |
# Log processing time/token usage when making a query
|
342 |
-
usage_callback = gr.CSVLogger()
|
343 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
|
344 |
-
estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
|
345 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
346 |
|
347 |
# Launch the Gradio app
|
|
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
from datetime import datetime
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
|
12 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
|
13 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
17 |
from tools.data_anonymise import anonymise_data_files
|
18 |
from tools.auth import authenticate_user
|
|
|
|
|
|
|
19 |
|
20 |
+
|
21 |
today_rev = datetime.now().strftime("%Y%m%d")
|
22 |
|
23 |
add_folder_to_path("tesseract/")
|
|
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
|
|
39 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
40 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
41 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
42 |
|
|
|
43 |
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
44 |
tesseract_ocr_option = "Quick image analysis - typed text"
|
45 |
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
|
|
68 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
69 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
70 |
|
|
|
|
|
|
|
|
|
71 |
in_allow_list_state = gr.State(pd.DataFrame())
|
72 |
|
73 |
session_hash_state = gr.State()
|
|
|
82 |
output_image_files_state = gr.State([])
|
83 |
output_file_list_state = gr.State([])
|
84 |
text_output_file_list_state = gr.State([])
|
85 |
+
log_files_output_list_state = gr.State([])
|
86 |
|
87 |
+
|
88 |
# Logging state
|
89 |
+
log_file_name = 'log.csv'
|
90 |
+
|
91 |
+
feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
|
92 |
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
93 |
+
access_logs_state = gr.State(access_logs_folder + log_file_name)
|
94 |
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
95 |
+
usage_logs_state = gr.State(usage_logs_folder + log_file_name)
|
96 |
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
97 |
+
|
98 |
+
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
99 |
+
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
100 |
+
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
|
101 |
+
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
|
102 |
+
|
103 |
+
doc_file_name_textbox = gr.Textbox(label = "doc_file_name_textbox", value="", visible=False)
|
104 |
+
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
105 |
+
data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
|
106 |
+
|
107 |
+
estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
108 |
+
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
|
|
|
|
111 |
|
112 |
|
113 |
###
|
|
|
115 |
###
|
116 |
|
117 |
gr.Markdown(
|
118 |
+
"""# Document redaction
|
|
|
119 |
|
120 |
+
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
121 |
+
|
122 |
+
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
123 |
|
124 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
125 |
|
126 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
127 |
|
128 |
+
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.""")
|
|
|
129 |
|
130 |
# PDF / IMAGES TAB
|
131 |
with gr.Tab("PDFs/images"):
|
|
|
149 |
|
150 |
# Feedback elements are invisible until revealed by redaction action
|
151 |
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
152 |
+
pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
|
153 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
154 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
155 |
|
|
|
227 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
228 |
|
229 |
|
|
|
|
|
|
|
230 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
231 |
with gr.Row():
|
232 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
|
|
255 |
###
|
256 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
|
257 |
|
258 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
|
259 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
260 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
261 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")#.\
|
262 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
263 |
|
264 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
265 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
266 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
|
267 |
|
268 |
# If a file has been completed, the function will continue onto the next document
|
269 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
|
|
|
319 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
320 |
|
321 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
322 |
+
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
323 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
324 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
325 |
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
326 |
|
327 |
# User submitted feedback for pdf redactions
|
328 |
+
pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
329 |
+
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], feedback_logs_folder)
|
330 |
+
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], None, preprocess=False).\
|
331 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
332 |
|
333 |
# User submitted feedback for data redactions
|
334 |
+
data_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
335 |
+
data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
|
336 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
|
337 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
338 |
|
339 |
# Log processing time/token usage when making a query
|
340 |
+
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
341 |
+
usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
|
342 |
+
estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
343 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
344 |
|
345 |
# Launch the Gradio app
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -471,6 +471,7 @@ class CustomImageAnalyzerEngine:
|
|
471 |
|
472 |
horizontal_buffer = 0 # add pixels to right of width
|
473 |
height_buffer = 2 # add pixels to bounding box height
|
|
|
474 |
|
475 |
allow_list = text_analyzer_kwargs.get('allow_list', [])
|
476 |
|
@@ -494,6 +495,8 @@ class CustomImageAnalyzerEngine:
|
|
494 |
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
495 |
)
|
496 |
|
|
|
|
|
497 |
for result in response["Entities"]:
|
498 |
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
499 |
|
@@ -577,7 +580,7 @@ class CustomImageAnalyzerEngine:
|
|
577 |
|
578 |
combined_results.extend(line_results)
|
579 |
|
580 |
-
return combined_results
|
581 |
|
582 |
@staticmethod
|
583 |
def map_analyzer_results_to_bounding_boxes(
|
|
|
471 |
|
472 |
horizontal_buffer = 0 # add pixels to right of width
|
473 |
height_buffer = 2 # add pixels to bounding box height
|
474 |
+
comprehend_query_number = 0
|
475 |
|
476 |
allow_list = text_analyzer_kwargs.get('allow_list', [])
|
477 |
|
|
|
495 |
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
496 |
)
|
497 |
|
498 |
+
comprehend_query_number += 1
|
499 |
+
|
500 |
for result in response["Entities"]:
|
501 |
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
502 |
|
|
|
580 |
|
581 |
combined_results.extend(line_results)
|
582 |
|
583 |
+
return combined_results, comprehend_query_number
|
584 |
|
585 |
@staticmethod
|
586 |
def map_analyzer_results_to_bounding_boxes(
|
tools/file_redaction.py
CHANGED
@@ -91,6 +91,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
pii_identification_method:str="Local",
|
|
|
94 |
progress=gr.Progress(track_tqdm=True)):
|
95 |
'''
|
96 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -120,6 +121,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
120 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
121 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
122 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
123 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
124 |
|
125 |
The function returns a redacted document along with processing logs.
|
@@ -171,7 +173,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
171 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
172 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
173 |
|
174 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
175 |
|
176 |
# If we have reached the last page, return message
|
177 |
if current_loop_page >= number_of_pages:
|
@@ -181,7 +183,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
181 |
current_loop_page = 999
|
182 |
combined_out_message = out_message
|
183 |
|
184 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
185 |
|
186 |
# Create allow list
|
187 |
if not in_allow_list.empty:
|
@@ -220,7 +222,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
220 |
out_message = "No file selected"
|
221 |
print(out_message)
|
222 |
|
223 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
224 |
|
225 |
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
|
226 |
|
@@ -231,16 +233,16 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
231 |
except:
|
232 |
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
233 |
print(out_message)
|
234 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
|
235 |
|
236 |
#Analyse and redact image-based pdf or image
|
237 |
if is_pdf_or_image(file_path) == False:
|
238 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
239 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
|
240 |
|
241 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
242 |
|
243 |
-
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
|
244 |
prepared_pdf_image_paths,
|
245 |
language,
|
246 |
chosen_redact_entities,
|
@@ -259,7 +261,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
259 |
all_line_level_ocr_results_df,
|
260 |
all_decision_process_table,
|
261 |
pymupdf_doc,
|
262 |
-
pii_identification_method
|
|
|
263 |
|
264 |
# Save Textract request metadata (if exists)
|
265 |
if new_request_metadata:
|
@@ -272,12 +275,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
272 |
|
273 |
if is_pdf(file_path) == False:
|
274 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
275 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
276 |
|
277 |
# Analyse text-based pdf
|
278 |
print('Redacting file as text-based PDF')
|
279 |
|
280 |
-
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
281 |
prepared_pdf_image_paths,language,
|
282 |
chosen_redact_entities,
|
283 |
chosen_redact_comprehend_entities,
|
@@ -291,12 +294,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
291 |
all_line_level_ocr_results_df,
|
292 |
all_decision_process_table,
|
293 |
pymupdf_doc,
|
294 |
-
pii_identification_method
|
|
|
295 |
|
296 |
else:
|
297 |
out_message = "No redaction method selected"
|
298 |
print(out_message)
|
299 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
300 |
|
301 |
# If at last page, save to file
|
302 |
if current_loop_page >= number_of_pages:
|
@@ -392,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
392 |
|
393 |
# If textract requests made, write to logging file
|
394 |
if all_request_metadata:
|
395 |
-
all_request_metadata_str = '\n'.join(all_request_metadata)
|
396 |
|
397 |
all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
|
398 |
|
@@ -412,7 +416,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
412 |
out_file_paths = list(set(out_file_paths))
|
413 |
|
414 |
|
415 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
416 |
|
417 |
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
418 |
'''
|
@@ -769,9 +773,10 @@ def redact_image_pdf(file_path:str,
|
|
769 |
all_decision_process_table = pd.DataFrame(),
|
770 |
pymupdf_doc = [],
|
771 |
pii_identification_method:str="Local",
|
|
|
772 |
page_break_val:int=int(page_break_value),
|
773 |
logging_file_paths:List=[],
|
774 |
-
max_time:int=int(max_time_value),
|
775 |
progress=Progress(track_tqdm=True)):
|
776 |
|
777 |
'''
|
@@ -796,9 +801,10 @@ def redact_image_pdf(file_path:str,
|
|
796 |
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
797 |
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
798 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
799 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
800 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
801 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
802 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
803 |
|
804 |
The function returns a fully or partially-redacted PDF document.
|
@@ -806,6 +812,7 @@ def redact_image_pdf(file_path:str,
|
|
806 |
file_name = get_file_path_end(file_path)
|
807 |
fill = (0, 0, 0) # Fill colour
|
808 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
|
|
809 |
|
810 |
#print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
|
811 |
|
@@ -836,7 +843,6 @@ def redact_image_pdf(file_path:str,
|
|
836 |
if current_loop_page == 0: page_loop_start = 0
|
837 |
else: page_loop_start = current_loop_page
|
838 |
|
839 |
-
#progress_bar = progress.tqdm(range(page_loop_start, number_of_pages), unit="pages", desc="Redacting pages")
|
840 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
841 |
|
842 |
for page_no in progress_bar:
|
@@ -872,8 +878,7 @@ def redact_image_pdf(file_path:str,
|
|
872 |
page_width, page_height = image.size
|
873 |
|
874 |
# Possibility to use different languages
|
875 |
-
if language == 'en':
|
876 |
-
ocr_lang = 'eng'
|
877 |
else: ocr_lang = language
|
878 |
|
879 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
@@ -943,7 +948,7 @@ def redact_image_pdf(file_path:str,
|
|
943 |
|
944 |
pii_identification_method= "AWS Comprehend" #"Local"
|
945 |
|
946 |
-
redaction_bboxes = image_analyser.analyze_text(
|
947 |
line_level_ocr_results,
|
948 |
line_level_ocr_results_with_children,
|
949 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
@@ -954,6 +959,8 @@ def redact_image_pdf(file_path:str,
|
|
954 |
score_threshold=score_threshold
|
955 |
)
|
956 |
|
|
|
|
|
957 |
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
958 |
# line_level_ocr_results_with_children,
|
959 |
# language,
|
@@ -1063,7 +1070,7 @@ def redact_image_pdf(file_path:str,
|
|
1063 |
|
1064 |
current_loop_page += 1
|
1065 |
|
1066 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1067 |
|
1068 |
if is_pdf(file_path) == False:
|
1069 |
images.append(image)
|
@@ -1079,9 +1086,9 @@ def redact_image_pdf(file_path:str,
|
|
1079 |
progress.close(_tqdm=progress_bar)
|
1080 |
tqdm._instances.clear()
|
1081 |
|
1082 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1083 |
|
1084 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1085 |
|
1086 |
|
1087 |
###
|
@@ -1299,7 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1299 |
'''
|
1300 |
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
|
1301 |
'''
|
1302 |
-
|
1303 |
analyser_results = []
|
1304 |
|
1305 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
@@ -1323,6 +1330,8 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1323 |
LanguageCode=language # Specify the language of the text
|
1324 |
)
|
1325 |
|
|
|
|
|
1326 |
for result in response["Entities"]:
|
1327 |
|
1328 |
result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
|
@@ -1340,7 +1349,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1340 |
analyser_results = []
|
1341 |
|
1342 |
|
1343 |
-
return analyser_results
|
1344 |
|
1345 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1346 |
decision_process_table = pd.DataFrame()
|
@@ -1397,6 +1406,7 @@ def redact_text_pdf(
|
|
1397 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1398 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1399 |
pii_identification_method: str = "Local",
|
|
|
1400 |
page_break_val: int = int(page_break_value), # Value for page break
|
1401 |
max_time: int = int(max_time_value),
|
1402 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
@@ -1422,12 +1432,14 @@ def redact_text_pdf(
|
|
1422 |
- all_decision_process_table: DataFrame for decision process table
|
1423 |
- pymupdf_doc: List of PyMuPDF documents
|
1424 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
1425 |
- page_break_val: Value for page break
|
1426 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1427 |
- progress: Progress tracking object
|
1428 |
'''
|
1429 |
|
1430 |
tic = time.perf_counter()
|
|
|
1431 |
|
1432 |
# Open with Pikepdf to get text lines
|
1433 |
pikepdf_pdf = Pdf.open(filename)
|
@@ -1517,7 +1529,9 @@ def redact_text_pdf(
|
|
1517 |
|
1518 |
if chosen_redact_entities:
|
1519 |
|
1520 |
-
text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
|
|
|
|
1521 |
|
1522 |
else:
|
1523 |
text_line_analyser_result = []
|
@@ -1576,7 +1590,7 @@ def redact_text_pdf(
|
|
1576 |
|
1577 |
current_loop_page += 1
|
1578 |
|
1579 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
1580 |
|
1581 |
|
1582 |
annotations_all_pages.append(image_annotations)
|
@@ -1588,7 +1602,7 @@ def redact_text_pdf(
|
|
1588 |
page_break_return = True
|
1589 |
progress.close(_tqdm=progress_bar)
|
1590 |
|
1591 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
1592 |
|
1593 |
|
1594 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
|
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
pii_identification_method:str="Local",
|
94 |
+
comprehend_query_number:int=0,
|
95 |
progress=gr.Progress(track_tqdm=True)):
|
96 |
'''
|
97 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
121 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
122 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
123 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
124 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
125 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
126 |
|
127 |
The function returns a redacted document along with processing logs.
|
|
|
173 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
174 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
175 |
|
176 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
177 |
|
178 |
# If we have reached the last page, return message
|
179 |
if current_loop_page >= number_of_pages:
|
|
|
183 |
current_loop_page = 999
|
184 |
combined_out_message = out_message
|
185 |
|
186 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
187 |
|
188 |
# Create allow list
|
189 |
if not in_allow_list.empty:
|
|
|
222 |
out_message = "No file selected"
|
223 |
print(out_message)
|
224 |
|
225 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
226 |
|
227 |
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
|
228 |
|
|
|
233 |
except:
|
234 |
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
235 |
print(out_message)
|
236 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
237 |
|
238 |
#Analyse and redact image-based pdf or image
|
239 |
if is_pdf_or_image(file_path) == False:
|
240 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
241 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
242 |
|
243 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
244 |
|
245 |
+
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
246 |
prepared_pdf_image_paths,
|
247 |
language,
|
248 |
chosen_redact_entities,
|
|
|
261 |
all_line_level_ocr_results_df,
|
262 |
all_decision_process_table,
|
263 |
pymupdf_doc,
|
264 |
+
pii_identification_method,
|
265 |
+
comprehend_query_number)
|
266 |
|
267 |
# Save Textract request metadata (if exists)
|
268 |
if new_request_metadata:
|
|
|
275 |
|
276 |
if is_pdf(file_path) == False:
|
277 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
278 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
279 |
|
280 |
# Analyse text-based pdf
|
281 |
print('Redacting file as text-based PDF')
|
282 |
|
283 |
+
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(file_path,
|
284 |
prepared_pdf_image_paths,language,
|
285 |
chosen_redact_entities,
|
286 |
chosen_redact_comprehend_entities,
|
|
|
294 |
all_line_level_ocr_results_df,
|
295 |
all_decision_process_table,
|
296 |
pymupdf_doc,
|
297 |
+
pii_identification_method,
|
298 |
+
comprehend_query_number)
|
299 |
|
300 |
else:
|
301 |
out_message = "No redaction method selected"
|
302 |
print(out_message)
|
303 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
304 |
|
305 |
# If at last page, save to file
|
306 |
if current_loop_page >= number_of_pages:
|
|
|
396 |
|
397 |
# If textract requests made, write to logging file
|
398 |
if all_request_metadata:
|
399 |
+
all_request_metadata_str = '\n'.join(all_request_metadata).strip()
|
400 |
|
401 |
all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
|
402 |
|
|
|
416 |
out_file_paths = list(set(out_file_paths))
|
417 |
|
418 |
|
419 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
420 |
|
421 |
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
422 |
'''
|
|
|
773 |
all_decision_process_table = pd.DataFrame(),
|
774 |
pymupdf_doc = [],
|
775 |
pii_identification_method:str="Local",
|
776 |
+
comprehend_query_number:int=0,
|
777 |
page_break_val:int=int(page_break_value),
|
778 |
logging_file_paths:List=[],
|
779 |
+
max_time:int=int(max_time_value),
|
780 |
progress=Progress(track_tqdm=True)):
|
781 |
|
782 |
'''
|
|
|
801 |
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
802 |
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
803 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
804 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
805 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
806 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
807 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
808 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
809 |
|
810 |
The function returns a fully or partially-redacted PDF document.
|
|
|
812 |
file_name = get_file_path_end(file_path)
|
813 |
fill = (0, 0, 0) # Fill colour
|
814 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
815 |
+
comprehend_query_number_new = 0
|
816 |
|
817 |
#print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
|
818 |
|
|
|
843 |
if current_loop_page == 0: page_loop_start = 0
|
844 |
else: page_loop_start = current_loop_page
|
845 |
|
|
|
846 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
847 |
|
848 |
for page_no in progress_bar:
|
|
|
878 |
page_width, page_height = image.size
|
879 |
|
880 |
# Possibility to use different languages
|
881 |
+
if language == 'en': ocr_lang = 'eng'
|
|
|
882 |
else: ocr_lang = language
|
883 |
|
884 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
|
|
948 |
|
949 |
pii_identification_method= "AWS Comprehend" #"Local"
|
950 |
|
951 |
+
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
952 |
line_level_ocr_results,
|
953 |
line_level_ocr_results_with_children,
|
954 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
|
|
959 |
score_threshold=score_threshold
|
960 |
)
|
961 |
|
962 |
+
comprehend_query_number = comprehend_query_number_new
|
963 |
+
|
964 |
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
965 |
# line_level_ocr_results_with_children,
|
966 |
# language,
|
|
|
1070 |
|
1071 |
current_loop_page += 1
|
1072 |
|
1073 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1074 |
|
1075 |
if is_pdf(file_path) == False:
|
1076 |
images.append(image)
|
|
|
1086 |
progress.close(_tqdm=progress_bar)
|
1087 |
tqdm._instances.clear()
|
1088 |
|
1089 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1090 |
|
1091 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1092 |
|
1093 |
|
1094 |
###
|
|
|
1306 |
'''
|
1307 |
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
|
1308 |
'''
|
1309 |
+
comprehend_query_number = 0
|
1310 |
analyser_results = []
|
1311 |
|
1312 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
|
|
1330 |
LanguageCode=language # Specify the language of the text
|
1331 |
)
|
1332 |
|
1333 |
+
comprehend_query_number += 1
|
1334 |
+
|
1335 |
for result in response["Entities"]:
|
1336 |
|
1337 |
result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
|
|
|
1349 |
analyser_results = []
|
1350 |
|
1351 |
|
1352 |
+
return analyser_results, comprehend_query_number
|
1353 |
|
1354 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1355 |
decision_process_table = pd.DataFrame()
|
|
|
1406 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1407 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1408 |
pii_identification_method: str = "Local",
|
1409 |
+
comprehend_query_number:int = 0,
|
1410 |
page_break_val: int = int(page_break_value), # Value for page break
|
1411 |
max_time: int = int(max_time_value),
|
1412 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
1432 |
- all_decision_process_table: DataFrame for decision process table
|
1433 |
- pymupdf_doc: List of PyMuPDF documents
|
1434 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1435 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
1436 |
- page_break_val: Value for page break
|
1437 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1438 |
- progress: Progress tracking object
|
1439 |
'''
|
1440 |
|
1441 |
tic = time.perf_counter()
|
1442 |
+
comprehend_query_number_new = 0
|
1443 |
|
1444 |
# Open with Pikepdf to get text lines
|
1445 |
pikepdf_pdf = Pdf.open(filename)
|
|
|
1529 |
|
1530 |
if chosen_redact_entities:
|
1531 |
|
1532 |
+
text_line_analyser_result, comprehend_query_number_new = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
1533 |
+
|
1534 |
+
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1535 |
|
1536 |
else:
|
1537 |
text_line_analyser_result = []
|
|
|
1590 |
|
1591 |
current_loop_page += 1
|
1592 |
|
1593 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1594 |
|
1595 |
|
1596 |
annotations_all_pages.append(image_annotations)
|
|
|
1602 |
page_break_return = True
|
1603 |
progress.close(_tqdm=progress_bar)
|
1604 |
|
1605 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1606 |
|
1607 |
|
1608 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
tools/helper_functions.py
CHANGED
@@ -4,6 +4,9 @@ import gradio as gr
|
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
|
|
|
|
|
|
|
7 |
def get_or_create_env_var(var_name, default_value):
|
8 |
# Get the environment variable if it exists
|
9 |
value = os.environ.get(var_name)
|
@@ -183,64 +186,60 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
|
|
183 |
async def get_connection_params(request: gr.Request):
|
184 |
base_folder = ""
|
185 |
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
if CUSTOM_CLOUDFRONT_HEADER_var
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
print("Cognito ID found:", out_session_hash)
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
else:
|
242 |
-
print("No session parameters found.")
|
243 |
-
return "",""
|
244 |
|
245 |
|
246 |
def clean_unicode_text(text):
|
|
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
|
7 |
+
def reset_state_vars():
    """Return fresh initial values for the app's Gradio state variables.

    Returns:
        tuple: (annotations list, file-paths list, OCR results DataFrame,
        decision-process DataFrame, page counter, metadata string) — all
        empty/zeroed. Two *distinct* empty DataFrames are returned so that
        mutating one state value never affects the other.
    """
    empty_annotations = []
    empty_file_paths = []
    ocr_results = pd.DataFrame()
    decision_process = pd.DataFrame()
    return empty_annotations, empty_file_paths, ocr_results, decision_process, 0, ""
|
9 |
+
|
10 |
def get_or_create_env_var(var_name, default_value):
|
11 |
# Get the environment variable if it exists
|
12 |
value = os.environ.get(var_name)
|
|
|
186 |
async def get_connection_params(request: gr.Request):
    """Derive a session identifier and per-session output folder from a Gradio request.

    Identification priority:
        1. ``request.username`` — set on direct Cognito login.
        2. ``x-cognito-id`` header — passed through a Lambda authenticator.
        3. ``request.session_hash`` — anonymous Gradio session.

    If both ``CUSTOM_CLOUDFRONT_HEADER`` and ``CUSTOM_CLOUDFRONT_HEADER_VALUE``
    environment variables are set, an incoming header of that name is validated
    against the expected value.

    Args:
        request (gr.Request): The incoming Gradio request object.

    Returns:
        tuple: (session identifier, output folder path, session identifier).

    Raises:
        ValueError: If the custom CloudFront header is present but its value
            does not match the expected value.
    """
    base_folder = ""

    print("Session hash:", request.session_hash)

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')

    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
            if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
                print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
            else:
                # Bug fix: the original `raise(ValueError, "...")` raises a
                # *tuple*, which is a TypeError in Python 3 — the intended
                # ValueError was never actually raised.
                raise ValueError("Custom Cloudfront header value does not match expected value.")

    # Get output save folder from 1 - username passed in from direct Cognito login,
    # 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
    if request.username:
        out_session_hash = request.username
        base_folder = "user-files/"
        print("Request username found:", out_session_hash)
    elif 'x-cognito-id' in request.headers:
        out_session_hash = request.headers['x-cognito-id']
        base_folder = "user-files/"
        print("Cognito ID found:", out_session_hash)
    else:
        out_session_hash = request.session_hash
        base_folder = "temp-files/"
        # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)

    output_folder = base_folder + out_session_hash + "/"
    #if bucket_name:
    #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

    return out_session_hash, output_folder, out_session_hash
|
|
|
|
|
|
|
243 |
|
244 |
|
245 |
def clean_unicode_text(text):
|