Commit
·
42180e4
1
Parent(s):
dea568f
Fixed issues with log file list picking up logs from other file runs. Updated packages.
Browse files
- app.py +26 -25
- requirements.txt +13 -13
- tools/file_redaction.py +21 -104
- tools/redaction_review.py +2 -2
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import socket
|
3 |
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
-
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
@@ -65,7 +65,8 @@ with app:
|
|
65 |
###
|
66 |
# STATE VARIABLES
|
67 |
###
|
68 |
-
|
|
|
69 |
pdf_doc_state = gr.State([])
|
70 |
all_image_annotations_state = gr.State([])
|
71 |
|
@@ -73,12 +74,12 @@ with app:
|
|
73 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
75 |
|
76 |
-
session_hash_state = gr.State()
|
77 |
-
s3_output_folder_state = gr.State()
|
78 |
|
79 |
-
first_loop_state = gr.State(True)
|
80 |
-
second_loop_state = gr.State(False)
|
81 |
-
do_not_save_pdf_state = gr.State(False)
|
82 |
|
83 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
84 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
|
@@ -86,18 +87,18 @@ with app:
|
|
86 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
87 |
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
88 |
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
89 |
-
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=
|
90 |
|
91 |
|
92 |
# Logging state
|
93 |
log_file_name = 'log.csv'
|
94 |
|
95 |
-
feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
|
96 |
-
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
97 |
-
access_logs_state = gr.State(access_logs_folder + log_file_name)
|
98 |
-
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
99 |
-
usage_logs_state = gr.State(usage_logs_folder + log_file_name)
|
100 |
-
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
101 |
|
102 |
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
103 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
@@ -121,11 +122,11 @@ with app:
|
|
121 |
|
122 |
## Annotator zoom value
|
123 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
124 |
-
zoom_true_bool = gr.State(True)
|
125 |
-
zoom_false_bool = gr.State(False)
|
126 |
|
127 |
-
clear_all_page_redactions = gr.State(True)
|
128 |
-
prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
|
129 |
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
@@ -148,11 +149,11 @@ with app:
|
|
148 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
149 |
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
-
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
# Duplicate page detection
|
154 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
-
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="
|
156 |
|
157 |
|
158 |
|
@@ -178,8 +179,8 @@ with app:
|
|
178 |
with gr.Accordion("Redact document", open = True):
|
179 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
180 |
if RUN_AWS_FUNCTIONS == "1":
|
181 |
-
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
182 |
-
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per
|
183 |
else:
|
184 |
in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
|
185 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
|
@@ -336,7 +337,7 @@ with app:
|
|
336 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
337 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
338 |
|
339 |
-
with gr.Accordion("AWS Textract
|
340 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
341 |
#with gr.Row():
|
342 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
@@ -542,8 +543,8 @@ print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
|
|
542 |
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
543 |
print(f'The value of ROOT_PATH is {ROOT_PATH}')
|
544 |
|
545 |
-
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '
|
546 |
-
print(f'The value of
|
547 |
|
548 |
if __name__ == "__main__":
|
549 |
|
|
|
2 |
import socket
|
3 |
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
+
#os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
|
|
65 |
###
|
66 |
# STATE VARIABLES
|
67 |
###
|
68 |
+
|
69 |
+
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
70 |
pdf_doc_state = gr.State([])
|
71 |
all_image_annotations_state = gr.State([])
|
72 |
|
|
|
74 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
75 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
76 |
|
77 |
+
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
|
78 |
+
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
|
79 |
|
80 |
+
first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
|
81 |
+
second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
|
82 |
+
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
|
83 |
|
84 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
85 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
|
|
|
87 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
88 |
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
89 |
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
90 |
+
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=True) #gr.State([])
|
91 |
|
92 |
|
93 |
# Logging state
|
94 |
log_file_name = 'log.csv'
|
95 |
|
96 |
+
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
|
97 |
+
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
|
98 |
+
access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
|
99 |
+
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
|
100 |
+
usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
|
101 |
+
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
|
102 |
|
103 |
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
104 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
|
|
122 |
|
123 |
## Annotator zoom value
|
124 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
125 |
+
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
|
126 |
+
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
|
127 |
|
128 |
+
clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
|
129 |
+
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
130 |
|
131 |
## Settings page variables
|
132 |
default_allow_list_file_name = "default_allow_list.csv"
|
|
|
149 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
150 |
|
151 |
# Base dataframe for recognisers that is not modified subsequent to load
|
152 |
+
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
|
153 |
|
154 |
# Duplicate page detection
|
155 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
156 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
|
157 |
|
158 |
|
159 |
|
|
|
179 |
with gr.Accordion("Redact document", open = True):
|
180 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
181 |
if RUN_AWS_FUNCTIONS == "1":
|
182 |
+
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
183 |
+
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
184 |
else:
|
185 |
in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
|
186 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
|
|
|
337 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
338 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
339 |
|
340 |
+
with gr.Accordion("AWS Textract options", open = False):
|
341 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
342 |
#with gr.Row():
|
343 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
|
|
543 |
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
544 |
print(f'The value of ROOT_PATH is {ROOT_PATH}')
|
545 |
|
546 |
+
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
|
547 |
+
print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
|
548 |
|
549 |
if __name__ == "__main__":
|
550 |
|
requirements.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1 |
-
pdfminer.six==
|
2 |
pdf2image==1.17.0
|
3 |
-
pymupdf==1.
|
4 |
opencv-python==4.10.0.84
|
5 |
-
presidio_analyzer==2.2.
|
6 |
-
presidio_anonymizer==2.2.
|
7 |
-
presidio-image-redactor==0.0.
|
8 |
-
pikepdf==
|
9 |
pandas==2.2.3
|
10 |
nltk==3.9.1
|
11 |
-
scikit-learn==1.
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
-
gradio==5.
|
16 |
-
boto3==1.36.
|
17 |
-
pyarrow==
|
18 |
-
openpyxl==3.1.
|
19 |
-
Faker==
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
#gradio_image_annotation==0.2.5
|
@@ -24,7 +24,7 @@ spaczz==0.6.1
|
|
24 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
|
25 |
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
-
awslambdaric==3.0.
|
28 |
|
29 |
|
30 |
|
|
|
1 |
+
pdfminer.six==20240706
|
2 |
pdf2image==1.17.0
|
3 |
+
pymupdf==1.25.3
|
4 |
opencv-python==4.10.0.84
|
5 |
+
presidio_analyzer==2.2.357
|
6 |
+
presidio_anonymizer==2.2.357
|
7 |
+
presidio-image-redactor==0.0.55
|
8 |
+
pikepdf==9.5.2
|
9 |
pandas==2.2.3
|
10 |
nltk==3.9.1
|
11 |
+
scikit-learn==1.6.1
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
+
gradio==5.18.0
|
16 |
+
boto3==1.36.26
|
17 |
+
pyarrow==19.0.1
|
18 |
+
openpyxl==3.1.5
|
19 |
+
Faker==36.1.1
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
#gradio_image_annotation==0.2.5
|
|
|
24 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
|
25 |
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
+
awslambdaric==3.0.1
|
28 |
|
29 |
|
30 |
|
tools/file_redaction.py
CHANGED
@@ -78,9 +78,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
-
out_message:
|
82 |
-
out_file_paths:
|
83 |
-
log_files_output_paths:
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
@@ -301,9 +301,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
301 |
file_paths_list = file_paths
|
302 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
303 |
|
304 |
-
# print("file_paths_list in choose_redactor function:", file_paths_list)
|
305 |
-
|
306 |
-
|
307 |
for file in file_paths_loop:
|
308 |
if isinstance(file, str):
|
309 |
file_path = file
|
@@ -313,7 +310,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
313 |
if file_path:
|
314 |
pdf_file_name_without_ext = get_file_name_without_type(file_path)
|
315 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
316 |
-
# print("Redacting file:", pdf_file_name_with_ext)
|
317 |
|
318 |
is_a_pdf = is_pdf(file_path) == True
|
319 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
@@ -361,14 +357,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
361 |
custom_recogniser_word_list,
|
362 |
redact_whole_page_list,
|
363 |
max_fuzzy_spelling_mistakes_num,
|
364 |
-
match_fuzzy_whole_phrase_bool
|
365 |
-
|
366 |
-
|
367 |
-
#print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
368 |
-
|
369 |
# Save Textract request metadata (if exists)
|
370 |
if new_request_metadata:
|
371 |
-
#print("Request metadata:", new_request_metadata)
|
372 |
all_request_metadata.append(new_request_metadata)
|
373 |
|
374 |
elif in_redact_method == text_ocr_option:
|
@@ -422,9 +415,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
422 |
# Save file
|
423 |
if is_pdf(file_path) == False:
|
424 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
425 |
-
#pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
|
426 |
-
#print("pymupdf_doc", pymupdf_doc)
|
427 |
-
#print("pymupdf_doc[0]", pymupdf_doc[0])
|
428 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
429 |
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
430 |
|
@@ -434,10 +424,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
434 |
|
435 |
out_file_paths.append(out_redacted_pdf_file_path)
|
436 |
|
437 |
-
#if log_files_output_paths:
|
438 |
-
# log_files_output_paths.extend(log_files_output_paths)
|
439 |
-
|
440 |
-
|
441 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
442 |
|
443 |
logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
@@ -450,27 +436,20 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
450 |
|
451 |
# Save the gradio_annotation_boxes to a JSON file
|
452 |
try:
|
453 |
-
|
454 |
-
#print("Saving annotations to CSV")
|
455 |
-
|
456 |
-
# Convert json to csv and also save this
|
457 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
458 |
-
#print("all_decision_process_table:", all_decision_process_table)
|
459 |
-
|
460 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
461 |
|
462 |
out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
|
463 |
review_df.to_csv(out_review_file_path, index=None)
|
464 |
out_file_paths.append(out_review_file_path)
|
465 |
|
466 |
-
print("Saved review file to csv")
|
467 |
|
468 |
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
469 |
with open(out_annotation_file_path, 'w') as f:
|
470 |
json.dump(annotations_all_pages, f)
|
471 |
log_files_output_paths.append(out_annotation_file_path)
|
472 |
|
473 |
-
print("Saving annotations to JSON")
|
474 |
|
475 |
except Exception as e:
|
476 |
print("Could not save annotations to json or csv file:", e)
|
@@ -488,7 +467,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
488 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
489 |
|
490 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
491 |
-
#print("Estimated total processing time:", str(estimate_total_processing_time))
|
492 |
|
493 |
else:
|
494 |
toc = time.perf_counter()
|
@@ -511,18 +489,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
511 |
|
512 |
if combined_out_message: out_message = combined_out_message
|
513 |
|
514 |
-
#print("\nout_message at choose_and_run_redactor end is:", out_message)
|
515 |
-
|
516 |
# Ensure no duplicated output files
|
517 |
log_files_output_paths = list(set(log_files_output_paths))
|
518 |
out_file_paths = list(set(out_file_paths))
|
519 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
520 |
|
521 |
-
#print("log_files_output_paths:", log_files_output_paths)
|
522 |
-
#print("out_file_paths:", out_file_paths)
|
523 |
-
#print("review_out_file_paths:", review_out_file_paths)
|
524 |
-
|
525 |
-
|
526 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
527 |
|
528 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
@@ -646,9 +617,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="imag
|
|
646 |
# Unpack coordinates
|
647 |
x1, y1, x2, y2 = rect_coordinates
|
648 |
|
649 |
-
#print("scale_width:", scale_width)
|
650 |
-
#print("scale_height:", scale_height)
|
651 |
-
|
652 |
x1 = (x1* scale_width)# + page_x_adjust
|
653 |
new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
|
654 |
x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
|
@@ -1005,12 +973,10 @@ def redact_image_pdf(file_path:str,
|
|
1005 |
if custom_recogniser_word_list:
|
1006 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1007 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
1008 |
-
#print("new_custom_recogniser:", new_custom_recogniser)
|
1009 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1010 |
|
1011 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
1012 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1013 |
-
#print("new_custom_recogniser:", new_custom_recogniser)
|
1014 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1015 |
|
1016 |
|
@@ -1045,22 +1011,15 @@ def redact_image_pdf(file_path:str,
|
|
1045 |
else: page_min = page_min - 1
|
1046 |
|
1047 |
print("Page range:", str(page_min + 1), "to", str(page_max))
|
1048 |
-
#print("Current_loop_page:", current_loop_page)
|
1049 |
|
1050 |
# If running Textract, check if file already exists. If it does, load in existing data
|
1051 |
-
# Import results from json and convert
|
1052 |
if analysis_type == textract_option:
|
1053 |
|
1054 |
json_file_path = output_folder + file_name + "_textract.json"
|
1055 |
|
1056 |
-
|
1057 |
if not os.path.exists(json_file_path):
|
1058 |
print("No existing Textract results file found.")
|
1059 |
textract_data = {}
|
1060 |
-
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1061 |
-
#log_files_output_paths.append(json_file_path)
|
1062 |
-
#request_metadata = request_metadata + "\n" + new_request_metadata
|
1063 |
-
#wrapped_text_blocks = {"pages":[text_blocks]}
|
1064 |
else:
|
1065 |
# Open the file and load the JSON data
|
1066 |
no_textract_file = False
|
@@ -1073,7 +1032,6 @@ def redact_image_pdf(file_path:str,
|
|
1073 |
textract_data = json.load(json_file)
|
1074 |
|
1075 |
###
|
1076 |
-
|
1077 |
if current_loop_page == 0: page_loop_start = 0
|
1078 |
else: page_loop_start = current_loop_page
|
1079 |
|
@@ -1087,7 +1045,6 @@ def redact_image_pdf(file_path:str,
|
|
1087 |
page_break_return = False
|
1088 |
|
1089 |
reported_page_number = str(page_no + 1)
|
1090 |
-
#print("Redacting page:", reported_page_number)
|
1091 |
|
1092 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1093 |
try:
|
@@ -1104,7 +1061,6 @@ def redact_image_pdf(file_path:str,
|
|
1104 |
|
1105 |
#print("Image is in range of pages to redact")
|
1106 |
if isinstance(image, str):
|
1107 |
-
#print("image is a file path", image)
|
1108 |
image = Image.open(image)
|
1109 |
|
1110 |
# Need image size to convert textract OCR outputs to the correct sizes
|
@@ -1192,13 +1148,13 @@ def redact_image_pdf(file_path:str,
|
|
1192 |
redaction_bboxes = []
|
1193 |
|
1194 |
|
1195 |
-
if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
1196 |
-
elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
1197 |
|
1198 |
-
# Save decision making process
|
1199 |
-
bboxes_str = str(redaction_bboxes)
|
1200 |
-
with open(interim_results_file_path, "w") as f:
|
1201 |
-
|
1202 |
|
1203 |
# Merge close bounding boxes
|
1204 |
merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
|
@@ -1210,7 +1166,6 @@ def redact_image_pdf(file_path:str,
|
|
1210 |
all_image_annotations_boxes = []
|
1211 |
|
1212 |
for box in merged_redaction_bboxes:
|
1213 |
-
#print("box:", box)
|
1214 |
|
1215 |
x0 = box.left
|
1216 |
y0 = box.top
|
@@ -1238,8 +1193,6 @@ def redact_image_pdf(file_path:str,
|
|
1238 |
|
1239 |
## Apply annotations with pymupdf
|
1240 |
else:
|
1241 |
-
#print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1242 |
-
#print("redact_whole_page_list:", redact_whole_page_list)
|
1243 |
if redact_whole_page_list:
|
1244 |
int_reported_page_number = int(reported_page_number)
|
1245 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
@@ -1284,8 +1237,6 @@ def redact_image_pdf(file_path:str,
|
|
1284 |
|
1285 |
time_taken = toc - tic
|
1286 |
|
1287 |
-
#print("toc - tic:", time_taken)
|
1288 |
-
|
1289 |
# Break if time taken is greater than max_time seconds
|
1290 |
if time_taken > max_time:
|
1291 |
print("Processing for", max_time, "seconds, breaking loop.")
|
@@ -1298,7 +1249,6 @@ def redact_image_pdf(file_path:str,
|
|
1298 |
pymupdf_doc = images
|
1299 |
|
1300 |
# Check if the image already exists in annotations_all_pages
|
1301 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
1302 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1303 |
if existing_index is not None:
|
1304 |
# Replace the existing annotation
|
@@ -1315,6 +1265,8 @@ def redact_image_pdf(file_path:str,
|
|
1315 |
if json_file_path not in log_files_output_paths:
|
1316 |
log_files_output_paths.append(json_file_path)
|
1317 |
|
|
|
|
|
1318 |
current_loop_page += 1
|
1319 |
|
1320 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
@@ -1324,7 +1276,6 @@ def redact_image_pdf(file_path:str,
|
|
1324 |
pymupdf_doc = images
|
1325 |
|
1326 |
# Check if the image already exists in annotations_all_pages
|
1327 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
1328 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1329 |
if existing_index is not None:
|
1330 |
# Replace the existing annotation
|
@@ -1409,9 +1360,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1409 |
|
1410 |
if isinstance(char, LTAnno):
|
1411 |
|
1412 |
-
# print("Character line:", "".join(character_text_objects_out))
|
1413 |
-
# print("Char is an annotation object:", char)
|
1414 |
-
|
1415 |
added_text = char.get_text()
|
1416 |
|
1417 |
# Handle double quotes
|
@@ -1427,7 +1375,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1427 |
|
1428 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1429 |
if '\n' in added_text:
|
1430 |
-
|
1431 |
# Finalize the current line
|
1432 |
if current_word:
|
1433 |
word_bboxes.append((current_word, current_word_bbox))
|
@@ -1475,13 +1423,12 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1475 |
word_bboxes.append((current_word, current_word_bbox))
|
1476 |
|
1477 |
if full_text:
|
1478 |
-
#print("full_text before:", full_text)
|
1479 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1480 |
# Convert special characters to a human-readable format
|
1481 |
-
|
1482 |
full_text = clean_unicode_text(full_text)
|
1483 |
full_text = full_text.strip()
|
1484 |
-
|
1485 |
|
1486 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
|
1487 |
|
@@ -1498,9 +1445,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1498 |
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
|
1499 |
|
1500 |
# Remove brackets and split the string into four separate columns
|
1501 |
-
#print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
|
1502 |
-
# analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
|
1503 |
-
|
1504 |
# Split the boundingBox list into four separate columns
|
1505 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1506 |
|
@@ -1512,8 +1456,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1512 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1513 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1514 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1515 |
-
|
1516 |
-
#print('\n\ndecision_process_table:\n\n', decision_process_table)
|
1517 |
|
1518 |
return decision_process_table
|
1519 |
|
@@ -1607,7 +1549,6 @@ def redact_text_pdf(
|
|
1607 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1608 |
|
1609 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1610 |
-
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1611 |
if custom_recogniser_word_list:
|
1612 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1613 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
@@ -1617,16 +1558,6 @@ def redact_text_pdf(
|
|
1617 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1618 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1619 |
|
1620 |
-
# List all elements currently in the nlp_analyser registry
|
1621 |
-
#print("Current recognizers in nlp_analyser registry:")
|
1622 |
-
#for recognizer_name in nlp_analyser.registry.recognizers:
|
1623 |
-
#print(recognizer_name)
|
1624 |
-
#print(recognizer_name.name)
|
1625 |
-
|
1626 |
-
#print("Custom recogniser:", nlp_analyser.registry)
|
1627 |
-
|
1628 |
-
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1629 |
-
|
1630 |
tic = time.perf_counter()
|
1631 |
|
1632 |
# Open with Pikepdf to get text lines
|
@@ -1641,7 +1572,6 @@ def redact_text_pdf(
|
|
1641 |
else: page_min = page_min - 1
|
1642 |
|
1643 |
print("Page range is",str(page_min + 1), "to", str(page_max))
|
1644 |
-
print("Current_loop_page:", current_loop_page)
|
1645 |
|
1646 |
if current_loop_page == 0: page_loop_start = 0
|
1647 |
else: page_loop_start = current_loop_page
|
@@ -1716,8 +1646,6 @@ def redact_text_pdf(
|
|
1716 |
### REDACTION
|
1717 |
|
1718 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1719 |
-
#print("Identifying redactions on page.")
|
1720 |
-
|
1721 |
page_analysed_bounding_boxes = run_page_text_redaction(
|
1722 |
language,
|
1723 |
chosen_redact_entities,
|
@@ -1735,24 +1663,18 @@ def redact_text_pdf(
|
|
1735 |
comprehend_query_number
|
1736 |
)
|
1737 |
|
1738 |
-
|
1739 |
-
#print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
|
1740 |
-
#print("image:", image)
|
1741 |
else:
|
1742 |
page_analysed_bounding_boxes = []
|
1743 |
|
1744 |
|
1745 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1746 |
|
1747 |
-
#print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
|
1748 |
|
1749 |
# Annotate redactions on page
|
1750 |
pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
1751 |
|
1752 |
-
# print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
|
1753 |
-
|
1754 |
# Make pymupdf page redactions
|
1755 |
-
#print("redact_whole_page_list:", redact_whole_page_list)
|
1756 |
if redact_whole_page_list:
|
1757 |
int_reported_page_number = int(reported_page_number)
|
1758 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
@@ -1761,9 +1683,6 @@ def redact_text_pdf(
|
|
1761 |
|
1762 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
|
1763 |
|
1764 |
-
#print("image_annotations:", image_annotations)
|
1765 |
-
|
1766 |
-
#print("Did redact_page_with_pymupdf function")
|
1767 |
reported_page_no = page_no + 1
|
1768 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1769 |
|
@@ -1778,14 +1697,12 @@ def redact_text_pdf(
|
|
1778 |
|
1779 |
if not decision_process_table_on_page.empty:
|
1780 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
|
1781 |
-
|
1782 |
|
1783 |
toc = time.perf_counter()
|
1784 |
|
1785 |
time_taken = toc - tic
|
1786 |
|
1787 |
-
#print("toc - tic:", time_taken)
|
1788 |
-
|
1789 |
# Break if time taken is greater than max_time seconds
|
1790 |
if time_taken > max_time:
|
1791 |
print("Processing for", max_time, "seconds, breaking.")
|
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
+
out_message:List=[],
|
82 |
+
out_file_paths:List=[],
|
83 |
+
log_files_output_paths:List=[],
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
|
|
301 |
file_paths_list = file_paths
|
302 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
303 |
|
|
|
|
|
|
|
304 |
for file in file_paths_loop:
|
305 |
if isinstance(file, str):
|
306 |
file_path = file
|
|
|
310 |
if file_path:
|
311 |
pdf_file_name_without_ext = get_file_name_without_type(file_path)
|
312 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
|
|
313 |
|
314 |
is_a_pdf = is_pdf(file_path) == True
|
315 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
|
|
357 |
custom_recogniser_word_list,
|
358 |
redact_whole_page_list,
|
359 |
max_fuzzy_spelling_mistakes_num,
|
360 |
+
match_fuzzy_whole_phrase_bool,
|
361 |
+
log_files_output_paths=log_files_output_paths)
|
362 |
+
|
|
|
|
|
363 |
# Save Textract request metadata (if exists)
|
364 |
if new_request_metadata:
|
|
|
365 |
all_request_metadata.append(new_request_metadata)
|
366 |
|
367 |
elif in_redact_method == text_ocr_option:
|
|
|
415 |
# Save file
|
416 |
if is_pdf(file_path) == False:
|
417 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
|
|
|
|
|
|
418 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
419 |
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
420 |
|
|
|
424 |
|
425 |
out_file_paths.append(out_redacted_pdf_file_path)
|
426 |
|
|
|
|
|
|
|
|
|
427 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
428 |
|
429 |
logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
|
|
436 |
|
437 |
# Save the gradio_annotation_boxes to a JSON file
|
438 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
440 |
|
441 |
out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
|
442 |
review_df.to_csv(out_review_file_path, index=None)
|
443 |
out_file_paths.append(out_review_file_path)
|
444 |
|
445 |
+
#print("Saved review file to csv")
|
446 |
|
447 |
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
448 |
with open(out_annotation_file_path, 'w') as f:
|
449 |
json.dump(annotations_all_pages, f)
|
450 |
log_files_output_paths.append(out_annotation_file_path)
|
451 |
|
452 |
+
#print("Saving annotations to JSON")
|
453 |
|
454 |
except Exception as e:
|
455 |
print("Could not save annotations to json or csv file:", e)
|
|
|
467 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
468 |
|
469 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
|
|
470 |
|
471 |
else:
|
472 |
toc = time.perf_counter()
|
|
|
489 |
|
490 |
if combined_out_message: out_message = combined_out_message
|
491 |
|
|
|
|
|
492 |
# Ensure no duplicated output files
|
493 |
log_files_output_paths = list(set(log_files_output_paths))
|
494 |
out_file_paths = list(set(out_file_paths))
|
495 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
496 |
|
|
|
|
|
|
|
|
|
|
|
497 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
498 |
|
499 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
|
|
617 |
# Unpack coordinates
|
618 |
x1, y1, x2, y2 = rect_coordinates
|
619 |
|
|
|
|
|
|
|
620 |
x1 = (x1* scale_width)# + page_x_adjust
|
621 |
new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
|
622 |
x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
|
|
|
973 |
if custom_recogniser_word_list:
|
974 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
975 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
976 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
977 |
|
978 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
979 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
|
|
980 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
981 |
|
982 |
|
|
|
1011 |
else: page_min = page_min - 1
|
1012 |
|
1013 |
print("Page range:", str(page_min + 1), "to", str(page_max))
|
|
|
1014 |
|
1015 |
# If running Textract, check if file already exists. If it does, load in existing data
|
|
|
1016 |
if analysis_type == textract_option:
|
1017 |
|
1018 |
json_file_path = output_folder + file_name + "_textract.json"
|
1019 |
|
|
|
1020 |
if not os.path.exists(json_file_path):
|
1021 |
print("No existing Textract results file found.")
|
1022 |
textract_data = {}
|
|
|
|
|
|
|
|
|
1023 |
else:
|
1024 |
# Open the file and load the JSON data
|
1025 |
no_textract_file = False
|
|
|
1032 |
textract_data = json.load(json_file)
|
1033 |
|
1034 |
###
|
|
|
1035 |
if current_loop_page == 0: page_loop_start = 0
|
1036 |
else: page_loop_start = current_loop_page
|
1037 |
|
|
|
1045 |
page_break_return = False
|
1046 |
|
1047 |
reported_page_number = str(page_no + 1)
|
|
|
1048 |
|
1049 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1050 |
try:
|
|
|
1061 |
|
1062 |
#print("Image is in range of pages to redact")
|
1063 |
if isinstance(image, str):
|
|
|
1064 |
image = Image.open(image)
|
1065 |
|
1066 |
# Need image size to convert textract OCR outputs to the correct sizes
|
|
|
1148 |
redaction_bboxes = []
|
1149 |
|
1150 |
|
1151 |
+
# if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
1152 |
+
# elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
1153 |
|
1154 |
+
# # Save decision making process
|
1155 |
+
# bboxes_str = str(redaction_bboxes)
|
1156 |
+
# with open(interim_results_file_path, "w") as f:
|
1157 |
+
# f.write(bboxes_str)
|
1158 |
|
1159 |
# Merge close bounding boxes
|
1160 |
merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
|
|
|
1166 |
all_image_annotations_boxes = []
|
1167 |
|
1168 |
for box in merged_redaction_bboxes:
|
|
|
1169 |
|
1170 |
x0 = box.left
|
1171 |
y0 = box.top
|
|
|
1193 |
|
1194 |
## Apply annotations with pymupdf
|
1195 |
else:
|
|
|
|
|
1196 |
if redact_whole_page_list:
|
1197 |
int_reported_page_number = int(reported_page_number)
|
1198 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
|
|
1237 |
|
1238 |
time_taken = toc - tic
|
1239 |
|
|
|
|
|
1240 |
# Break if time taken is greater than max_time seconds
|
1241 |
if time_taken > max_time:
|
1242 |
print("Processing for", max_time, "seconds, breaking loop.")
|
|
|
1249 |
pymupdf_doc = images
|
1250 |
|
1251 |
# Check if the image already exists in annotations_all_pages
|
|
|
1252 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1253 |
if existing_index is not None:
|
1254 |
# Replace the existing annotation
|
|
|
1265 |
if json_file_path not in log_files_output_paths:
|
1266 |
log_files_output_paths.append(json_file_path)
|
1267 |
|
1268 |
+
print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
|
1269 |
+
|
1270 |
current_loop_page += 1
|
1271 |
|
1272 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
|
|
1276 |
pymupdf_doc = images
|
1277 |
|
1278 |
# Check if the image already exists in annotations_all_pages
|
|
|
1279 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1280 |
if existing_index is not None:
|
1281 |
# Replace the existing annotation
|
|
|
1360 |
|
1361 |
if isinstance(char, LTAnno):
|
1362 |
|
|
|
|
|
|
|
1363 |
added_text = char.get_text()
|
1364 |
|
1365 |
# Handle double quotes
|
|
|
1375 |
|
1376 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1377 |
if '\n' in added_text:
|
1378 |
+
|
1379 |
# Finalize the current line
|
1380 |
if current_word:
|
1381 |
word_bboxes.append((current_word, current_word_bbox))
|
|
|
1423 |
word_bboxes.append((current_word, current_word_bbox))
|
1424 |
|
1425 |
if full_text:
|
|
|
1426 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1427 |
# Convert special characters to a human-readable format
|
1428 |
+
|
1429 |
full_text = clean_unicode_text(full_text)
|
1430 |
full_text = full_text.strip()
|
1431 |
+
|
1432 |
|
1433 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
|
1434 |
|
|
|
1445 |
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
|
1446 |
|
1447 |
# Remove brackets and split the string into four separate columns
|
|
|
|
|
|
|
1448 |
# Split the boundingBox list into four separate columns
|
1449 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1450 |
|
|
|
1456 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1457 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1458 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
|
|
|
|
1459 |
|
1460 |
return decision_process_table
|
1461 |
|
|
|
1549 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1550 |
|
1551 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
|
|
1552 |
if custom_recogniser_word_list:
|
1553 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1554 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
1558 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1559 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1560 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1561 |
tic = time.perf_counter()
|
1562 |
|
1563 |
# Open with Pikepdf to get text lines
|
|
|
1572 |
else: page_min = page_min - 1
|
1573 |
|
1574 |
print("Page range is",str(page_min + 1), "to", str(page_max))
|
|
|
1575 |
|
1576 |
if current_loop_page == 0: page_loop_start = 0
|
1577 |
else: page_loop_start = current_loop_page
|
|
|
1646 |
### REDACTION
|
1647 |
|
1648 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
|
|
|
|
1649 |
page_analysed_bounding_boxes = run_page_text_redaction(
|
1650 |
language,
|
1651 |
chosen_redact_entities,
|
|
|
1663 |
comprehend_query_number
|
1664 |
)
|
1665 |
|
1666 |
+
|
|
|
|
|
1667 |
else:
|
1668 |
page_analysed_bounding_boxes = []
|
1669 |
|
1670 |
|
1671 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1672 |
|
|
|
1673 |
|
1674 |
# Annotate redactions on page
|
1675 |
pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
1676 |
|
|
|
|
|
1677 |
# Make pymupdf page redactions
|
|
|
1678 |
if redact_whole_page_list:
|
1679 |
int_reported_page_number = int(reported_page_number)
|
1680 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
|
|
1683 |
|
1684 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
|
1685 |
|
|
|
|
|
|
|
1686 |
reported_page_no = page_no + 1
|
1687 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1688 |
|
|
|
1697 |
|
1698 |
if not decision_process_table_on_page.empty:
|
1699 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
|
1700 |
+
|
1701 |
|
1702 |
toc = time.perf_counter()
|
1703 |
|
1704 |
time_taken = toc - tic
|
1705 |
|
|
|
|
|
1706 |
# Break if time taken is greater than max_time seconds
|
1707 |
if time_taken > max_time:
|
1708 |
print("Processing for", max_time, "seconds, breaking.")
|
tools/redaction_review.py
CHANGED
@@ -396,7 +396,7 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
396 |
row_value_page = evt.row_value[0] # This is the page number value
|
397 |
return row_value_page
|
398 |
|
399 |
-
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
|
400 |
'''
|
401 |
Converts coordinates from image space to Adobe PDF space.
|
402 |
|
@@ -431,7 +431,7 @@ def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width,
|
|
431 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
432 |
|
433 |
|
434 |
-
def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
435 |
'''
|
436 |
Create an xfdf file from a review csv file and a pdf
|
437 |
'''
|
|
|
396 |
row_value_page = evt.row_value[0] # This is the page number value
|
397 |
return row_value_page
|
398 |
|
399 |
+
def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
|
400 |
'''
|
401 |
Converts coordinates from image space to Adobe PDF space.
|
402 |
|
|
|
431 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
432 |
|
433 |
|
434 |
+
def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
|
435 |
'''
|
436 |
Create an xfdf file from a review csv file and a pdf
|
437 |
'''
|