seanpedrickcase committed
Commit e5dfae7 (1 parent: e2aae24)

Added an option for running the redact function through the CLI (i.e. without going through the Gradio UI or API). Added test functions for running this through AWS Lambda.
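A minimal sketch of how the mode switch added in this commit is intended to be flipped. It only illustrates selecting between the Gradio UI and direct mode via the RUN_DIRECT_MODE variable introduced in the diff below; the CLI arguments themselves are defined in tools/cli_redact.py further down, so treat this as illustrative rather than a supported invocation.

import os
import subprocess

# Default behaviour: launch the Gradio UI, as before
subprocess.run(["python", "app.py"], check=True)

# Direct mode: skip Gradio and run the redaction pipeline from tools/cli_redact.py
env = dict(os.environ, RUN_DIRECT_MODE="1")
subprocess.run(["python", "app.py"], env=env, check=True)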

Dockerfile CHANGED
@@ -14,6 +14,9 @@ RUN pip install --no-cache-dir --target=/install -r requirements.txt
 
 RUN rm requirements.txt
 
+# Add lambda_entrypoint.py to the container
+COPY lambda_entrypoint.py .
+
 # Stage 2: Final runtime image
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 
@@ -62,4 +65,7 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-CMD ["python", "app.py"]
+# Keep the default entrypoint flexible
+ENTRYPOINT ["python", "-u", "entrypoint_router.py"]
+
+#CMD ["python", "app.py"]
app.py CHANGED
@@ -364,7 +364,7 @@ with app:
 
     # If running on AWS, load in the default allow list file from S3
    if RUN_AWS_FUNCTIONS == "1":
-        print("default_allow_list_output_folder_location:", default_allow_list_output_folder_location)
+        print("default_allow_list_output_folder_location:", default_allow_list_loc)
        if not os.path.exists(default_allow_list_loc):
            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
@@ -399,11 +399,26 @@ with app:
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
+RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
+print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
+
 if __name__ == "__main__":
-    if os.environ['COGNITO_AUTH'] == "1":
-        app.queue(max_size=5).launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
+
+    if RUN_DIRECT_MODE == "0":
+        max_queue_size = 5
+        max_file_size = '100mb'
+
+        if os.environ['COGNITO_AUTH'] == "1":
+            app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
+        else:
+            app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)
+
     else:
-        app.queue(max_size=5).launch(show_error=True, inbrowser=True, max_file_size='100mb')
+        from tools.cli_redact import main
+
+        main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
+             log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
+             current_loop_page=0, page_break=False, pdf_doc_state=[], all_image_annotations=[], all_line_level_ocr_results=pd.DataFrame(), all_decision_process_table=pd.DataFrame(), chosen_comprehend_entities=chosen_comprehend_entities, chosen_redact_entities=chosen_redact_entities, handwrite_signature_checkbox=["Redact all identified handwriting", "Redact all identified signatures"])
 
 
 # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
entrypoint_router.py ADDED
@@ -0,0 +1,23 @@
+import os
+import subprocess
+
+if __name__ == "__main__":
+    run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")
+
+    if run_direct_mode == "1":
+        # Lambda execution or CLI invocation (Direct Mode)
+        from lambda_entrypoint import lambda_handler
+
+        # Simulate the Lambda event and context for local testing
+        event = os.getenv("LAMBDA_TEST_EVENT", '{}')
+        context = None  # Add mock context if needed
+        response = lambda_handler(eval(event), context)
+        print(response)
+    else:
+        # Gradio App execution
+        from app import app  # Replace with actual import if needed
+
+        if os.getenv("COGNITO_AUTH", "0") == "1":
+            app.queue(max_size=app.max_queue_size).launch(show_error=True, auth=app.authenticate_user, max_file_size=app.max_file_size)
+        else:
+            app.queue(max_size=app.max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=app.max_file_size)
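A minimal sketch of driving the router's direct mode locally, assuming the RUN_DIRECT_MODE and LAMBDA_TEST_EVENT variables read above. The bucket and object key are placeholders, and real AWS credentials and objects would be needed for the downstream S3 calls in lambda_entrypoint.py to succeed.

import os
import subprocess

# Fake S3 event in the shape lambda_entrypoint.lambda_handler expects (placeholder names)
test_event = ('{"Records": [{"s3": {"bucket": {"name": "example-bucket"}, '
              '"object": {"key": "uploads/example.pdf"}}}]}')

env = dict(os.environ, RUN_DIRECT_MODE="1", LAMBDA_TEST_EVENT=test_event)
subprocess.run(["python", "-u", "entrypoint_router.py"], env=env, check=True)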
lambda_entrypoint.py ADDED
@@ -0,0 +1,66 @@
+import boto3
+import os
+import subprocess
+from urllib.parse import unquote_plus
+
+s3_client = boto3.client("s3")
+
+def download_file_from_s3(bucket_name, key, download_path):
+    """Download a file from S3 to the local filesystem."""
+    s3_client.download_file(bucket_name, key, download_path)
+    print(f"Downloaded {key} to {download_path}")
+
+def upload_file_to_s3(file_path, bucket_name, key):
+    """Upload a file to S3."""
+    s3_client.upload_file(file_path, bucket_name, key)
+    print(f"Uploaded {file_path} to {key}")
+
+def lambda_handler(event, context):
+    """Main Lambda function handler."""
+    # Parse the S3 event
+    for record in event["Records"]:
+        bucket_name = record["s3"]["bucket"]["name"]
+        input_key = unquote_plus(record["s3"]["object"]["key"])
+        print(f"Processing file {input_key} from bucket {bucket_name}")
+
+        # Prepare paths
+        input_file_path = f"/tmp/{os.path.basename(input_key)}"
+        allow_list_path = f"/tmp/allow_list.csv"  # Adjust this as needed
+        output_dir = "/tmp/output"
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Download input file
+        download_file_from_s3(bucket_name, input_key, input_file_path)
+
+        # (Optional) Download allow_list if needed
+        allow_list_key = "path/to/allow_list.csv"  # Adjust path as required
+        download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
+
+        # Construct and run the command
+        command = [
+            "python",
+            "app.py",
+            "--input_file", input_file_path,
+            "--ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
+            "--pii_detector", "AWS Comprehend",
+            "--page_min", "0",
+            "--page_max", "0",
+            "--allow_list", allow_list_path,
+            "--output_dir", output_dir,
+        ]
+
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=True)
+            print("Processing succeeded:", result.stdout)
+        except subprocess.CalledProcessError as e:
+            print("Error during processing:", e.stderr)
+            raise e
+
+        # Upload output files back to S3
+        for root, _, files in os.walk(output_dir):
+            for file_name in files:
+                local_file_path = os.path.join(root, file_name)
+                output_key = f"{os.path.dirname(input_key)}/output/{file_name}"
+                upload_file_to_s3(local_file_path, bucket_name, output_key)
+
+    return {"statusCode": 200, "body": "Processing complete."}
tools/aws_functions.py CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
tools/cli_redact.py ADDED
@@ -0,0 +1,83 @@
+import argparse
+import os
+from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
+from tools.file_redaction import choose_and_run_redactor
+import pandas as pd
+from datetime import datetime
+
+chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
+    'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
+    'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
+    'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
+    'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
+    'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
+chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
+    "STREETNAME", "UKPOSTCODE"]
+
+def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
+         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
+         current_loop_page=0, page_break=False, pdf_doc_state=[], all_image_annotations=[], all_line_level_ocr_results=pd.DataFrame(), all_decision_process_table=pd.DataFrame(), chosen_comprehend_entities=chosen_comprehend_entities, chosen_redact_entities=chosen_redact_entities, handwrite_signature_checkbox=["Redact all identified handwriting", "Redact all identified signatures"]):
+
+    if output_file_list is None:
+        output_file_list = []
+    if log_files_list is None:
+        log_files_list = []
+
+    parser = argparse.ArgumentParser(description='Redact PII from documents via command line')
+
+    # Required arguments
+    parser.add_argument('--input_file', help='Path to input file (PDF, JPG, or PNG)')
+
+    # Optional arguments with defaults matching the GUI app
+    parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
+                        default='Quick image analysis', help='OCR method to use')
+    parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
+                        default='Local', help='PII detection method')
+    parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
+    parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
+    parser.add_argument('--allow_list', help='Path to allow list CSV file')
+    parser.add_argument('--output_dir', default='output', help='Output directory')
+
+    args = parser.parse_args()
+
+    # Ensure output directory exists
+    ensure_output_folder_exists()
+
+    # Create file object similar to what Gradio provides
+    file_obj = {"name": args.input_file}
+
+    # Load allow list if provided
+    allow_list_df = pd.DataFrame()
+    if args.allow_list:
+        allow_list_df = pd.read_csv(args.allow_list)
+
+    # Get file names
+    file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
+
+    # Initialize empty states for PDF processing
+
+    # Prepare PDF/image
+    output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations = prepare_image_or_pdf(
+        file_obj, args.ocr_method, allow_list_df, latest_file_completed,
+        output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
+    )
+
+    output_summary, output_files, output_file_list, latest_file_completed, log_files, \
+    log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations, \
+    current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table, \
+    comprehend_query_num = choose_and_run_redactor(
+        file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
+        chosen_comprehend_entities, args.ocr_method, allow_list_df,
+        latest_file_completed, output_summary, output_file_list, log_files_list,
+        first_loop_state, args.page_min, args.page_max, estimated_time,
+        handwrite_signature_checkbox, textract_metadata, all_image_annotations,
+        all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
+        current_loop_page, page_break, args.pii_detector, comprehend_query_num
+    )
+
+    print(f"\nRedaction complete. Output summary:\n{output_summary}")
+    print(f"\nOutput files saved to: {args.output_dir}")
+
+if __name__ == "__main__":
+    main()
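A minimal sketch of calling this entry point programmatically by faking argv, assuming the repository root is on the import path. The PDF path is a placeholder; the OCR and PII options fall back to the argparse defaults declared above.

import sys
from tools.cli_redact import main

# Placeholder document; other options use the defaults defined in the parser above
sys.argv = ["cli_redact", "--input_file", "example.pdf", "--output_dir", "output"]
main()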
tools/file_conversion.py CHANGED
@@ -9,6 +9,7 @@ import gradio as gr
 import time
 import json
 import pymupdf
+from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
 
@@ -47,6 +48,8 @@ def is_pdf(filename):
 
 def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
+    print("pdf_path in convert_pdf_to_images:", pdf_path)
+
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
     print("Number of pages in PDF: ", str(page_count))
@@ -55,7 +58,9 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag
 
     # Open the PDF file
     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
-    for page_num in progress.tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+    for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+
+        print("page_num in convert_pdf_to_images:", page_num)
 
         print("Converting page: ", str(page_num + 1))
 
@@ -98,7 +103,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag
     return images
 
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
-def process_file(file_path):
+def process_file(file_path:str):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()
 
@@ -130,7 +135,9 @@ def get_input_file_names(file_input):
     file_name_with_extension = ""
     full_file_name = ""
 
-    #print("file_input:", file_input)
+    print("file_input in input file names:", file_input)
+    if isinstance(file_input, dict):
+        file_input = os.path.abspath(file_input["name"])
 
     if isinstance(file_input, str):
         file_input_list = [file_input]
@@ -225,6 +232,9 @@ def prepare_image_or_pdf(
     if not file_paths:
         file_paths = []
 
+    if isinstance(file_paths, dict):
+        file_paths = os.path.abspath(file_paths["name"])
+
     if isinstance(file_paths, str):
         file_path_number = 1
     else:
@@ -277,8 +287,9 @@
 
         file_extension = os.path.splitext(file_path)[1].lower()
 
-        # Check if the file is an image type
-        if file_extension in ['.jpg', '.jpeg', '.png']:
+
+        # Check if the file is an image type and the user selected text ocr option
+        if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
 
 
@@ -333,6 +344,9 @@
            json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
            continue
 
+
+        print("in_redact_method:", in_redact_method)
+
        # Convert pdf/image file to correct format for redaction
        if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
            if is_pdf_or_image(file_path) == False:
@@ -340,6 +354,9 @@
                print(out_message)
                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
+            print("In correct preparation area.")
+
+            print("file_path at process_file:", file_path)
            converted_file_path = process_file(file_path)
            image_file_path = converted_file_path
 
tools/file_redaction.py CHANGED
@@ -180,8 +180,12 @@ def choose_and_run_redactor(file_paths:List[str],
        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
    # Create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+
    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list[0].tolist()
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
        print("In allow list:", in_allow_list_flat)
    else:
        in_allow_list_flat = []
@@ -215,12 +219,18 @@ def choose_and_run_redactor(file_paths:List[str],
    progress(0.5, desc="Redacting file")
 
    if isinstance(file_paths, str):
-        file_paths_list = [file_paths]
+        file_paths_list = [os.path.abspath(file_paths)]
+        file_paths_loop = file_paths_list
+    elif isinstance(file_paths, dict):
+        file_paths = file_paths["name"]
+        file_paths_list = [os.path.abspath(file_paths)]
        file_paths_loop = file_paths_list
    else:
        file_paths_list = file_paths
        file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 
+    print("file_paths_list in choose_redactor function:", file_paths_list)
+
 
    for file in file_paths_loop:
        if isinstance(file, str):
tools/redaction_review.py CHANGED
@@ -72,7 +72,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
 
        return out_image_annotator, number_reported, number_reported
 
-    print("page_num at start of update_annotator function:", page_num)
+    #print("page_num at start of update_annotator function:", page_num)
 
    if page_num is None:
        page_num = 0