seanpedrickcase committed
Commit e5dfae7 • 1 parent: e2aae24
Added an option for running the redact function through the CLI (i.e. without going through the Gradio UI or API), plus test functions for running this through AWS Lambda.
Browse files
- Dockerfile +7 -1
- app.py +19 -4
- entrypoint_router.py +23 -0
- lambda_entrypoint.py +66 -0
- tools/aws_functions.py +1 -1
- tools/cli_redact.py +83 -0
- tools/file_conversion.py +22 -5
- tools/file_redaction.py +12 -2
- tools/redaction_review.py +1 -1
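
The commit gates the new direct (non-Gradio) execution path behind the RUN_DIRECT_MODE environment variable, with the command-line flags defined in tools/cli_redact.py. The sketch below shows one way this might be exercised locally; it is an assumption rather than part of the commit, and example.pdf is a placeholder input.

# Hypothetical local run of the new direct mode, from the repository root.
# Only --input_file and --output_dir are passed; the other flags fall back to
# the defaults declared in tools/cli_redact.py.
import os
import subprocess

env = dict(os.environ, RUN_DIRECT_MODE="1")  # "0" would launch the Gradio UI instead

subprocess.run(
    ["python", "app.py",
     "--input_file", "example.pdf",
     "--output_dir", "output"],
    env=env,
    check=True,
)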
Dockerfile (CHANGED)

@@ -14,6 +14,9 @@ RUN pip install --no-cache-dir --target=/install -r requirements.txt
 
 RUN rm requirements.txt
 
+# Add lambda_entrypoint.py to the container
+COPY lambda_entrypoint.py .
+
 # Stage 2: Final runtime image
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 

@@ -62,4 +65,7 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-
+# Keep the default entrypoint as flexible
+ENTRYPOINT ["python", "-u", "entrypoint_router.py"]
+
+#CMD ["python", "app.py"]
app.py (CHANGED)

@@ -364,7 +364,7 @@ with app:
 
     # If running on AWS, load in the default allow list file from S3
     if RUN_AWS_FUNCTIONS == "1":
-        print("default_allow_list_output_folder_location:",
+        print("default_allow_list_output_folder_location:", default_allow_list_loc)
        if not os.path.exists(default_allow_list_loc):
            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])

@@ -399,11 +399,26 @@
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
+RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
+print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
+
 if __name__ == "__main__":
-
-
+
+    if RUN_DIRECT_MODE == "0":
+        max_queue_size = 5
+        max_file_size = '100mb'
+
+        if os.environ['COGNITO_AUTH'] == "1":
+            app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
+        else:
+            app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)
+
    else:
-
+        from tools.cli_redact import main
+
+        main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
+             log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
+             current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
 
 
 # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
entrypoint_router.py (ADDED)

@@ -0,0 +1,23 @@
+import os
+import subprocess
+
+if __name__ == "__main__":
+    run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")
+
+    if run_direct_mode == "1":
+        # Lambda execution or CLI invocation (Direct Mode)
+        from lambda_entrypoint import lambda_handler
+
+        # Simulate the Lambda event and context for local testing
+        event = os.getenv("LAMBDA_TEST_EVENT", '{}')
+        context = None  # Add mock context if needed
+        response = lambda_handler(eval(event), context)
+        print(response)
+    else:
+        # Gradio App execution
+        from app import app  # Replace with actual import if needed
+
+        if os.getenv("COGNITO_AUTH", "0") == "1":
+            app.queue(max_size=app.max_queue_size).launch(show_error=True, auth=app.authenticate_user, max_file_size=app.max_file_size)
+        else:
+            app.queue(max_size=app.max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=app.max_file_size)
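
When testing the router's direct mode locally, LAMBDA_TEST_EVENT has to hold an S3-style event shaped the way lambda_handler reads it (Records, then s3.bucket.name and s3.object.key). A minimal sketch follows, with placeholder bucket and key names; the router then eval()s the string and passes the result to lambda_handler.

# Hypothetical LAMBDA_TEST_EVENT payload for a local run of entrypoint_router.py.
import json
import os

test_event = {
    "Records": [
        {"s3": {"bucket": {"name": "my-bucket"},           # placeholder bucket
                "object": {"key": "input/example.pdf"}}}   # placeholder key
    ]
}

os.environ["RUN_DIRECT_MODE"] = "1"
os.environ["LAMBDA_TEST_EVENT"] = json.dumps(test_event)  # read back by the router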
lambda_entrypoint.py (ADDED)

@@ -0,0 +1,66 @@
+import boto3
+import os
+import subprocess
+from urllib.parse import unquote_plus
+
+s3_client = boto3.client("s3")
+
+def download_file_from_s3(bucket_name, key, download_path):
+    """Download a file from S3 to the local filesystem."""
+    s3_client.download_file(bucket_name, key, download_path)
+    print(f"Downloaded {key} to {download_path}")
+
+def upload_file_to_s3(file_path, bucket_name, key):
+    """Upload a file to S3."""
+    s3_client.upload_file(file_path, bucket_name, key)
+    print(f"Uploaded {file_path} to {key}")
+
+def lambda_handler(event, context):
+    """Main Lambda function handler."""
+    # Parse the S3 event
+    for record in event["Records"]:
+        bucket_name = record["s3"]["bucket"]["name"]
+        input_key = unquote_plus(record["s3"]["object"]["key"])
+        print(f"Processing file {input_key} from bucket {bucket_name}")
+
+        # Prepare paths
+        input_file_path = f"/tmp/{os.path.basename(input_key)}"
+        allow_list_path = f"/tmp/allow_list.csv"  # Adjust this as needed
+        output_dir = "/tmp/output"
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Download input file
+        download_file_from_s3(bucket_name, input_key, input_file_path)
+
+        # (Optional) Download allow_list if needed
+        allow_list_key = "path/to/allow_list.csv"  # Adjust path as required
+        download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
+
+        # Construct and run the command
+        command = [
+            "python",
+            "app.py",
+            "--input_file", input_file_path,
+            "--ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
+            "--pii_detector", "AWS Comprehend",
+            "--page_min", "0",
+            "--page_max", "0",
+            "--allow_list", allow_list_path,
+            "--output_dir", output_dir,
+        ]
+
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=True)
+            print("Processing succeeded:", result.stdout)
+        except subprocess.CalledProcessError as e:
+            print("Error during processing:", e.stderr)
+            raise e
+
+        # Upload output files back to S3
+        for root, _, files in os.walk(output_dir):
+            for file_name in files:
+                local_file_path = os.path.join(root, file_name)
+                output_key = f"{os.path.dirname(input_key)}/output/{file_name}"
+                upload_file_to_s3(local_file_path, bucket_name, output_key)
+
+    return {"statusCode": 200, "body": "Processing complete."}
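
Redacted outputs are uploaded back to the source bucket under an output/ prefix next to the input key. The snippet below only illustrates the key pattern used in lambda_handler; the input key and file name are placeholders.

# Where an output file lands, given the output_key pattern above (placeholder values).
import os

input_key = "documents/2024/report.pdf"
file_name = "report_redacted.pdf"
output_key = f"{os.path.dirname(input_key)}/output/{file_name}"
print(output_key)  # documents/2024/output/report_redacted.pdf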
tools/aws_functions.py (CHANGED)

@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
tools/cli_redact.py (ADDED)

@@ -0,0 +1,83 @@
+import argparse
+import os
+from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
+from tools.file_redaction import choose_and_run_redactor
+import pandas as pd
+from datetime import datetime
+
+chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
+                              'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
+                              'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
+                              'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
+                              'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
+                              'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
+chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
+                          "STREETNAME", "UKPOSTCODE"]
+
+def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
+         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
+         current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]):
+
+    if output_file_list is None:
+        output_file_list = []
+    if log_files_list is None:
+        log_files_list = []
+
+    parser = argparse.ArgumentParser(description='Redact PII from documents via command line')
+
+    # Required arguments
+    parser.add_argument('--input_file', help='Path to input file (PDF, JPG, or PNG)')
+
+    # Optional arguments with defaults matching the GUI app
+    parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
+                        default='Quick image analysis', help='OCR method to use')
+    parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
+                        default='Local', help='PII detection method')
+    parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
+    parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
+    parser.add_argument('--allow_list', help='Path to allow list CSV file')
+    parser.add_argument('--output_dir', default='output', help='Output directory')
+
+    args = parser.parse_args()
+
+    # Ensure output directory exists
+    ensure_output_folder_exists()
+
+    # Create file object similar to what Gradio provides
+    file_obj = {"name": args.input_file}
+
+    # Load allow list if provided
+    allow_list_df = pd.DataFrame()
+    if args.allow_list:
+        allow_list_df = pd.read_csv(args.allow_list)
+
+    # Get file names
+    file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
+
+    # Initialize empty states for PDF processing
+
+    # Prepare PDF/image
+    output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations = prepare_image_or_pdf(
+        file_obj, args.ocr_method, allow_list_df, latest_file_completed,
+        output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
+    )
+
+    output_summary, output_files, output_file_list, latest_file_completed, log_files, \
+    log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations, \
+    current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table, \
+    comprehend_query_num = choose_and_run_redactor(
+        file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
+        chosen_comprehend_entities, args.ocr_method, allow_list_df,
+        latest_file_completed, output_summary, output_file_list, log_files_list,
+        first_loop_state, args.page_min, args.page_max, estimated_time,
+        handwrite_signature_checkbox, textract_metadata, all_image_annotations,
+        all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
+        current_loop_page, page_break, args.pii_detector, comprehend_query_num
+    )
+
+    print(f"\nRedaction complete. Output summary:\n{output_summary}")
+    print(f"\nOutput files saved to: {args.output_dir}")
+
+if __name__ == "__main__":
+    main()
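
Because main() pulls its options from argparse, it can also be driven programmatically by setting sys.argv before the call. This is a sketch under two assumptions that are not part of the commit: the repository root is on the Python path, and example.pdf exists.

# Hypothetical programmatic call into the new CLI entry point.
import sys
from tools.cli_redact import main

sys.argv = ["cli_redact", "--input_file", "example.pdf", "--output_dir", "output"]
main()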
tools/file_conversion.py (CHANGED)

@@ -9,6 +9,7 @@ import gradio as gr
 import time
 import json
 import pymupdf
+from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
 

@@ -47,6 +48,8 @@ def is_pdf(filename):
 
 def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
+    print("pdf_path in convert_pdf_to_images:", pdf_path)
+
    # Get the number of pages in the PDF
    page_count = pdfinfo_from_path(pdf_path)['Pages']
    print("Number of pages in PDF: ", str(page_count))

@@ -55,7 +58,9 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag
 
    # Open the PDF file
    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
-    for page_num in
+    for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+
+        print("page_num in convert_pdf_to_images:", page_num)
 
        print("Converting page: ", str(page_num + 1))
 

@@ -98,7 +103,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = imag
    return images
 
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
-def process_file(file_path):
+def process_file(file_path:str):
    # Get the file extension
    file_extension = os.path.splitext(file_path)[1].lower()
 

@@ -130,7 +135,9 @@ def get_input_file_names(file_input):
    file_name_with_extension = ""
    full_file_name = ""
 
-
+    print("file_input in input file names:", file_input)
+    if isinstance(file_input, dict):
+        file_input = os.path.abspath(file_input["name"])
 
    if isinstance(file_input, str):
        file_input_list = [file_input]

@@ -225,6 +232,9 @@ def prepare_image_or_pdf(
    if not file_paths:
        file_paths = []
 
+    if isinstance(file_paths, dict):
+        file_paths = os.path.abspath(file_paths["name"])
+
    if isinstance(file_paths, str):
        file_path_number = 1
    else:

@@ -277,8 +287,9 @@
 
        file_extension = os.path.splitext(file_path)[1].lower()
 
-
-        if
+
+        # Check if the file is an image type and the user selected text ocr option
+        if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
            in_redact_method = tesseract_ocr_option
 
 

@@ -333,6 +344,9 @@
            json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
            continue
 
+
+        print("in_redact_method:", in_redact_method)
+
        # Convert pdf/image file to correct format for redaction
        if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
            if is_pdf_or_image(file_path) == False:

@@ -340,6 +354,9 @@
                print(out_message)
                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
+            print("In correct preparation area.")
+
+            print("file_path at process_file:", file_path)
            converted_file_path = process_file(file_path)
            image_file_path = converted_file_path
 
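
The preparation functions now accept either a plain path string or a Gradio-style dict that carries the path under "name". The helper below is hypothetical (not in the commit) and just mirrors that normalisation for illustration.

# Sketch of the two input shapes now handled by get_input_file_names / prepare_image_or_pdf.
import os

def normalise_file_input(file_input):
    # Gradio file objects arrive as dicts with the path under "name".
    if isinstance(file_input, dict):
        return os.path.abspath(file_input["name"])
    return file_input

print(normalise_file_input("example.pdf"))            # plain path, e.g. from the CLI
print(normalise_file_input({"name": "example.pdf"}))  # Gradio-style file object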
tools/file_redaction.py (CHANGED)

@@ -180,8 +180,12 @@ def choose_and_run_redactor(file_paths:List[str],
        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
    # Create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+
    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list[0].tolist()
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
        print("In allow list:", in_allow_list_flat)
    else:
        in_allow_list_flat = []

@@ -215,12 +219,18 @@ def choose_and_run_redactor(file_paths:List[str],
    progress(0.5, desc="Redacting file")
 
    if isinstance(file_paths, str):
-        file_paths_list = [file_paths]
+        file_paths_list = [os.path.abspath(file_paths)]
+        file_paths_loop = file_paths_list
+    elif isinstance(file_paths, dict):
+        file_paths = file_paths["name"]
+        file_paths_list = [os.path.abspath(file_paths)]
        file_paths_loop = file_paths_list
    else:
        file_paths_list = file_paths
        file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 
+    print("file_paths_list in choose_redactor function:", file_paths_list)
+
 
    for file in file_paths_loop:
        if isinstance(file, str):
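
choose_and_run_redactor now also accepts the allow list as a CSV file path, reading it into a DataFrame and flattening the first column into a plain list. A minimal sketch of that path follows, assuming a placeholder allow_list.csv and the pd.read_csv defaults used in the commit.

# Sketch of the new allow-list handling in choose_and_run_redactor.
import pandas as pd

in_allow_list = pd.read_csv("allow_list.csv")  # path string, as now allowed
if not in_allow_list.empty:
    in_allow_list_flat = in_allow_list.iloc[:, 0].tolist()
else:
    in_allow_list_flat = []
print(in_allow_list_flat)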
tools/redaction_review.py (CHANGED)

@@ -72,7 +72,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
 
        return out_image_annotator, number_reported, number_reported
 
-    print("page_num at start of update_annotator function:", page_num)
+    #print("page_num at start of update_annotator function:", page_num)
 
    if page_num is None:
        page_num = 0