Spaces:

seanpedrickcase
/

document_redaction

Running

File size: 3,713 Bytes

e5dfae7
 
 
18fb7ec
 
e5dfae7
a3ba5e2
 
 
 
 
 
18fb7ec
a3ba5e2
e5dfae7
 
 
 
 
 
 
 
 
 
 
 
18fb7ec
9337aae
 
 
 
18fb7ec
 
 
9337aae
 
 
 
e5dfae7
 
9337aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5dfae7
 
9337aae
e5dfae7
 
9337aae
e5dfae7
 
 
 
9337aae
 
 
 
e5dfae7
 
 
9337aae
 
 
 
 
 
e5dfae7

import os
import posixpath
import subprocess
from urllib.parse import unquote_plus

import boto3

# Base directory under which the handler stages its "input" and "output" folders.
TMP_DIR = "/tmp/"

# Import-time marker so the logs show that the module itself was loaded.
print("In lambda_entrypoint function")

# Create the shared S3 client once at import time; the download/upload
# helpers below all go through this single module-level client. A failure
# here is logged and re-raised so the module does not load half-initialised.
try:
    s3_client = boto3.client(service_name="s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as init_error:
    print(f"Error initializing s3_client: {init_error}")
    raise init_error

def download_file_from_s3(bucket_name, key, download_path):
    """Copy the S3 object ``key`` in ``bucket_name`` to ``download_path``.

    Uses the module-level ``s3_client`` and logs the transfer on success.
    Any error raised by the client propagates to the caller.
    """
    s3_client.download_file(
        Bucket=bucket_name,
        Key=key,
        Filename=download_path,
    )
    message = f"Downloaded {key} to {download_path}"
    print(message)

def upload_file_to_s3(file_path, bucket_name, key):
    """Store the local file ``file_path`` as object ``key`` in ``bucket_name``.

    Uses the module-level ``s3_client`` and logs the transfer on success.
    Any error raised by the client propagates to the caller.
    """
    s3_client.upload_file(
        Filename=file_path,
        Bucket=bucket_name,
        Key=key,
    )
    message = f"Uploaded {file_path} to {key}"
    print(message)

def lambda_handler(event, context):
    """Run the redaction app on every S3 object referenced by ``event``.

    For each record the input object is downloaded under ``TMP_DIR``,
    ``app.py`` is run on it as a subprocess, and every file found in the
    output directory is uploaded back to the same bucket under
    ``<folder of the input key>/output/``.

    Args:
        event: Invocation payload (dict). Keys read:

            * ``Records`` (optional): list of records; the bucket is taken
              from ``s3.bucket.name`` and the object key from
              ``s3.object.key``. When absent, a single empty record is
              processed so that ``arguments`` alone can name the input.
            * ``arguments`` (optional): dict with ``input_file`` (object key
              used when the record has none), ``ocr_method``,
              ``pii_detector``, ``page_min``, ``page_max``, ``allow_list``
              (object key of an allow-list file in the same bucket) and
              ``output_dir``.
        context: Lambda context object; not used.

    Returns:
        ``{"statusCode": 200, "body": "Processing complete."}`` after all
        records have been processed.

    Raises:
        ValueError: If a record yields no bucket name or no object key.
        subprocess.CalledProcessError: If ``app.py`` exits with a non-zero
            status (its stderr is printed first).
    """
    print("In lambda_handler function")
    # Create necessary directories
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    # Use .get() here as well: indexing event["Records"] would raise KeyError
    # before the fallback below ever had a chance to apply.
    records = event.get("Records", [{}])
    print("Got to record loop")
    print("Event records is:", event.get("Records"))

    # The optional arguments are per-invocation, not per-record, so parse
    # them once instead of on every loop iteration.
    arguments = event.get("arguments", {})
    ocr_method = arguments.get(
        "ocr_method",
        "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
    )
    pii_detector = arguments.get("pii_detector", "AWS Comprehend")
    # Command-line arguments must be strings.
    page_min = str(arguments.get("page_min", 0))
    page_max = str(arguments.get("page_max", 0))
    allow_list = arguments.get("allow_list", None)
    output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))

    # Extract S3 bucket and object key from the Records
    for record in records:
        s3_info = record.get("s3", {})
        bucket_name = s3_info.get("bucket", {}).get("name")
        raw_key = s3_info.get("object", {}).get("key")

        if raw_key:
            # S3 event notifications URL-encode object keys (a space arrives
            # as "+"), so decode before handing the key back to the S3 API.
            input_key = unquote_plus(raw_key)
        else:
            input_key = arguments.get("input_file", "")

        if not bucket_name or not input_key:
            # Fail early with a readable message instead of an opaque error
            # from the S3 client further down.
            raise ValueError(
                "No S3 bucket name and/or object key found in the event "
                "(expected Records[].s3 or arguments['input_file'])."
            )

        print(f"Processing file {input_key} from bucket {bucket_name}")
        print(f"OCR Method: {ocr_method}")
        print(f"PII Detector: {pii_detector}")
        print(f"Page Range: {page_min} - {page_max}")
        print(f"Allow List: {allow_list}")
        print(f"Output Directory: {output_dir}")

        # Download input file (S3 keys always use "/" separators)
        input_file_path = os.path.join(
            TMP_DIR, "input", posixpath.basename(input_key)
        )
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Construct command (argument list, so no shell quoting is involved)
        command = [
            "python",
            "app.py",
            "--input_file", input_file_path,
            "--ocr_method", ocr_method,
            "--pii_detector", pii_detector,
            "--page_min", page_min,
            "--page_max", page_max,
            "--output_dir", output_dir,
        ]

        # Add allow_list only if provided
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        try:
            result = subprocess.run(
                command, capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            raise
        print("Processing succeeded:", result.stdout)

        # Upload output files back to S3.
        # posixpath.join drops an empty prefix, so a top-level input key gives
        # "output/<file>" rather than "/output/<file>" with a leading slash.
        # NOTE(review): output_dir is never emptied here, so files already
        # present in it are uploaded again under this record's prefix -
        # confirm whether a clean-up step is needed.
        # NOTE(review): results go back into the triggering bucket; confirm
        # the trigger is filtered so these uploads do not re-invoke the
        # function.
        key_prefix = posixpath.dirname(input_key)
        for root, _, files in os.walk(output_dir):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                output_key = posixpath.join(key_prefix, "output", file_name)
                upload_file_to_s3(local_file_path, bucket_name, output_key)

    return {"statusCode": 200, "body": "Processing complete."}