File size: 4,585 Bytes
e5dfae7
 
 
18fb7ec
 
e5dfae7
a3ba5e2
 
 
 
 
 
18fb7ec
a3ba5e2
e5dfae7
6622361
 
1cfa6e8
 
 
 
 
 
 
 
 
 
e5dfae7
 
 
 
 
 
 
 
 
 
 
6622361
18fb7ec
1cfa6e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653bd2d
 
1cfa6e8
 
5d649ba
68a91f4
1cfa6e8
 
 
653bd2d
 
 
1cfa6e8
e85b74e
68a91f4
1cfa6e8
 
 
e85b74e
1cfa6e8
e85b74e
 
1cfa6e8
e5dfae7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import subprocess
from urllib.parse import unquote_plus

import boto3

print("In lambda_entrypoint function")

# Initialize the S3 client once at module load so warm Lambda invocations reuse it.
try:
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as e:
    print(f"Error initializing s3_client: {e}")
    # Bare raise preserves the original exception and traceback unchanged.
    raise

# Scratch space; /tmp is the only writable filesystem inside AWS Lambda.
TMP_DIR = "/tmp/"

# RUN_DIRECT_MODE selects between serving the Gradio UI ("0", the default)
# and headless processing via lambda_handler (any other value).
run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")

if run_direct_mode == "0":
    # Gradio App execution
    from app import app, max_queue_size, max_file_size  # Replace with actual import if needed
    from tools.auth import authenticate_user

    # COGNITO_AUTH=1 gates the UI behind the project's authenticate_user callback.
    if os.getenv("COGNITO_AUTH", "0") == "1":
        app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
    else:
        app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)

def download_file_from_s3(bucket_name, key, download_path):
    """Fetch object ``key`` from ``bucket_name`` and store it at ``download_path``."""
    s3_client.download_file(bucket_name, key, download_path)
    message = f"Downloaded {key} to {download_path}"
    print(message)

def upload_file_to_s3(file_path, bucket_name, key):
    """Push the local file at ``file_path`` into ``bucket_name`` under ``key``."""
    s3_client.upload_file(file_path, bucket_name, key)
    message = f"Uploaded {file_path} to {key}"
    print(message)

def lambda_handler(event, context):
    """Process S3-triggered events: download each input file, run app.py over it,
    and upload the resulting output files back to the triggering bucket.

    Parameters
    ----------
    event : dict
        Lambda event. S3 notification "Records" supply the bucket/key; an
        optional "arguments" dict overrides processing options (input_file,
        ocr_method, pii_detector, page_min, page_max, allow_list, output_dir).
    context : object
        Lambda context object (unused).

    Returns
    -------
    dict
        ``{"statusCode": 200, "body": "Processing complete."}`` on success.

    Raises
    ------
    subprocess.CalledProcessError
        If the app.py subprocess exits non-zero.
    """

    print("In lambda_handler function")

    # Create necessary directories (idempotent across warm invocations).
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    print("Got to record loop")
    # .get avoids a KeyError on direct invocations that carry no "Records".
    print("Event records is:", event.get("Records"))

    # Extract S3 bucket and object key from the Records
    for record in event.get("Records", [{}]):
        bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
        raw_key = record.get("s3", {}).get("object", {}).get("key")
        # S3 event notifications URL-encode object keys ('+' for spaces, %XX
        # escapes); decode so downloads and path handling see the real key.
        input_key = unquote_plus(raw_key) if raw_key else raw_key
        print(f"Processing file {input_key} from bucket {bucket_name}")

        # Extract additional arguments
        arguments = event.get("arguments", {})

        # Direct invocations (no S3 record) may name the input file explicitly.
        if not input_key:
            input_key = arguments.get("input_file", "")

        ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
        pii_detector = arguments.get("pii_detector", "AWS Comprehend")
        # app.py takes page bounds as strings; 0 presumably means "no limit" — TODO confirm.
        page_min = str(arguments.get("page_min", 0))
        page_max = str(arguments.get("page_max", 0))
        allow_list = arguments.get("allow_list", None)
        output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))

        print(f"OCR Method: {ocr_method}")
        print(f"PII Detector: {pii_detector}")
        print(f"Page Range: {page_min} - {page_max}")
        print(f"Allow List: {allow_list}")
        print(f"Output Directory: {output_dir}")

        # Download input file
        input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Construct command; list form (shell=False) keeps user-controlled
        # values from being interpreted by a shell.
        command = [
            "python",
            "app.py",
            "--input_file", input_file_path,
            "--ocr_method", ocr_method,
            "--pii_detector", pii_detector,
            "--page_min", page_min,
            "--page_max", page_max,
            "--output_dir", output_dir,
        ]

        # Add allow_list only if provided; it is an S3 key, fetched locally first.
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        print(f"Running command: {command}")

        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
            print("Processing succeeded.")
            print(result.stdout)
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            # Bare raise preserves the original traceback.
            raise
        except Exception as e:
            print(f"Unexpected error: {str(e)}")
            raise

        print("Now uploading files from:", output_dir)

        # Upload output files back to S3 under a flat "output/" prefix.
        # NOTE(review): nested outputs with duplicate basenames would collide
        # under this flat key scheme — confirm flat layout is intended.
        for root, _, files in os.walk(output_dir):
            for file_name in files:
                print("file_name:", file_name)
                local_file_path = os.path.join(root, file_name)
                output_key = f"output/{file_name}"
                print("Output location is:", output_key)
                upload_file_to_s3(local_file_path, bucket_name, output_key)

    return {"statusCode": 200, "body": "Processing complete."}