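"""AWS Lambda entrypoint: on an S3 event, download the uploaded file to /tmp,
run app.py with the requested OCR method and PII detector, and upload the
resulting output files back to the source bucket under an output/ prefix."""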
import boto3
import os
import subprocess
from urllib.parse import unquote_plus

print("In lambda_entrypoint function")

try:
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as e:
    print(f"Error initializing s3_client: {e}")
    raise e

TMP_DIR = "/tmp/"

def download_file_from_s3(bucket_name, key, download_path):
    """Download a file from S3 to the local filesystem."""
    s3_client.download_file(bucket_name, key, download_path)
    print(f"Downloaded {key} to {download_path}")

def upload_file_to_s3(file_path, bucket_name, key):
    """Upload a file to S3."""
    s3_client.upload_file(file_path, bucket_name, key)
    print(f"Uploaded {file_path} to {key}")

def lambda_handler(event, context):
    print("In lambda_handler function")
    # Create necessary directories
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    print("Got to record loop")
    print("Event records is:", event["Records"])

    # Extract S3 bucket and object key from the Records
    for record in event.get("Records", [{}]):
        bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
        # S3 event notification keys are URL-encoded; decode before use
        input_key = unquote_plus(record.get("s3", {}).get("object", {}).get("key", ""))
        print(f"Processing file {input_key} from bucket {bucket_name}")

        # Extract additional arguments
        arguments = event.get("arguments", {})

        if not input_key:
            input_key = arguments.get("input_file", "")

        ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
        pii_detector = arguments.get("pii_detector", "AWS Comprehend")
        page_min = str(arguments.get("page_min", 0))
        page_max = str(arguments.get("page_max", 0))
        allow_list = arguments.get("allow_list", None)
        output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))
        
        print(f"OCR Method: {ocr_method}")
        print(f"PII Detector: {pii_detector}")
        print(f"Page Range: {page_min} - {page_max}")
        print(f"Allow List: {allow_list}")
        print(f"Output Directory: {output_dir}")

        # Download input file
        input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Construct command
        command = [
            "python",
            "app.py",
            "--input_file", input_file_path,
            "--ocr_method", ocr_method,
            "--pii_detector", pii_detector,
            "--page_min", page_min,
            "--page_max", page_max,
            "--output_dir", output_dir,
        ]
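
        # Illustrative shape of the resulting call (values below are placeholders,
        # not the actual defaults):
        #   python app.py --input_file /tmp/input/<file> --ocr_method <ocr_method> \
        #       --pii_detector <pii_detector> --page_min 0 --page_max 0 \
        #       --output_dir /tmp/output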

        # Add allow_list only if provided
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
            print("Processing succeeded:", result.stdout)
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            raise e

        # Upload output files back to S3
        for root, _, files in os.walk(output_dir):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                # os.path.join avoids a leading "/" when input_key has no prefix
                output_key = os.path.join(os.path.dirname(input_key), "output", file_name)
                upload_file_to_s3(local_file_path, bucket_name, output_key)

    return {"statusCode": 200, "body": "Processing complete."}
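
# ---------------------------------------------------------------------------
# Local test sketch (illustrative, not part of the Lambda deployment): a
# minimal S3 put-event payload for exercising lambda_handler outside AWS.
# The bucket and key below are placeholders; running this still requires
# valid AWS credentials and for the referenced object to exist. An optional
# top-level "arguments" dict can override the defaults read above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_event = {
        "Records": [
            {
                "s3": {
                    "bucket": {"name": "example-redaction-bucket"},  # placeholder
                    "object": {"key": "input/example.pdf"},          # placeholder
                }
            }
        ]
    }
    print(lambda_handler(sample_event, context=None))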