File size: 3,713 Bytes
e5dfae7 18fb7ec e5dfae7 a3ba5e2 18fb7ec a3ba5e2 e5dfae7 18fb7ec 9337aae 18fb7ec 9337aae e5dfae7 9337aae e5dfae7 9337aae e5dfae7 9337aae e5dfae7 9337aae e5dfae7 9337aae e5dfae7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import boto3
import os
import subprocess
# Runs once when this module is imported; the message marks that point in the logs.
print("In lambda_entrypoint function")

# Local scratch directory under which lambda_handler creates its
# "input" and "output" working folders.
TMP_DIR = "/tmp/"

# Create the shared S3 client up front. If it cannot be built the failure is
# logged and re-raised, so the module import itself fails.
try:
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as init_error:
    print(f"Error initializing s3_client: {init_error}")
    raise init_error
def download_file_from_s3(bucket_name, key, download_path):
    """Fetch an S3 object and store it on the local filesystem.

    Args:
        bucket_name: Name of the bucket that holds the object.
        key: Key of the object to fetch.
        download_path: Local path the object is written to.

    The transfer goes through the module-level ``s3_client``; a confirmation
    line is printed once the call returns.
    """
    s3_client.download_file(
        Bucket=bucket_name,
        Key=key,
        Filename=download_path,
    )
    print(f"Downloaded {key} to {download_path}")
def upload_file_to_s3(file_path, bucket_name, key):
    """Send a local file to S3.

    Args:
        file_path: Path of the local file to send.
        bucket_name: Name of the destination bucket.
        key: Key the object is stored under.

    The transfer goes through the module-level ``s3_client``; a confirmation
    line is printed once the call returns.
    """
    s3_client.upload_file(
        Filename=file_path,
        Bucket=bucket_name,
        Key=key,
    )
    print(f"Uploaded {file_path} to {key}")
def lambda_handler(event, context):
    """Run ``app.py`` on every S3 object referenced by the event.

    For each record the input file is downloaded to ``TMP_DIR/input``,
    ``app.py`` is executed on it as a subprocess, and every file found under
    the output directory is uploaded back to the source bucket under
    ``<folder of the input key>/output/``.

    Args:
        event: Invocation payload. ``event["Records"]`` is read in the shape
            of an S3 event notification (``s3.bucket.name`` and
            ``s3.object.key``). An optional ``event["arguments"]`` dict may
            supply ``input_file`` (used when a record carries no object key),
            ``ocr_method``, ``pii_detector``, ``page_min``, ``page_max``,
            ``allow_list`` (key of an allow-list file in the same bucket) and
            ``output_dir``.
        context: Lambda context object; not used.

    Returns:
        ``{"statusCode": 200, "body": "Processing complete."}`` after all
        records have been processed.

    Raises:
        ValueError: If a record yields no bucket name or no input key.
        subprocess.CalledProcessError: If ``app.py`` exits with a non-zero
            status.
    """
    # Function-scope import so this handler does not depend on edits to the
    # file's import block.
    from urllib.parse import unquote_plus

    print("In lambda_handler function")

    # Create necessary directories
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    print("Got to record loop")
    # Use .get() here: indexing event["Records"] would raise KeyError for
    # events without records and defeat the fallback used by the loop below.
    print("Event records is:", event.get("Records"))

    # The extra arguments belong to the event, not to a record, so read them
    # once instead of on every iteration.
    arguments = event.get("arguments") or {}
    ocr_method = arguments.get(
        "ocr_method",
        "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
    )
    pii_detector = arguments.get("pii_detector", "AWS Comprehend")
    # subprocess arguments must be strings.
    page_min = str(arguments.get("page_min", 0))
    page_max = str(arguments.get("page_max", 0))
    allow_list = arguments.get("allow_list", None)
    output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))

    print(f"OCR Method: {ocr_method}")
    print(f"PII Detector: {pii_detector}")
    print(f"Page Range: {page_min} - {page_max}")
    print(f"Allow List: {allow_list}")
    print(f"Output Directory: {output_dir}")

    # Extract S3 bucket and object key from the Records. The [{}] default
    # yields one empty record so the "arguments" fallback can still run.
    for record in event.get("Records", [{}]):
        s3_info = record.get("s3", {})
        bucket_name = s3_info.get("bucket", {}).get("name")
        input_key = s3_info.get("object", {}).get("key")

        if input_key:
            # Object keys in S3 event notifications are URL-encoded
            # (e.g. a space arrives as "+"); decode before using the key.
            input_key = unquote_plus(input_key)
        else:
            input_key = arguments.get("input_file", "")

        print(f"Processing file {input_key} from bucket {bucket_name}")

        # Fail early with a clear message instead of handing empty values
        # to the S3 client.
        if not bucket_name or not input_key:
            raise ValueError(
                "Event does not identify an S3 bucket and input file "
                f"(bucket={bucket_name!r}, key={input_key!r})"
            )

        # Download input file
        input_file_path = os.path.join(
            TMP_DIR, "input", os.path.basename(input_key)
        )
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Construct command (argument list, no shell involved)
        command = [
            "python",
            "app.py",
            "--input_file", input_file_path,
            "--ocr_method", ocr_method,
            "--pii_detector", pii_detector,
            "--page_min", page_min,
            "--page_max", page_max,
            "--output_dir", output_dir,
        ]

        # Add allow_list only if provided
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        try:
            result = subprocess.run(
                command, capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            raise
        print("Processing succeeded:", result.stdout)

        # Upload output files back to S3. A key without a folder part must
        # not produce a leading "/" in the destination key.
        key_folder = os.path.dirname(input_key)
        output_prefix = f"{key_folder}/output" if key_folder else "output"

        # NOTE(review): this function never empties output_dir, so files left
        # there by an earlier record are uploaded again - confirm whether
        # app.py clears it.
        for root, _, files in os.walk(output_dir):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                output_key = f"{output_prefix}/{file_name}"
                upload_file_to_s3(local_file_path, bucket_name, output_key)

    return {"statusCode": 200, "body": "Processing complete."}