document_redaction / lambda_entrypoint.py
seanpedrickcase's picture
Some more debugging. Added aws-lambda-adapter just in case that's useful in AWS Lambda
a3ba5e2
raw
history blame
3.71 kB
import os
import subprocess
import urllib.parse

import boto3
# Module-level init runs once per Lambda cold start; the S3 client is
# created here so warm invocations reuse the same connection.
print("In lambda_entrypoint function")
try:
    # NOTE(review): region is hard-coded to eu-west-2 — confirm this matches
    # the deployment region (Lambda normally exposes AWS_REGION in the env).
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as e:
    # Fail the cold start loudly if the client cannot be built
    # (e.g. missing credentials); the invocation cannot proceed without S3.
    print(f"Error initializing s3_client: {e}")
    raise e

# /tmp is the only writable filesystem inside the Lambda sandbox.
TMP_DIR = "/tmp/"
def download_file_from_s3(bucket_name, key, download_path):
    """Fetch the object *key* from *bucket_name* into *download_path* on local disk."""
    client = s3_client  # module-level client created at cold start
    client.download_file(bucket_name, key, download_path)
    print(f"Downloaded {key} to {download_path}")
def upload_file_to_s3(file_path, bucket_name, key):
    """Push the local file at *file_path* to S3 under *key* in *bucket_name*."""
    client = s3_client  # module-level client created at cold start
    client.upload_file(file_path, bucket_name, key)
    print(f"Uploaded {file_path} to {key}")
def lambda_handler(event, context):
    """Entry point for the document-redaction Lambda.

    Expects an S3 event notification (``event["Records"]``) and optional
    overrides under ``event["arguments"]``. For each record it downloads
    the object to /tmp, runs ``app.py`` as a subprocess to redact it, and
    uploads every file found in the output directory back to the source
    bucket under an ``output/`` prefix next to the input key.

    Parameters
    ----------
    event : dict
        Lambda invocation payload (S3 notification and/or ``arguments``).
    context : LambdaContext
        Standard Lambda context object (unused).

    Returns
    -------
    dict
        ``{"statusCode": 200, "body": ...}`` on success.

    Raises
    ------
    subprocess.CalledProcessError
        If the redaction subprocess exits non-zero.
    """
    print("In lambda_handler function")

    # Working directories inside Lambda's writable /tmp space.
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    print("Got to record loop")
    # .get() instead of event["Records"]: a direct (non-S3) invocation
    # without Records must not raise KeyError here — the loop below
    # already tolerates a missing Records key.
    print("Event records is:", event.get("Records"))

    for record in event.get("Records", [{}]):
        bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
        input_key = record.get("s3", {}).get("object", {}).get("key")
        # S3 event notifications URL-encode object keys (spaces arrive as
        # '+', etc.); decode before using the key against the S3 API,
        # otherwise downloads of keys with special characters fail.
        if input_key:
            input_key = urllib.parse.unquote_plus(input_key)
        print(f"Processing file {input_key} from bucket {bucket_name}")

        # Optional overrides supplied by the caller.
        arguments = event.get("arguments", {})
        if not input_key:
            input_key = arguments.get("input_file", "")
        ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
        pii_detector = arguments.get("pii_detector", "AWS Comprehend")
        page_min = str(arguments.get("page_min", 0))
        page_max = str(arguments.get("page_max", 0))
        allow_list = arguments.get("allow_list", None)
        output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))

        print(f"OCR Method: {ocr_method}")
        print(f"PII Detector: {pii_detector}")
        print(f"Page Range: {page_min} - {page_max}")
        print(f"Allow List: {allow_list}")
        print(f"Output Directory: {output_dir}")

        # Download the object to be redacted.
        input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Build the redaction command as an argument list (shell=False),
        # so user-controlled values cannot be shell-injected.
        command = [
            "python",
            "app.py",
            "--input_file", input_file_path,
            "--ocr_method", ocr_method,
            "--pii_detector", pii_detector,
            "--page_min", page_min,
            "--page_max", page_max,
            "--output_dir", output_dir,
        ]

        # allow_list is an S3 key pointing at a CSV of terms to leave
        # unredacted; only forward it when the caller provided one.
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
            print("Processing succeeded:", result.stdout)
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            # Bare raise preserves the original traceback so Lambda marks
            # the invocation as failed with full context.
            raise

        # Mirror every produced file back to S3 next to the input key.
        for root, _, files in os.walk(output_dir):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                output_key = f"{os.path.dirname(input_key)}/output/{file_name}"
                upload_file_to_s3(local_file_path, bucket_name, output_key)

    return {"statusCode": 200, "body": "Processing complete."}