seanpedrickcase committed on
Commit
9337aae
1 Parent(s): 35e6d45

Added lambda_entrypoint.py to main folder

Browse files
Files changed (1) hide show
  1. lambda_entrypoint.py +39 -20
lambda_entrypoint.py CHANGED
@@ -4,6 +4,7 @@ import subprocess
4
  from urllib.parse import unquote_plus
5
 
6
  s3_client = boto3.client("s3")
 
7
 
8
  def download_file_from_s3(bucket_name, key, download_path):
9
  """Download a file from S3 to the local filesystem."""
@@ -16,39 +17,57 @@ def upload_file_to_s3(file_path, bucket_name, key):
16
  print(f"Uploaded {file_path} to {key}")
17
 
18
  def lambda_handler(event, context):
19
- """Main Lambda function handler."""
20
- # Parse the S3 event
21
- for record in event["Records"]:
22
- bucket_name = record["s3"]["bucket"]["name"]
23
- input_key = unquote_plus(record["s3"]["object"]["key"])
 
 
 
24
  print(f"Processing file {input_key} from bucket {bucket_name}")
25
 
26
- # Prepare paths
27
- input_file_path = f"/tmp/{os.path.basename(input_key)}"
28
- allow_list_path = f"/tmp/allow_list.csv" # Adjust this as needed
29
- output_dir = "/tmp/output"
30
- os.makedirs(output_dir, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # Download input file
 
33
  download_file_from_s3(bucket_name, input_key, input_file_path)
34
 
35
- # (Optional) Download allow_list if needed
36
- allow_list_key = "path/to/allow_list.csv" # Adjust path as required
37
- download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
38
-
39
- # Construct and run the command
40
  command = [
41
  "python",
42
  "app.py",
43
  "--input_file", input_file_path,
44
- "--ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
45
- "--pii_detector", "AWS Comprehend",
46
- "--page_min", "0",
47
- "--page_max", "0",
48
- "--allow_list", allow_list_path,
49
  "--output_dir", output_dir,
50
  ]
51
 
 
 
 
 
 
 
52
  try:
53
  result = subprocess.run(command, capture_output=True, text=True, check=True)
54
  print("Processing succeeded:", result.stdout)
 
4
  from urllib.parse import unquote_plus
5
 
6
  s3_client = boto3.client("s3")
7
+ TMP_DIR = "/tmp" # Use absolute path
8
 
9
  def download_file_from_s3(bucket_name, key, download_path):
10
  """Download a file from S3 to the local filesystem."""
 
17
  print(f"Uploaded {file_path} to {key}")
18
 
19
  def lambda_handler(event, context):
20
+ # Create necessary directories
21
+ os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
22
+ os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)
23
+
24
+ # Extract S3 bucket and object key from the Records
25
+ for record in event.get("Records", [{}]):
26
+ bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
27
+ input_key = record.get("s3", {}).get("object", {}).get("key")
28
  print(f"Processing file {input_key} from bucket {bucket_name}")
29
 
30
+ # Extract additional arguments
31
+ arguments = event.get("arguments", {})
32
+
33
+ if not input_key:
34
+ input_key = arguments.get("input_file", "")
35
+
36
+ ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
37
+ pii_detector = arguments.get("pii_detector", "AWS Comprehend")
38
+ page_min = str(arguments.get("page_min", 0))
39
+ page_max = str(arguments.get("page_max", 0))
40
+ allow_list = arguments.get("allow_list", None)
41
+ output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))
42
+
43
+ print(f"OCR Method: {ocr_method}")
44
+ print(f"PII Detector: {pii_detector}")
45
+ print(f"Page Range: {page_min} - {page_max}")
46
+ print(f"Allow List: {allow_list}")
47
+ print(f"Output Directory: {output_dir}")
48
 
49
  # Download input file
50
+ input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
51
  download_file_from_s3(bucket_name, input_key, input_file_path)
52
 
53
+ # Construct command
 
 
 
 
54
  command = [
55
  "python",
56
  "app.py",
57
  "--input_file", input_file_path,
58
+ "--ocr_method", ocr_method,
59
+ "--pii_detector", pii_detector,
60
+ "--page_min", page_min,
61
+ "--page_max", page_max,
 
62
  "--output_dir", output_dir,
63
  ]
64
 
65
+ # Add allow_list only if provided
66
+ if allow_list:
67
+ allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
68
+ download_file_from_s3(bucket_name, allow_list, allow_list_path)
69
+ command.extend(["--allow_list", allow_list_path])
70
+
71
  try:
72
  result = subprocess.run(command, capture_output=True, text=True, check=True)
73
  print("Processing succeeded:", result.stdout)