seanpedrickcase committed on
Commit
68a91f4
1 Parent(s): e85b74e

Allow the default output folder to be overridden in the choose_and_run_redactor function.

Browse files
lambda_entrypoint.py CHANGED
@@ -98,7 +98,7 @@ def lambda_handler(event, context):
98
  try:
99
  result = subprocess.run(command, capture_output=True, text=True, check=True)
100
  print("Processing succeeded.")
101
- #print("Processing succeeded:", result.stdout)
102
  except subprocess.CalledProcessError as e:
103
  print("Error during processing:", e.stderr)
104
  raise e
@@ -107,7 +107,7 @@ def lambda_handler(event, context):
107
  raise e
108
 
109
  print("Now uploading files from:", output_dir)
110
-
111
  # Upload output files back to S3
112
  for root, _, files in os.walk(output_dir):
113
  for file_name in files:
 
98
  try:
99
  result = subprocess.run(command, capture_output=True, text=True, check=True)
100
  print("Processing succeeded.")
101
+ print(result.stdout)
102
  except subprocess.CalledProcessError as e:
103
  print("Error during processing:", e.stderr)
104
  raise e
 
107
  raise e
108
 
109
  print("Now uploading files from:", output_dir)
110
+
111
  # Upload output files back to S3
112
  for root, _, files in os.walk(output_dir):
113
  for file_name in files:
tools/cli_redact.py CHANGED
@@ -37,7 +37,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
37
  parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
38
  parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
39
  parser.add_argument('--allow_list', help='Path to allow list CSV file')
40
- parser.add_argument('--output_dir', default='output', help='Output directory')
41
 
42
  args = parser.parse_args()
43
 
@@ -73,7 +73,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
73
  first_loop_state, args.page_min, args.page_max, estimated_time,
74
  handwrite_signature_checkbox, textract_metadata, all_image_annotations,
75
  all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
76
- current_loop_page, page_break, args.pii_detector, comprehend_query_num
77
  )
78
 
79
  print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
 
37
  parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
38
  parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
39
  parser.add_argument('--allow_list', help='Path to allow list CSV file')
40
+ parser.add_argument('--output_dir', default='output/', help='Output directory')
41
 
42
  args = parser.parse_args()
43
 
 
73
  first_loop_state, args.page_min, args.page_max, estimated_time,
74
  handwrite_signature_checkbox, textract_metadata, all_image_annotations,
75
  all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
76
+ current_loop_page, page_break, args.pii_detector, comprehend_query_num, args.output_dir
77
  )
78
 
79
  print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
tools/file_redaction.py CHANGED
@@ -86,6 +86,7 @@ def choose_and_run_redactor(file_paths:List[str],
86
  page_break_return:bool=False,
87
  pii_identification_method:str="Local",
88
  comprehend_query_number:int=0,
 
89
  progress=gr.Progress(track_tqdm=True)):
90
  '''
91
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -116,6 +117,7 @@ def choose_and_run_redactor(file_paths:List[str],
116
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
117
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
118
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
119
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
120
 
121
  The function returns a redacted document along with processing logs.
@@ -216,6 +218,11 @@ def choose_and_run_redactor(file_paths:List[str],
216
  else:
217
  textract_client = ""
218
 
 
 
 
 
 
219
  progress(0.5, desc="Redacting file")
220
 
221
  if isinstance(file_paths, str):
@@ -255,8 +262,6 @@ def choose_and_run_redactor(file_paths:List[str],
255
 
256
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
257
 
258
-
259
-
260
  #Analyse and redact image-based pdf or image
261
  if is_pdf_or_image(file_path) == False:
262
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 
86
  page_break_return:bool=False,
87
  pii_identification_method:str="Local",
88
  comprehend_query_number:int=0,
89
+ output_folder:str=output_folder,
90
  progress=gr.Progress(track_tqdm=True)):
91
  '''
92
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
117
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
118
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
119
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
120
+ - output_folder (str, optional): Output folder for results.
121
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
122
 
123
  The function returns a redacted document along with processing logs.
 
218
  else:
219
  textract_client = ""
220
 
221
+ # Check if output_folder exists, create it if it doesn't
222
+ if not os.path.exists(output_folder):
223
+ os.makedirs(output_folder)
224
+
225
+
226
  progress(0.5, desc="Redacting file")
227
 
228
  if isinstance(file_paths, str):
 
262
 
263
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
264
 
 
 
265
  #Analyse and redact image-based pdf or image
266
  if is_pdf_or_image(file_path) == False:
267
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."