Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Nov 24, 2024

Commit

68a91f4

1 Parent(s): e85b74e

Allow the default output folder to be overridden in the choose_and_run_redactor function.

Browse files

Files changed (3) hide show

lambda_entrypoint.py +2 -2
tools/cli_redact.py +2 -2
tools/file_redaction.py +7 -2

lambda_entrypoint.py CHANGED Viewed

@@ -98,7 +98,7 @@ def lambda_handler(event, context):
         try:
             result = subprocess.run(command, capture_output=True, text=True, check=True)
             print("Processing succeeded.")
-            #print("Processing succeeded:", result.stdout)
         except subprocess.CalledProcessError as e:
             print("Error during processing:", e.stderr)
             raise e
@@ -107,7 +107,7 @@ def lambda_handler(event, context):
             raise e
         print("Now uploading files from:", output_dir)
         # Upload output files back to S3
         for root, _, files in os.walk(output_dir):
             for file_name in files:

         try:
             result = subprocess.run(command, capture_output=True, text=True, check=True)
             print("Processing succeeded.")
+            print(result.stdout)
         except subprocess.CalledProcessError as e:
             print("Error during processing:", e.stderr)
             raise e
             raise e
         print("Now uploading files from:", output_dir)
         # Upload output files back to S3
         for root, _, files in os.walk(output_dir):
             for file_name in files:

tools/cli_redact.py CHANGED Viewed

@@ -37,7 +37,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
     parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
     parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
     parser.add_argument('--allow_list', help='Path to allow list CSV file')
-    parser.add_argument('--output_dir', default='output', help='Output directory')
     args = parser.parse_args()
@@ -73,7 +73,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
         first_loop_state, args.page_min, args.page_max, estimated_time,
         handwrite_signature_checkbox, textract_metadata, all_image_annotations,
         all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
-        current_loop_page, page_break, args.pii_detector, comprehend_query_num
     )
     print(f"\nRedaction complete. Output file_list:\n{output_file_list}")

     parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
     parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
     parser.add_argument('--allow_list', help='Path to allow list CSV file')
+    parser.add_argument('--output_dir', default='output/', help='Output directory')
     args = parser.parse_args()
         first_loop_state, args.page_min, args.page_max, estimated_time,
         handwrite_signature_checkbox, textract_metadata, all_image_annotations,
         all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
+        current_loop_page, page_break, args.pii_detector, comprehend_query_num, args.output_dir
     )
     print(f"\nRedaction complete. Output file_list:\n{output_file_list}")

tools/file_redaction.py CHANGED Viewed

@@ -86,6 +86,7 @@ def choose_and_run_redactor(file_paths:List[str],
  page_break_return:bool=False,
  pii_identification_method:str="Local",
  comprehend_query_number:int=0,
  progress=gr.Progress(track_tqdm=True)):
     '''
     This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -116,6 +117,7 @@ def choose_and_run_redactor(file_paths:List[str],
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     The function returns a redacted document along with processing logs.
@@ -216,6 +218,11 @@ def choose_and_run_redactor(file_paths:List[str],
     else:
         textract_client = ""
     progress(0.5, desc="Redacting file")
     if isinstance(file_paths, str):
@@ -255,8 +262,6 @@ def choose_and_run_redactor(file_paths:List[str],
         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
             #Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."

  page_break_return:bool=False,
  pii_identification_method:str="Local",
  comprehend_query_number:int=0,
+ output_folder:str=output_folder,
  progress=gr.Progress(track_tqdm=True)):
     '''
     This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - output_folder (str, optional): Output folder for results.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     The function returns a redacted document along with processing logs.
     else:
         textract_client = ""
+    # Check if output_folder exists, create it if it doesn't
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
     progress(0.5, desc="Redacting file")
     if isinstance(file_paths, str):
         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
             #Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."