seanpedrickcase
committed on
Commit
•
68a91f4
1
Parent(s):
e85b74e
Allowed for overwriting of default output folder in choose_and_run_redactor function.
Browse files
- lambda_entrypoint.py +2 -2
- tools/cli_redact.py +2 -2
- tools/file_redaction.py +7 -2
lambda_entrypoint.py
CHANGED
@@ -98,7 +98,7 @@ def lambda_handler(event, context):
|
|
98 |
try:
|
99 |
result = subprocess.run(command, capture_output=True, text=True, check=True)
|
100 |
print("Processing succeeded.")
|
101 |
-
|
102 |
except subprocess.CalledProcessError as e:
|
103 |
print("Error during processing:", e.stderr)
|
104 |
raise e
|
@@ -107,7 +107,7 @@ def lambda_handler(event, context):
|
|
107 |
raise e
|
108 |
|
109 |
print("Now uploading files from:", output_dir)
|
110 |
-
|
111 |
# Upload output files back to S3
|
112 |
for root, _, files in os.walk(output_dir):
|
113 |
for file_name in files:
|
|
|
98 |
try:
|
99 |
result = subprocess.run(command, capture_output=True, text=True, check=True)
|
100 |
print("Processing succeeded.")
|
101 |
+
print(result.stdout)
|
102 |
except subprocess.CalledProcessError as e:
|
103 |
print("Error during processing:", e.stderr)
|
104 |
raise e
|
|
|
107 |
raise e
|
108 |
|
109 |
print("Now uploading files from:", output_dir)
|
110 |
+
|
111 |
# Upload output files back to S3
|
112 |
for root, _, files in os.walk(output_dir):
|
113 |
for file_name in files:
|
tools/cli_redact.py
CHANGED
@@ -37,7 +37,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
|
|
37 |
parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
|
38 |
parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
|
39 |
parser.add_argument('--allow_list', help='Path to allow list CSV file')
|
40 |
-
parser.add_argument('--output_dir', default='output', help='Output directory')
|
41 |
|
42 |
args = parser.parse_args()
|
43 |
|
@@ -73,7 +73,7 @@ def main(first_loop_state=True, latest_file_completed=0, output_summary="", outp
|
|
73 |
first_loop_state, args.page_min, args.page_max, estimated_time,
|
74 |
handwrite_signature_checkbox, textract_metadata, all_image_annotations,
|
75 |
all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
|
76 |
-
current_loop_page, page_break, args.pii_detector, comprehend_query_num
|
77 |
)
|
78 |
|
79 |
print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
|
|
|
37 |
parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
|
38 |
parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
|
39 |
parser.add_argument('--allow_list', help='Path to allow list CSV file')
|
40 |
+
parser.add_argument('--output_dir', default='output/', help='Output directory')
|
41 |
|
42 |
args = parser.parse_args()
|
43 |
|
|
|
73 |
first_loop_state, args.page_min, args.page_max, estimated_time,
|
74 |
handwrite_signature_checkbox, textract_metadata, all_image_annotations,
|
75 |
all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
|
76 |
+
current_loop_page, page_break, args.pii_detector, comprehend_query_num, args.output_dir
|
77 |
)
|
78 |
|
79 |
print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
|
tools/file_redaction.py
CHANGED
@@ -86,6 +86,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
86 |
page_break_return:bool=False,
|
87 |
pii_identification_method:str="Local",
|
88 |
comprehend_query_number:int=0,
|
|
|
89 |
progress=gr.Progress(track_tqdm=True)):
|
90 |
'''
|
91 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -116,6 +117,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
116 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
117 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
118 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
|
|
119 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
120 |
|
121 |
The function returns a redacted document along with processing logs.
|
@@ -216,6 +218,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
216 |
else:
|
217 |
textract_client = ""
|
218 |
|
|
|
|
|
|
|
|
|
|
|
219 |
progress(0.5, desc="Redacting file")
|
220 |
|
221 |
if isinstance(file_paths, str):
|
@@ -255,8 +262,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
255 |
|
256 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
257 |
|
258 |
-
|
259 |
-
|
260 |
#Analyse and redact image-based pdf or image
|
261 |
if is_pdf_or_image(file_path) == False:
|
262 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
|
|
86 |
page_break_return:bool=False,
|
87 |
pii_identification_method:str="Local",
|
88 |
comprehend_query_number:int=0,
|
89 |
+
output_folder:str=output_folder,
|
90 |
progress=gr.Progress(track_tqdm=True)):
|
91 |
'''
|
92 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
117 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
118 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
119 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
120 |
+
- output_folder (str, optional): Output folder for results.
|
121 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
122 |
|
123 |
The function returns a redacted document along with processing logs.
|
|
|
218 |
else:
|
219 |
textract_client = ""
|
220 |
|
221 |
+
# Check if output_folder exists, create it if it doesn't
|
222 |
+
if not os.path.exists(output_folder):
|
223 |
+
os.makedirs(output_folder)
|
224 |
+
|
225 |
+
|
226 |
progress(0.5, desc="Redacting file")
|
227 |
|
228 |
if isinstance(file_paths, str):
|
|
|
262 |
|
263 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
264 |
|
|
|
|
|
265 |
#Analyse and redact image-based pdf or image
|
266 |
if is_pdf_or_image(file_path) == False:
|
267 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|