Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / tools /cli_redact.py

Added option for running redact function through CLI (i.e. not going through Gradio UI or API). Test functions for running this through AWS Lambda.

e5dfae7 10 days ago

raw

history blame

4.71 kB

	import argparse
	import os
	from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
	from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
	from tools.file_redaction import choose_and_run_redactor
	import pandas as pd
	from datetime import datetime

	chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
	'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
	'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
	'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
	'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
	'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
	chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
	"STREETNAME", "UKPOSTCODE"]

	def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
	log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
	current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]):

	if output_file_list is None:
	output_file_list = []
	if log_files_list is None:
	log_files_list = []

	parser = argparse.ArgumentParser(description='Redact PII from documents via command line')

	# Required arguments
	parser.add_argument('--input_file', help='Path to input file (PDF, JPG, or PNG)')

	# Optional arguments with defaults matching the GUI app
	parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
	default='Quick image analysis', help='OCR method to use')
	parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
	default='Local', help='PII detection method')
	parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
	parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
	parser.add_argument('--allow_list', help='Path to allow list CSV file')
	parser.add_argument('--output_dir', default='output', help='Output directory')

	args = parser.parse_args()

	# Ensure output directory exists
	ensure_output_folder_exists()

	# Create file object similar to what Gradio provides
	file_obj = {"name": args.input_file}

	# Load allow list if provided
	allow_list_df = pd.DataFrame()
	if args.allow_list:
	allow_list_df = pd.read_csv(args.allow_list)

	# Get file names
	file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)

	# Initialize empty states for PDF processing

	# Prepare PDF/image
	output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations = prepare_image_or_pdf(
	file_obj, args.ocr_method, allow_list_df, latest_file_completed,
	output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
	)

	output_summary, output_files, output_file_list, latest_file_completed, log_files, \
	log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations, \
	current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table, \
	comprehend_query_num = choose_and_run_redactor(
	file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
	chosen_comprehend_entities, args.ocr_method, allow_list_df,
	latest_file_completed, output_summary, output_file_list, log_files_list,
	first_loop_state, args.page_min, args.page_max, estimated_time,
	handwrite_signature_checkbox, textract_metadata, all_image_annotations,
	all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
	current_loop_page, page_break, args.pii_detector, comprehend_query_num
	)

	print(f"\nRedaction complete. Output summary:\n{output_summary}")
	print(f"\nOutput files saved to: {args.output_dir}")

	if __name__ == "__main__":
	main()