Commit
·
42180e4
1
Parent(s):
dea568f
Fixed issues with log file list picking up logs from other file runs. Updated packages.
Browse files
- app.py +26 -25
- requirements.txt +13 -13
- tools/file_redaction.py +21 -104
- tools/redaction_review.py +2 -2
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import socket
|
3 |
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
-
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
@@ -65,7 +65,8 @@ with app:
|
|
65 |
###
|
66 |
# STATE VARIABLES
|
67 |
###
|
68 |
-
|
|
|
69 |
pdf_doc_state = gr.State([])
|
70 |
all_image_annotations_state = gr.State([])
|
71 |
|
@@ -73,12 +74,12 @@ with app:
|
|
73 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
75 |
|
76 |
-
session_hash_state = gr.State()
|
77 |
-
s3_output_folder_state = gr.State()
|
78 |
|
79 |
-
first_loop_state = gr.State(True)
|
80 |
-
second_loop_state = gr.State(False)
|
81 |
-
do_not_save_pdf_state = gr.State(False)
|
82 |
|
83 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
84 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
|
@@ -86,18 +87,18 @@ with app:
|
|
86 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
87 |
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
88 |
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
89 |
-
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=
|
90 |
|
91 |
|
92 |
# Logging state
|
93 |
log_file_name = 'log.csv'
|
94 |
|
95 |
-
feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
|
96 |
-
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
97 |
-
access_logs_state = gr.State(access_logs_folder + log_file_name)
|
98 |
-
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
99 |
-
usage_logs_state = gr.State(usage_logs_folder + log_file_name)
|
100 |
-
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
101 |
|
102 |
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
103 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
@@ -121,11 +122,11 @@ with app:
|
|
121 |
|
122 |
## Annotator zoom value
|
123 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
124 |
-
zoom_true_bool = gr.State(True)
|
125 |
-
zoom_false_bool = gr.State(False)
|
126 |
|
127 |
-
clear_all_page_redactions = gr.State(True)
|
128 |
-
prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
|
129 |
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
@@ -148,11 +149,11 @@ with app:
|
|
148 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
149 |
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
-
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
# Duplicate page detection
|
154 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
-
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="
|
156 |
|
157 |
|
158 |
|
@@ -178,8 +179,8 @@ with app:
|
|
178 |
with gr.Accordion("Redact document", open = True):
|
179 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
180 |
if RUN_AWS_FUNCTIONS == "1":
|
181 |
-
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
182 |
-
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per
|
183 |
else:
|
184 |
in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
|
185 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
|
@@ -336,7 +337,7 @@ with app:
|
|
336 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
337 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
338 |
|
339 |
-
with gr.Accordion("AWS Textract
|
340 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
341 |
#with gr.Row():
|
342 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
@@ -542,8 +543,8 @@ print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
|
|
542 |
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
543 |
print(f'The value of ROOT_PATH is {ROOT_PATH}')
|
544 |
|
545 |
-
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '
|
546 |
-
print(f'The value of
|
547 |
|
548 |
if __name__ == "__main__":
|
549 |
|
|
|
2 |
import socket
|
3 |
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
+
#os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
|
|
65 |
###
|
66 |
# STATE VARIABLES
|
67 |
###
|
68 |
+
|
69 |
+
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
70 |
pdf_doc_state = gr.State([])
|
71 |
all_image_annotations_state = gr.State([])
|
72 |
|
|
|
74 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
75 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
76 |
|
77 |
+
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
|
78 |
+
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
|
79 |
|
80 |
+
first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
|
81 |
+
second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
|
82 |
+
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
|
83 |
|
84 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
85 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
|
|
|
87 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
88 |
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
89 |
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
|
90 |
+
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=True) #gr.State([])
|
91 |
|
92 |
|
93 |
# Logging state
|
94 |
log_file_name = 'log.csv'
|
95 |
|
96 |
+
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
|
97 |
+
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
|
98 |
+
access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
|
99 |
+
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
|
100 |
+
usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
|
101 |
+
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
|
102 |
|
103 |
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
104 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
|
|
122 |
|
123 |
## Annotator zoom value
|
124 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
125 |
+
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
|
126 |
+
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
|
127 |
|
128 |
+
clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
|
129 |
+
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
130 |
|
131 |
## Settings page variables
|
132 |
default_allow_list_file_name = "default_allow_list.csv"
|
|
|
149 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
150 |
|
151 |
# Base dataframe for recognisers that is not modified subsequent to load
|
152 |
+
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
|
153 |
|
154 |
# Duplicate page detection
|
155 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
156 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
|
157 |
|
158 |
|
159 |
|
|
|
179 |
with gr.Accordion("Redact document", open = True):
|
180 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
181 |
if RUN_AWS_FUNCTIONS == "1":
|
182 |
+
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
183 |
+
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
184 |
else:
|
185 |
in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
|
186 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
|
|
|
337 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
338 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
339 |
|
340 |
+
with gr.Accordion("AWS Textract options", open = False):
|
341 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
342 |
#with gr.Row():
|
343 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
|
|
543 |
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
544 |
print(f'The value of ROOT_PATH is {ROOT_PATH}')
|
545 |
|
546 |
+
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
|
547 |
+
print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
|
548 |
|
549 |
if __name__ == "__main__":
|
550 |
|
requirements.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1 |
-
pdfminer.six==
|
2 |
pdf2image==1.17.0
|
3 |
-
pymupdf==1.
|
4 |
opencv-python==4.10.0.84
|
5 |
-
presidio_analyzer==2.2.
|
6 |
-
presidio_anonymizer==2.2.
|
7 |
-
presidio-image-redactor==0.0.
|
8 |
-
pikepdf==
|
9 |
pandas==2.2.3
|
10 |
nltk==3.9.1
|
11 |
-
scikit-learn==1.
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
-
gradio==5.
|
16 |
-
boto3==1.36.
|
17 |
-
pyarrow==
|
18 |
-
openpyxl==3.1.
|
19 |
-
Faker==
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
#gradio_image_annotation==0.2.5
|
@@ -24,7 +24,7 @@ spaczz==0.6.1
|
|
24 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
|
25 |
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
-
awslambdaric==3.0.
|
28 |
|
29 |
|
30 |
|
|
|
1 |
+
pdfminer.six==20240706
|
2 |
pdf2image==1.17.0
|
3 |
+
pymupdf==1.25.3
|
4 |
opencv-python==4.10.0.84
|
5 |
+
presidio_analyzer==2.2.357
|
6 |
+
presidio_anonymizer==2.2.357
|
7 |
+
presidio-image-redactor==0.0.55
|
8 |
+
pikepdf==9.5.2
|
9 |
pandas==2.2.3
|
10 |
nltk==3.9.1
|
11 |
+
scikit-learn==1.6.1
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
+
gradio==5.18.0
|
16 |
+
boto3==1.36.26
|
17 |
+
pyarrow==19.0.1
|
18 |
+
openpyxl==3.1.5
|
19 |
+
Faker==36.1.1
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
#gradio_image_annotation==0.2.5
|
|
|
24 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
|
25 |
rapidfuzz==3.12.1
|
26 |
numpy==1.26.4
|
27 |
+
awslambdaric==3.0.1
|
28 |
|
29 |
|
30 |
|
tools/file_redaction.py
CHANGED
@@ -78,9 +78,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
-
out_message:
|
82 |
-
out_file_paths:
|
83 |
-
log_files_output_paths:
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
@@ -301,9 +301,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
301 |
file_paths_list = file_paths
|
302 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
303 |
|
304 |
-
# print("file_paths_list in choose_redactor function:", file_paths_list)
|
305 |
-
|
306 |
-
|
307 |
for file in file_paths_loop:
|
308 |
if isinstance(file, str):
|
309 |
file_path = file
|
@@ -313,7 +310,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
313 |
if file_path:
|
314 |
pdf_file_name_without_ext = get_file_name_without_type(file_path)
|
315 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
316 |
-
# print("Redacting file:", pdf_file_name_with_ext)
|
317 |
|
318 |
is_a_pdf = is_pdf(file_path) == True
|
319 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
@@ -361,14 +357,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
361 |
custom_recogniser_word_list,
|
362 |
redact_whole_page_list,
|
363 |
max_fuzzy_spelling_mistakes_num,
|
364 |
-
match_fuzzy_whole_phrase_bool
|
365 |
-
|
366 |
-
|
367 |
-
#print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
368 |
-
|
369 |
# Save Textract request metadata (if exists)
|
370 |
if new_request_metadata:
|
371 |
-
#print("Request metadata:", new_request_metadata)
|
372 |
all_request_metadata.append(new_request_metadata)
|
373 |
|
374 |
elif in_redact_method == text_ocr_option:
|
@@ -422,9 +415,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
422 |
# Save file
|
423 |
if is_pdf(file_path) == False:
|
424 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
425 |
-
#pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
|
426 |
-
#print("pymupdf_doc", pymupdf_doc)
|
427 |
-
#print("pymupdf_doc[0]", pymupdf_doc[0])
|
428 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
429 |
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
430 |
|
@@ -434,10 +424,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
434 |
|
435 |
out_file_paths.append(out_redacted_pdf_file_path)
|
436 |
|
437 |
-
#if log_files_output_paths:
|
438 |
-
# log_files_output_paths.extend(log_files_output_paths)
|
439 |
-
|
440 |
-
|
441 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
442 |
|
443 |
logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
@@ -450,27 +436,20 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
450 |
|
451 |
# Save the gradio_annotation_boxes to a JSON file
|
452 |
try:
|
453 |
-
|
454 |
-
#print("Saving annotations to CSV")
|
455 |
-
|
456 |
-
# Convert json to csv and also save this
|
457 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
458 |
-
#print("all_decision_process_table:", all_decision_process_table)
|
459 |
-
|
460 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
461 |
|
462 |
out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
|
463 |
review_df.to_csv(out_review_file_path, index=None)
|
464 |
out_file_paths.append(out_review_file_path)
|
465 |
|
466 |
-
print("Saved review file to csv")
|
467 |
|
468 |
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
469 |
with open(out_annotation_file_path, 'w') as f:
|
470 |
json.dump(annotations_all_pages, f)
|
471 |
log_files_output_paths.append(out_annotation_file_path)
|
472 |
|
473 |
-
print("Saving annotations to JSON")
|
474 |
|
475 |
except Exception as e:
|
476 |
print("Could not save annotations to json or csv file:", e)
|
@@ -488,7 +467,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
488 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
489 |
|
490 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
491 |
-
#print("Estimated total processing time:", str(estimate_total_processing_time))
|
492 |
|
493 |
else:
|
494 |
toc = time.perf_counter()
|
@@ -511,18 +489,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
511 |
|
512 |
if combined_out_message: out_message = combined_out_message
|
513 |
|
514 |
-
#print("\nout_message at choose_and_run_redactor end is:", out_message)
|
515 |
-
|
516 |
# Ensure no duplicated output files
|
517 |
log_files_output_paths = list(set(log_files_output_paths))
|
518 |
out_file_paths = list(set(out_file_paths))
|
519 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
520 |
|
521 |
-
#print("log_files_output_paths:", log_files_output_paths)
|
522 |
-
#print("out_file_paths:", out_file_paths)
|
523 |
-
#print("review_out_file_paths:", review_out_file_paths)
|
524 |
-
|
525 |
-
|
526 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
527 |
|
528 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
@@ -646,9 +617,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="imag
|
|
646 |
# Unpack coordinates
|
647 |
x1, y1, x2, y2 = rect_coordinates
|
648 |
|
649 |
-
#print("scale_width:", scale_width)
|
650 |
-
#print("scale_height:", scale_height)
|
651 |
-
|
652 |
x1 = (x1* scale_width)# + page_x_adjust
|
653 |
new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
|
654 |
x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
|
@@ -1005,12 +973,10 @@ def redact_image_pdf(file_path:str,
|
|
1005 |
if custom_recogniser_word_list:
|
1006 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1007 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
1008 |
-
#print("new_custom_recogniser:", new_custom_recogniser)
|
1009 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1010 |
|
1011 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
1012 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1013 |
-
#print("new_custom_recogniser:", new_custom_recogniser)
|
1014 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1015 |
|
1016 |
|
@@ -1045,22 +1011,15 @@ def redact_image_pdf(file_path:str,
|
|
1045 |
else: page_min = page_min - 1
|
1046 |
|
1047 |
print("Page range:", str(page_min + 1), "to", str(page_max))
|
1048 |
-
#print("Current_loop_page:", current_loop_page)
|
1049 |
|
1050 |
# If running Textract, check if file already exists. If it does, load in existing data
|
1051 |
-
# Import results from json and convert
|
1052 |
if analysis_type == textract_option:
|
1053 |
|
1054 |
json_file_path = output_folder + file_name + "_textract.json"
|
1055 |
|
1056 |
-
|
1057 |
if not os.path.exists(json_file_path):
|
1058 |
print("No existing Textract results file found.")
|
1059 |
textract_data = {}
|
1060 |
-
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1061 |
-
#log_files_output_paths.append(json_file_path)
|
1062 |
-
#request_metadata = request_metadata + "\n" + new_request_metadata
|
1063 |
-
#wrapped_text_blocks = {"pages":[text_blocks]}
|
1064 |
else:
|
1065 |
# Open the file and load the JSON data
|
1066 |
no_textract_file = False
|
@@ -1073,7 +1032,6 @@ def redact_image_pdf(file_path:str,
|
|
1073 |
textract_data = json.load(json_file)
|
1074 |
|
1075 |
###
|
1076 |
-
|
1077 |
if current_loop_page == 0: page_loop_start = 0
|
1078 |
else: page_loop_start = current_loop_page
|
1079 |
|
@@ -1087,7 +1045,6 @@ def redact_image_pdf(file_path:str,
|
|
1087 |
page_break_return = False
|
1088 |
|
1089 |
reported_page_number = str(page_no + 1)
|
1090 |
-
#print("Redacting page:", reported_page_number)
|
1091 |
|
1092 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1093 |
try:
|
@@ -1104,7 +1061,6 @@ def redact_image_pdf(file_path:str,
|
|
1104 |
|
1105 |
#print("Image is in range of pages to redact")
|
1106 |
if isinstance(image, str):
|
1107 |
-
#print("image is a file path", image)
|
1108 |
image = Image.open(image)
|
1109 |
|
1110 |
# Need image size to convert textract OCR outputs to the correct sizes
|
@@ -1192,13 +1148,13 @@ def redact_image_pdf(file_path:str,
|
|
1192 |
redaction_bboxes = []
|
1193 |
|
1194 |
|
1195 |
-
if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
1196 |
-
elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
1197 |
|
1198 |
-
# Save decision making process
|
1199 |
-
bboxes_str = str(redaction_bboxes)
|
1200 |
-
with open(interim_results_file_path, "w") as f:
|
1201 |
-
|
1202 |
|
1203 |
# Merge close bounding boxes
|
1204 |
merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
|
@@ -1210,7 +1166,6 @@ def redact_image_pdf(file_path:str,
|
|
1210 |
all_image_annotations_boxes = []
|
1211 |
|
1212 |
for box in merged_redaction_bboxes:
|
1213 |
-
#print("box:", box)
|
1214 |
|
1215 |
x0 = box.left
|
1216 |
y0 = box.top
|
@@ -1238,8 +1193,6 @@ def redact_image_pdf(file_path:str,
|
|
1238 |
|
1239 |
## Apply annotations with pymupdf
|
1240 |
else:
|
1241 |
-
#print("merged_redaction_boxes:", merged_redaction_bboxes)
|
1242 |
-
#print("redact_whole_page_list:", redact_whole_page_list)
|
1243 |
if redact_whole_page_list:
|
1244 |
int_reported_page_number = int(reported_page_number)
|
1245 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
@@ -1284,8 +1237,6 @@ def redact_image_pdf(file_path:str,
|
|
1284 |
|
1285 |
time_taken = toc - tic
|
1286 |
|
1287 |
-
#print("toc - tic:", time_taken)
|
1288 |
-
|
1289 |
# Break if time taken is greater than max_time seconds
|
1290 |
if time_taken > max_time:
|
1291 |
print("Processing for", max_time, "seconds, breaking loop.")
|
@@ -1298,7 +1249,6 @@ def redact_image_pdf(file_path:str,
|
|
1298 |
pymupdf_doc = images
|
1299 |
|
1300 |
# Check if the image already exists in annotations_all_pages
|
1301 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
1302 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1303 |
if existing_index is not None:
|
1304 |
# Replace the existing annotation
|
@@ -1315,6 +1265,8 @@ def redact_image_pdf(file_path:str,
|
|
1315 |
if json_file_path not in log_files_output_paths:
|
1316 |
log_files_output_paths.append(json_file_path)
|
1317 |
|
|
|
|
|
1318 |
current_loop_page += 1
|
1319 |
|
1320 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
@@ -1324,7 +1276,6 @@ def redact_image_pdf(file_path:str,
|
|
1324 |
pymupdf_doc = images
|
1325 |
|
1326 |
# Check if the image already exists in annotations_all_pages
|
1327 |
-
#print("annotations_all_pages:", annotations_all_pages)
|
1328 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1329 |
if existing_index is not None:
|
1330 |
# Replace the existing annotation
|
@@ -1409,9 +1360,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1409 |
|
1410 |
if isinstance(char, LTAnno):
|
1411 |
|
1412 |
-
# print("Character line:", "".join(character_text_objects_out))
|
1413 |
-
# print("Char is an annotation object:", char)
|
1414 |
-
|
1415 |
added_text = char.get_text()
|
1416 |
|
1417 |
# Handle double quotes
|
@@ -1427,7 +1375,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1427 |
|
1428 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1429 |
if '\n' in added_text:
|
1430 |
-
|
1431 |
# Finalize the current line
|
1432 |
if current_word:
|
1433 |
word_bboxes.append((current_word, current_word_bbox))
|
@@ -1475,13 +1423,12 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1475 |
word_bboxes.append((current_word, current_word_bbox))
|
1476 |
|
1477 |
if full_text:
|
1478 |
-
#print("full_text before:", full_text)
|
1479 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1480 |
# Convert special characters to a human-readable format
|
1481 |
-
|
1482 |
full_text = clean_unicode_text(full_text)
|
1483 |
full_text = full_text.strip()
|
1484 |
-
|
1485 |
|
1486 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
|
1487 |
|
@@ -1498,9 +1445,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1498 |
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
|
1499 |
|
1500 |
# Remove brackets and split the string into four separate columns
|
1501 |
-
#print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
|
1502 |
-
# analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
|
1503 |
-
|
1504 |
# Split the boundingBox list into four separate columns
|
1505 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1506 |
|
@@ -1512,8 +1456,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1512 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1513 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1514 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1515 |
-
|
1516 |
-
#print('\n\ndecision_process_table:\n\n', decision_process_table)
|
1517 |
|
1518 |
return decision_process_table
|
1519 |
|
@@ -1607,7 +1549,6 @@ def redact_text_pdf(
|
|
1607 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1608 |
|
1609 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1610 |
-
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1611 |
if custom_recogniser_word_list:
|
1612 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1613 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
@@ -1617,16 +1558,6 @@ def redact_text_pdf(
|
|
1617 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1618 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1619 |
|
1620 |
-
# List all elements currently in the nlp_analyser registry
|
1621 |
-
#print("Current recognizers in nlp_analyser registry:")
|
1622 |
-
#for recognizer_name in nlp_analyser.registry.recognizers:
|
1623 |
-
#print(recognizer_name)
|
1624 |
-
#print(recognizer_name.name)
|
1625 |
-
|
1626 |
-
#print("Custom recogniser:", nlp_analyser.registry)
|
1627 |
-
|
1628 |
-
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1629 |
-
|
1630 |
tic = time.perf_counter()
|
1631 |
|
1632 |
# Open with Pikepdf to get text lines
|
@@ -1641,7 +1572,6 @@ def redact_text_pdf(
|
|
1641 |
else: page_min = page_min - 1
|
1642 |
|
1643 |
print("Page range is",str(page_min + 1), "to", str(page_max))
|
1644 |
-
print("Current_loop_page:", current_loop_page)
|
1645 |
|
1646 |
if current_loop_page == 0: page_loop_start = 0
|
1647 |
else: page_loop_start = current_loop_page
|
@@ -1716,8 +1646,6 @@ def redact_text_pdf(
|
|
1716 |
### REDACTION
|
1717 |
|
1718 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1719 |
-
#print("Identifying redactions on page.")
|
1720 |
-
|
1721 |
page_analysed_bounding_boxes = run_page_text_redaction(
|
1722 |
language,
|
1723 |
chosen_redact_entities,
|
@@ -1735,24 +1663,18 @@ def redact_text_pdf(
|
|
1735 |
comprehend_query_number
|
1736 |
)
|
1737 |
|
1738 |
-
|
1739 |
-
#print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
|
1740 |
-
#print("image:", image)
|
1741 |
else:
|
1742 |
page_analysed_bounding_boxes = []
|
1743 |
|
1744 |
|
1745 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1746 |
|
1747 |
-
#print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
|
1748 |
|
1749 |
# Annotate redactions on page
|
1750 |
pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
1751 |
|
1752 |
-
# print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
|
1753 |
-
|
1754 |
# Make pymupdf page redactions
|
1755 |
-
#print("redact_whole_page_list:", redact_whole_page_list)
|
1756 |
if redact_whole_page_list:
|
1757 |
int_reported_page_number = int(reported_page_number)
|
1758 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
@@ -1761,9 +1683,6 @@ def redact_text_pdf(
|
|
1761 |
|
1762 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
|
1763 |
|
1764 |
-
#print("image_annotations:", image_annotations)
|
1765 |
-
|
1766 |
-
#print("Did redact_page_with_pymupdf function")
|
1767 |
reported_page_no = page_no + 1
|
1768 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1769 |
|
@@ -1778,14 +1697,12 @@ def redact_text_pdf(
|
|
1778 |
|
1779 |
if not decision_process_table_on_page.empty:
|
1780 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
|
1781 |
-
|
1782 |
|
1783 |
toc = time.perf_counter()
|
1784 |
|
1785 |
time_taken = toc - tic
|
1786 |
|
1787 |
-
#print("toc - tic:", time_taken)
|
1788 |
-
|
1789 |
# Break if time taken is greater than max_time seconds
|
1790 |
if time_taken > max_time:
|
1791 |
print("Processing for", max_time, "seconds, breaking.")
|
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
+
out_message:List=[],
|
82 |
+
out_file_paths:List=[],
|
83 |
+
log_files_output_paths:List=[],
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
|
|
301 |
file_paths_list = file_paths
|
302 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
303 |
|
|
|
|
|
|
|
304 |
for file in file_paths_loop:
|
305 |
if isinstance(file, str):
|
306 |
file_path = file
|
|
|
310 |
if file_path:
|
311 |
pdf_file_name_without_ext = get_file_name_without_type(file_path)
|
312 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
|
|
313 |
|
314 |
is_a_pdf = is_pdf(file_path) == True
|
315 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
|
|
357 |
custom_recogniser_word_list,
|
358 |
redact_whole_page_list,
|
359 |
max_fuzzy_spelling_mistakes_num,
|
360 |
+
match_fuzzy_whole_phrase_bool,
|
361 |
+
log_files_output_paths=log_files_output_paths)
|
362 |
+
|
|
|
|
|
363 |
# Save Textract request metadata (if exists)
|
364 |
if new_request_metadata:
|
|
|
365 |
all_request_metadata.append(new_request_metadata)
|
366 |
|
367 |
elif in_redact_method == text_ocr_option:
|
|
|
415 |
# Save file
|
416 |
if is_pdf(file_path) == False:
|
417 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
|
|
|
|
|
|
418 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
419 |
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
420 |
|
|
|
424 |
|
425 |
out_file_paths.append(out_redacted_pdf_file_path)
|
426 |
|
|
|
|
|
|
|
|
|
427 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
428 |
|
429 |
logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
|
|
436 |
|
437 |
# Save the gradio_annotation_boxes to a JSON file
|
438 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
440 |
|
441 |
out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
|
442 |
review_df.to_csv(out_review_file_path, index=None)
|
443 |
out_file_paths.append(out_review_file_path)
|
444 |
|
445 |
+
#print("Saved review file to csv")
|
446 |
|
447 |
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
448 |
with open(out_annotation_file_path, 'w') as f:
|
449 |
json.dump(annotations_all_pages, f)
|
450 |
log_files_output_paths.append(out_annotation_file_path)
|
451 |
|
452 |
+
#print("Saving annotations to JSON")
|
453 |
|
454 |
except Exception as e:
|
455 |
print("Could not save annotations to json or csv file:", e)
|
|
|
467 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
468 |
|
469 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
|
|
470 |
|
471 |
else:
|
472 |
toc = time.perf_counter()
|
|
|
489 |
|
490 |
if combined_out_message: out_message = combined_out_message
|
491 |
|
|
|
|
|
492 |
# Ensure no duplicated output files
|
493 |
log_files_output_paths = list(set(log_files_output_paths))
|
494 |
out_file_paths = list(set(out_file_paths))
|
495 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
496 |
|
|
|
|
|
|
|
|
|
|
|
497 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
498 |
|
499 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
|
|
617 |
# Unpack coordinates
|
618 |
x1, y1, x2, y2 = rect_coordinates
|
619 |
|
|
|
|
|
|
|
620 |
x1 = (x1* scale_width)# + page_x_adjust
|
621 |
new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
|
622 |
x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
|
|
|
973 |
if custom_recogniser_word_list:
|
974 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
975 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
976 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
977 |
|
978 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
979 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
|
|
980 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
981 |
|
982 |
|
|
|
1011 |
else: page_min = page_min - 1
|
1012 |
|
1013 |
print("Page range:", str(page_min + 1), "to", str(page_max))
|
|
|
1014 |
|
1015 |
# If running Textract, check if file already exists. If it does, load in existing data
|
|
|
1016 |
if analysis_type == textract_option:
|
1017 |
|
1018 |
json_file_path = output_folder + file_name + "_textract.json"
|
1019 |
|
|
|
1020 |
if not os.path.exists(json_file_path):
|
1021 |
print("No existing Textract results file found.")
|
1022 |
textract_data = {}
|
|
|
|
|
|
|
|
|
1023 |
else:
|
1024 |
# Open the file and load the JSON data
|
1025 |
no_textract_file = False
|
|
|
1032 |
textract_data = json.load(json_file)
|
1033 |
|
1034 |
###
|
|
|
1035 |
if current_loop_page == 0: page_loop_start = 0
|
1036 |
else: page_loop_start = current_loop_page
|
1037 |
|
|
|
1045 |
page_break_return = False
|
1046 |
|
1047 |
reported_page_number = str(page_no + 1)
|
|
|
1048 |
|
1049 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1050 |
try:
|
|
|
1061 |
|
1062 |
#print("Image is in range of pages to redact")
|
1063 |
if isinstance(image, str):
|
|
|
1064 |
image = Image.open(image)
|
1065 |
|
1066 |
# Need image size to convert textract OCR outputs to the correct sizes
|
|
|
1148 |
redaction_bboxes = []
|
1149 |
|
1150 |
|
1151 |
+
# if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
1152 |
+
# elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
1153 |
|
1154 |
+
# # Save decision making process
|
1155 |
+
# bboxes_str = str(redaction_bboxes)
|
1156 |
+
# with open(interim_results_file_path, "w") as f:
|
1157 |
+
# f.write(bboxes_str)
|
1158 |
|
1159 |
# Merge close bounding boxes
|
1160 |
merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
|
|
|
1166 |
all_image_annotations_boxes = []
|
1167 |
|
1168 |
for box in merged_redaction_bboxes:
|
|
|
1169 |
|
1170 |
x0 = box.left
|
1171 |
y0 = box.top
|
|
|
1193 |
|
1194 |
## Apply annotations with pymupdf
|
1195 |
else:
|
|
|
|
|
1196 |
if redact_whole_page_list:
|
1197 |
int_reported_page_number = int(reported_page_number)
|
1198 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
|
|
1237 |
|
1238 |
time_taken = toc - tic
|
1239 |
|
|
|
|
|
1240 |
# Break if time taken is greater than max_time seconds
|
1241 |
if time_taken > max_time:
|
1242 |
print("Processing for", max_time, "seconds, breaking loop.")
|
|
|
1249 |
pymupdf_doc = images
|
1250 |
|
1251 |
# Check if the image already exists in annotations_all_pages
|
|
|
1252 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1253 |
if existing_index is not None:
|
1254 |
# Replace the existing annotation
|
|
|
1265 |
if json_file_path not in log_files_output_paths:
|
1266 |
log_files_output_paths.append(json_file_path)
|
1267 |
|
1268 |
+
print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
|
1269 |
+
|
1270 |
current_loop_page += 1
|
1271 |
|
1272 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
|
|
1276 |
pymupdf_doc = images
|
1277 |
|
1278 |
# Check if the image already exists in annotations_all_pages
|
|
|
1279 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1280 |
if existing_index is not None:
|
1281 |
# Replace the existing annotation
|
|
|
1360 |
|
1361 |
if isinstance(char, LTAnno):
|
1362 |
|
|
|
|
|
|
|
1363 |
added_text = char.get_text()
|
1364 |
|
1365 |
# Handle double quotes
|
|
|
1375 |
|
1376 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1377 |
if '\n' in added_text:
|
1378 |
+
|
1379 |
# Finalize the current line
|
1380 |
if current_word:
|
1381 |
word_bboxes.append((current_word, current_word_bbox))
|
|
|
1423 |
word_bboxes.append((current_word, current_word_bbox))
|
1424 |
|
1425 |
if full_text:
|
|
|
1426 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1427 |
# Convert special characters to a human-readable format
|
1428 |
+
|
1429 |
full_text = clean_unicode_text(full_text)
|
1430 |
full_text = full_text.strip()
|
1431 |
+
|
1432 |
|
1433 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
|
1434 |
|
|
|
1445 |
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
|
1446 |
|
1447 |
# Remove brackets and split the string into four separate columns
|
|
|
|
|
|
|
1448 |
# Split the boundingBox list into four separate columns
|
1449 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1450 |
|
|
|
1456 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1457 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1458 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
|
|
|
|
1459 |
|
1460 |
return decision_process_table
|
1461 |
|
|
|
1549 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1550 |
|
1551 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
|
|
1552 |
if custom_recogniser_word_list:
|
1553 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1554 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
1558 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1559 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1560 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1561 |
tic = time.perf_counter()
|
1562 |
|
1563 |
# Open with Pikepdf to get text lines
|
|
|
1572 |
else: page_min = page_min - 1
|
1573 |
|
1574 |
print("Page range is",str(page_min + 1), "to", str(page_max))
|
|
|
1575 |
|
1576 |
if current_loop_page == 0: page_loop_start = 0
|
1577 |
else: page_loop_start = current_loop_page
|
|
|
1646 |
### REDACTION
|
1647 |
|
1648 |
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
|
|
|
|
1649 |
page_analysed_bounding_boxes = run_page_text_redaction(
|
1650 |
language,
|
1651 |
chosen_redact_entities,
|
|
|
1663 |
comprehend_query_number
|
1664 |
)
|
1665 |
|
1666 |
+
|
|
|
|
|
1667 |
else:
|
1668 |
page_analysed_bounding_boxes = []
|
1669 |
|
1670 |
|
1671 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1672 |
|
|
|
1673 |
|
1674 |
# Annotate redactions on page
|
1675 |
pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
1676 |
|
|
|
|
|
1677 |
# Make pymupdf page redactions
|
|
|
1678 |
if redact_whole_page_list:
|
1679 |
int_reported_page_number = int(reported_page_number)
|
1680 |
if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
|
|
|
1683 |
|
1684 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
|
1685 |
|
|
|
|
|
|
|
1686 |
reported_page_no = page_no + 1
|
1687 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1688 |
|
|
|
1697 |
|
1698 |
if not decision_process_table_on_page.empty:
|
1699 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
|
1700 |
+
|
1701 |
|
1702 |
toc = time.perf_counter()
|
1703 |
|
1704 |
time_taken = toc - tic
|
1705 |
|
|
|
|
|
1706 |
# Break if time taken is greater than max_time seconds
|
1707 |
if time_taken > max_time:
|
1708 |
print("Processing for", max_time, "seconds, breaking.")
|
tools/redaction_review.py
CHANGED
@@ -396,7 +396,7 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
396 |
row_value_page = evt.row_value[0] # This is the page number value
|
397 |
return row_value_page
|
398 |
|
399 |
-
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
|
400 |
'''
|
401 |
Converts coordinates from image space to Adobe PDF space.
|
402 |
|
@@ -431,7 +431,7 @@ def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width,
|
|
431 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
432 |
|
433 |
|
434 |
-
def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
435 |
'''
|
436 |
Create an xfdf file from a review csv file and a pdf
|
437 |
'''
|
|
|
396 |
row_value_page = evt.row_value[0] # This is the page number value
|
397 |
return row_value_page
|
398 |
|
399 |
+
def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
|
400 |
'''
|
401 |
Converts coordinates from image space to Adobe PDF space.
|
402 |
|
|
|
431 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
432 |
|
433 |
|
434 |
+
def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
|
435 |
'''
|
436 |
Create an xfdf file from a review csv file and a pdf
|
437 |
'''
|