Commit
·
8235bbb
1
Parent(s):
f0f9378
Improved logging
Browse files- README.md +4 -2
- app.py +45 -47
- tools/custom_image_analyser_engine.py +4 -1
- tools/file_redaction.py +43 -29
- tools/helper_functions.py +54 -55
README.md
CHANGED
@@ -11,13 +11,15 @@ license: agpl-3.0
|
|
11 |
|
12 |
# Document redaction
|
13 |
|
14 |
-
Redact
|
|
|
|
|
15 |
|
16 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
17 |
|
18 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
19 |
|
20 |
-
This app accepts a maximum file size of
|
21 |
|
22 |
# USER GUIDE
|
23 |
|
|
|
11 |
|
12 |
# Document redaction
|
13 |
|
14 |
+
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
15 |
+
|
16 |
+
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
17 |
|
18 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
19 |
|
20 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
21 |
|
22 |
+
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
23 |
|
24 |
# USER GUIDE
|
25 |
|
app.py
CHANGED
@@ -4,20 +4,20 @@ import socket
|
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
|
|
|
|
|
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
|
10 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
11 |
from tools.file_redaction import choose_and_run_redactor
|
12 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
13 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
14 |
from tools.data_anonymise import anonymise_data_files
|
15 |
from tools.auth import authenticate_user
|
16 |
-
#from tools.aws_functions import load_data_from_aws
|
17 |
-
import gradio as gr
|
18 |
-
import pandas as pd
|
19 |
|
20 |
-
|
21 |
today_rev = datetime.now().strftime("%Y%m%d")
|
22 |
|
23 |
add_folder_to_path("tesseract/")
|
@@ -36,12 +36,10 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
|
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
39 |
-
|
40 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
41 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
42 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
43 |
|
44 |
-
|
45 |
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
46 |
tesseract_ocr_option = "Quick image analysis - typed text"
|
47 |
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
@@ -70,10 +68,6 @@ with app:
|
|
70 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
71 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
72 |
|
73 |
-
def reset_state_vars():
|
74 |
-
return [], [], pd.DataFrame(), pd.DataFrame()
|
75 |
-
|
76 |
-
|
77 |
in_allow_list_state = gr.State(pd.DataFrame())
|
78 |
|
79 |
session_hash_state = gr.State()
|
@@ -88,25 +82,32 @@ with app:
|
|
88 |
output_image_files_state = gr.State([])
|
89 |
output_file_list_state = gr.State([])
|
90 |
text_output_file_list_state = gr.State([])
|
91 |
-
log_files_output_list_state = gr.State([])
|
92 |
|
|
|
93 |
# Logging state
|
94 |
-
|
|
|
|
|
95 |
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
96 |
-
access_logs_state = gr.State(access_logs_folder +
|
97 |
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
98 |
-
usage_logs_state = gr.State(usage_logs_folder +
|
99 |
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
-
# Invisible elements effectively used as state variables
|
102 |
-
session_hash_textbox = gr.Textbox(value="", visible=False) # Invisible text box to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
103 |
-
textract_metadata_textbox = gr.Textbox(value="", visible=False)
|
104 |
-
doc_file_name_textbox = gr.Textbox(value="", visible=False)
|
105 |
-
doc_file_name_with_extension_textbox = gr.Textbox(value="", visible=False)
|
106 |
-
data_file_name_textbox = gr.Textbox(value="", visible=False)
|
107 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
108 |
-
estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
109 |
-
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
110 |
|
111 |
|
112 |
###
|
@@ -114,17 +115,17 @@ with app:
|
|
114 |
###
|
115 |
|
116 |
gr.Markdown(
|
117 |
-
"""
|
118 |
-
# Document redaction
|
119 |
|
120 |
-
Redact
|
|
|
|
|
121 |
|
122 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
123 |
|
124 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
125 |
|
126 |
-
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
127 |
-
""")
|
128 |
|
129 |
# PDF / IMAGES TAB
|
130 |
with gr.Tab("PDFs/images"):
|
@@ -148,7 +149,7 @@ with app:
|
|
148 |
|
149 |
# Feedback elements are invisible until revealed by redaction action
|
150 |
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
151 |
-
pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
|
152 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
153 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
154 |
|
@@ -226,9 +227,6 @@ with app:
|
|
226 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
227 |
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
233 |
with gr.Row():
|
234 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
@@ -257,15 +255,15 @@ with app:
|
|
257 |
###
|
258 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
|
259 |
|
260 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
|
261 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
262 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
263 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
|
264 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
265 |
|
266 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
267 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
|
268 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
|
269 |
|
270 |
# If a file has been completed, the function will continue onto the next document
|
271 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
|
@@ -321,27 +319,27 @@ with app:
|
|
321 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
322 |
|
323 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
324 |
-
access_callback = gr.CSVLogger()
|
325 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
326 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
327 |
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
328 |
|
329 |
# User submitted feedback for pdf redactions
|
330 |
-
pdf_callback = gr.CSVLogger()
|
331 |
-
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text,
|
332 |
-
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text,
|
333 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
334 |
|
335 |
# User submitted feedback for data redactions
|
336 |
-
data_callback = gr.CSVLogger()
|
337 |
-
data_callback.setup([data_feedback_radio, data_further_details_text,
|
338 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text,
|
339 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
340 |
|
341 |
# Log processing time/token usage when making a query
|
342 |
-
usage_callback = gr.CSVLogger()
|
343 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
|
344 |
-
estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
|
345 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
346 |
|
347 |
# Launch the Gradio app
|
|
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
6 |
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
from datetime import datetime
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
|
12 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
|
13 |
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
17 |
from tools.data_anonymise import anonymise_data_files
|
18 |
from tools.auth import authenticate_user
|
|
|
|
|
|
|
19 |
|
20 |
+
|
21 |
today_rev = datetime.now().strftime("%Y%m%d")
|
22 |
|
23 |
add_folder_to_path("tesseract/")
|
|
|
36 |
language = 'en'
|
37 |
|
38 |
host_name = socket.gethostname()
|
|
|
39 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
40 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
41 |
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
42 |
|
|
|
43 |
text_ocr_option = "Simple text analysis - PDFs with selectable text"
|
44 |
tesseract_ocr_option = "Quick image analysis - typed text"
|
45 |
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
|
|
|
68 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
69 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
70 |
|
|
|
|
|
|
|
|
|
71 |
in_allow_list_state = gr.State(pd.DataFrame())
|
72 |
|
73 |
session_hash_state = gr.State()
|
|
|
82 |
output_image_files_state = gr.State([])
|
83 |
output_file_list_state = gr.State([])
|
84 |
text_output_file_list_state = gr.State([])
|
85 |
+
log_files_output_list_state = gr.State([])
|
86 |
|
87 |
+
|
88 |
# Logging state
|
89 |
+
log_file_name = 'log.csv'
|
90 |
+
|
91 |
+
feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
|
92 |
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
93 |
+
access_logs_state = gr.State(access_logs_folder + log_file_name)
|
94 |
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
95 |
+
usage_logs_state = gr.State(usage_logs_folder + log_file_name)
|
96 |
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
97 |
+
|
98 |
+
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
99 |
+
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
100 |
+
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
|
101 |
+
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
|
102 |
+
|
103 |
+
doc_file_name_textbox = gr.Textbox(label = "doc_file_name_textbox", value="", visible=False)
|
104 |
+
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
105 |
+
data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
|
106 |
+
|
107 |
+
estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
108 |
+
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
|
|
|
|
111 |
|
112 |
|
113 |
###
|
|
|
115 |
###
|
116 |
|
117 |
gr.Markdown(
|
118 |
+
"""# Document redaction
|
|
|
119 |
|
120 |
+
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
121 |
+
|
122 |
+
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
123 |
|
124 |
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
125 |
|
126 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
127 |
|
128 |
+
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.""")
|
|
|
129 |
|
130 |
# PDF / IMAGES TAB
|
131 |
with gr.Tab("PDFs/images"):
|
|
|
149 |
|
150 |
# Feedback elements are invisible until revealed by redaction action
|
151 |
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
152 |
+
pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
|
153 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
154 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
155 |
|
|
|
227 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
228 |
|
229 |
|
|
|
|
|
|
|
230 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
231 |
with gr.Row():
|
232 |
in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
|
|
|
255 |
###
|
256 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
|
257 |
|
258 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
|
259 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
|
260 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
261 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")#.\
|
262 |
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
|
263 |
|
264 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
265 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
266 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
|
267 |
|
268 |
# If a file has been completed, the function will continue onto the next document
|
269 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
|
|
|
319 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
320 |
|
321 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
322 |
+
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
323 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
324 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
325 |
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
326 |
|
327 |
# User submitted feedback for pdf redactions
|
328 |
+
pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
329 |
+
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], feedback_logs_folder)
|
330 |
+
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], None, preprocess=False).\
|
331 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
332 |
|
333 |
# User submitted feedback for data redactions
|
334 |
+
data_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
335 |
+
data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
|
336 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
|
337 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
338 |
|
339 |
# Log processing time/token usage when making a query
|
340 |
+
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
341 |
+
usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
|
342 |
+
estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
343 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
344 |
|
345 |
# Launch the Gradio app
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -471,6 +471,7 @@ class CustomImageAnalyzerEngine:
|
|
471 |
|
472 |
horizontal_buffer = 0 # add pixels to right of width
|
473 |
height_buffer = 2 # add pixels to bounding box height
|
|
|
474 |
|
475 |
allow_list = text_analyzer_kwargs.get('allow_list', [])
|
476 |
|
@@ -494,6 +495,8 @@ class CustomImageAnalyzerEngine:
|
|
494 |
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
495 |
)
|
496 |
|
|
|
|
|
497 |
for result in response["Entities"]:
|
498 |
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
499 |
|
@@ -577,7 +580,7 @@ class CustomImageAnalyzerEngine:
|
|
577 |
|
578 |
combined_results.extend(line_results)
|
579 |
|
580 |
-
return combined_results
|
581 |
|
582 |
@staticmethod
|
583 |
def map_analyzer_results_to_bounding_boxes(
|
|
|
471 |
|
472 |
horizontal_buffer = 0 # add pixels to right of width
|
473 |
height_buffer = 2 # add pixels to bounding box height
|
474 |
+
comprehend_query_number = 0
|
475 |
|
476 |
allow_list = text_analyzer_kwargs.get('allow_list', [])
|
477 |
|
|
|
495 |
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
496 |
)
|
497 |
|
498 |
+
comprehend_query_number += 1
|
499 |
+
|
500 |
for result in response["Entities"]:
|
501 |
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
502 |
|
|
|
580 |
|
581 |
combined_results.extend(line_results)
|
582 |
|
583 |
+
return combined_results, comprehend_query_number
|
584 |
|
585 |
@staticmethod
|
586 |
def map_analyzer_results_to_bounding_boxes(
|
tools/file_redaction.py
CHANGED
@@ -91,6 +91,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
pii_identification_method:str="Local",
|
|
|
94 |
progress=gr.Progress(track_tqdm=True)):
|
95 |
'''
|
96 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -120,6 +121,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
120 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
121 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
122 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
123 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
124 |
|
125 |
The function returns a redacted document along with processing logs.
|
@@ -171,7 +173,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
171 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
172 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
173 |
|
174 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
175 |
|
176 |
# If we have reached the last page, return message
|
177 |
if current_loop_page >= number_of_pages:
|
@@ -181,7 +183,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
181 |
current_loop_page = 999
|
182 |
combined_out_message = out_message
|
183 |
|
184 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
185 |
|
186 |
# Create allow list
|
187 |
if not in_allow_list.empty:
|
@@ -220,7 +222,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
220 |
out_message = "No file selected"
|
221 |
print(out_message)
|
222 |
|
223 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
224 |
|
225 |
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
|
226 |
|
@@ -231,16 +233,16 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
231 |
except:
|
232 |
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
233 |
print(out_message)
|
234 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
|
235 |
|
236 |
#Analyse and redact image-based pdf or image
|
237 |
if is_pdf_or_image(file_path) == False:
|
238 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
239 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
|
240 |
|
241 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
242 |
|
243 |
-
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
|
244 |
prepared_pdf_image_paths,
|
245 |
language,
|
246 |
chosen_redact_entities,
|
@@ -259,7 +261,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
259 |
all_line_level_ocr_results_df,
|
260 |
all_decision_process_table,
|
261 |
pymupdf_doc,
|
262 |
-
pii_identification_method
|
|
|
263 |
|
264 |
# Save Textract request metadata (if exists)
|
265 |
if new_request_metadata:
|
@@ -272,12 +275,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
272 |
|
273 |
if is_pdf(file_path) == False:
|
274 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
275 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
276 |
|
277 |
# Analyse text-based pdf
|
278 |
print('Redacting file as text-based PDF')
|
279 |
|
280 |
-
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
|
281 |
prepared_pdf_image_paths,language,
|
282 |
chosen_redact_entities,
|
283 |
chosen_redact_comprehend_entities,
|
@@ -291,12 +294,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
291 |
all_line_level_ocr_results_df,
|
292 |
all_decision_process_table,
|
293 |
pymupdf_doc,
|
294 |
-
pii_identification_method
|
|
|
295 |
|
296 |
else:
|
297 |
out_message = "No redaction method selected"
|
298 |
print(out_message)
|
299 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
300 |
|
301 |
# If at last page, save to file
|
302 |
if current_loop_page >= number_of_pages:
|
@@ -392,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
392 |
|
393 |
# If textract requests made, write to logging file
|
394 |
if all_request_metadata:
|
395 |
-
all_request_metadata_str = '\n'.join(all_request_metadata)
|
396 |
|
397 |
all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
|
398 |
|
@@ -412,7 +416,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
412 |
out_file_paths = list(set(out_file_paths))
|
413 |
|
414 |
|
415 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
|
416 |
|
417 |
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
418 |
'''
|
@@ -769,9 +773,10 @@ def redact_image_pdf(file_path:str,
|
|
769 |
all_decision_process_table = pd.DataFrame(),
|
770 |
pymupdf_doc = [],
|
771 |
pii_identification_method:str="Local",
|
|
|
772 |
page_break_val:int=int(page_break_value),
|
773 |
logging_file_paths:List=[],
|
774 |
-
max_time:int=int(max_time_value),
|
775 |
progress=Progress(track_tqdm=True)):
|
776 |
|
777 |
'''
|
@@ -796,9 +801,10 @@ def redact_image_pdf(file_path:str,
|
|
796 |
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
797 |
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
798 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
799 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
800 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
801 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
802 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
803 |
|
804 |
The function returns a fully or partially-redacted PDF document.
|
@@ -806,6 +812,7 @@ def redact_image_pdf(file_path:str,
|
|
806 |
file_name = get_file_path_end(file_path)
|
807 |
fill = (0, 0, 0) # Fill colour
|
808 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
|
|
809 |
|
810 |
#print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
|
811 |
|
@@ -836,7 +843,6 @@ def redact_image_pdf(file_path:str,
|
|
836 |
if current_loop_page == 0: page_loop_start = 0
|
837 |
else: page_loop_start = current_loop_page
|
838 |
|
839 |
-
#progress_bar = progress.tqdm(range(page_loop_start, number_of_pages), unit="pages", desc="Redacting pages")
|
840 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
841 |
|
842 |
for page_no in progress_bar:
|
@@ -872,8 +878,7 @@ def redact_image_pdf(file_path:str,
|
|
872 |
page_width, page_height = image.size
|
873 |
|
874 |
# Possibility to use different languages
|
875 |
-
if language == 'en':
|
876 |
-
ocr_lang = 'eng'
|
877 |
else: ocr_lang = language
|
878 |
|
879 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
@@ -943,7 +948,7 @@ def redact_image_pdf(file_path:str,
|
|
943 |
|
944 |
pii_identification_method= "AWS Comprehend" #"Local"
|
945 |
|
946 |
-
redaction_bboxes = image_analyser.analyze_text(
|
947 |
line_level_ocr_results,
|
948 |
line_level_ocr_results_with_children,
|
949 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
@@ -954,6 +959,8 @@ def redact_image_pdf(file_path:str,
|
|
954 |
score_threshold=score_threshold
|
955 |
)
|
956 |
|
|
|
|
|
957 |
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
958 |
# line_level_ocr_results_with_children,
|
959 |
# language,
|
@@ -1063,7 +1070,7 @@ def redact_image_pdf(file_path:str,
|
|
1063 |
|
1064 |
current_loop_page += 1
|
1065 |
|
1066 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1067 |
|
1068 |
if is_pdf(file_path) == False:
|
1069 |
images.append(image)
|
@@ -1079,9 +1086,9 @@ def redact_image_pdf(file_path:str,
|
|
1079 |
progress.close(_tqdm=progress_bar)
|
1080 |
tqdm._instances.clear()
|
1081 |
|
1082 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1083 |
|
1084 |
-
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
|
1085 |
|
1086 |
|
1087 |
###
|
@@ -1299,7 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1299 |
'''
|
1300 |
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
|
1301 |
'''
|
1302 |
-
|
1303 |
analyser_results = []
|
1304 |
|
1305 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
@@ -1323,6 +1330,8 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1323 |
LanguageCode=language # Specify the language of the text
|
1324 |
)
|
1325 |
|
|
|
|
|
1326 |
for result in response["Entities"]:
|
1327 |
|
1328 |
result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
|
@@ -1340,7 +1349,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1340 |
analyser_results = []
|
1341 |
|
1342 |
|
1343 |
-
return analyser_results
|
1344 |
|
1345 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1346 |
decision_process_table = pd.DataFrame()
|
@@ -1397,6 +1406,7 @@ def redact_text_pdf(
|
|
1397 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1398 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1399 |
pii_identification_method: str = "Local",
|
|
|
1400 |
page_break_val: int = int(page_break_value), # Value for page break
|
1401 |
max_time: int = int(max_time_value),
|
1402 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
@@ -1422,12 +1432,14 @@ def redact_text_pdf(
|
|
1422 |
- all_decision_process_table: DataFrame for decision process table
|
1423 |
- pymupdf_doc: List of PyMuPDF documents
|
1424 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
1425 |
- page_break_val: Value for page break
|
1426 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1427 |
- progress: Progress tracking object
|
1428 |
'''
|
1429 |
|
1430 |
tic = time.perf_counter()
|
|
|
1431 |
|
1432 |
# Open with Pikepdf to get text lines
|
1433 |
pikepdf_pdf = Pdf.open(filename)
|
@@ -1517,7 +1529,9 @@ def redact_text_pdf(
|
|
1517 |
|
1518 |
if chosen_redact_entities:
|
1519 |
|
1520 |
-
text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
|
|
|
|
1521 |
|
1522 |
else:
|
1523 |
text_line_analyser_result = []
|
@@ -1576,7 +1590,7 @@ def redact_text_pdf(
|
|
1576 |
|
1577 |
current_loop_page += 1
|
1578 |
|
1579 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
1580 |
|
1581 |
|
1582 |
annotations_all_pages.append(image_annotations)
|
@@ -1588,7 +1602,7 @@ def redact_text_pdf(
|
|
1588 |
page_break_return = True
|
1589 |
progress.close(_tqdm=progress_bar)
|
1590 |
|
1591 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
1592 |
|
1593 |
|
1594 |
-
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
|
|
|
91 |
current_loop_page:int=0,
|
92 |
page_break_return:bool=False,
|
93 |
pii_identification_method:str="Local",
|
94 |
+
comprehend_query_number:int=0,
|
95 |
progress=gr.Progress(track_tqdm=True)):
|
96 |
'''
|
97 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
121 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
122 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
123 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
124 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
125 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
126 |
|
127 |
The function returns a redacted document along with processing logs.
|
|
|
173 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
174 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
175 |
|
176 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
177 |
|
178 |
# If we have reached the last page, return message
|
179 |
if current_loop_page >= number_of_pages:
|
|
|
183 |
current_loop_page = 999
|
184 |
combined_out_message = out_message
|
185 |
|
186 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
187 |
|
188 |
# Create allow list
|
189 |
if not in_allow_list.empty:
|
|
|
222 |
out_message = "No file selected"
|
223 |
print(out_message)
|
224 |
|
225 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
226 |
|
227 |
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
|
228 |
|
|
|
233 |
except:
|
234 |
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
235 |
print(out_message)
|
236 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
237 |
|
238 |
#Analyse and redact image-based pdf or image
|
239 |
if is_pdf_or_image(file_path) == False:
|
240 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
241 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
242 |
|
243 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
244 |
|
245 |
+
pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
246 |
prepared_pdf_image_paths,
|
247 |
language,
|
248 |
chosen_redact_entities,
|
|
|
261 |
all_line_level_ocr_results_df,
|
262 |
all_decision_process_table,
|
263 |
pymupdf_doc,
|
264 |
+
pii_identification_method,
|
265 |
+
comprehend_query_number)
|
266 |
|
267 |
# Save Textract request metadata (if exists)
|
268 |
if new_request_metadata:
|
|
|
275 |
|
276 |
if is_pdf(file_path) == False:
|
277 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
278 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
279 |
|
280 |
# Analyse text-based pdf
|
281 |
print('Redacting file as text-based PDF')
|
282 |
|
283 |
+
pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(file_path,
|
284 |
prepared_pdf_image_paths,language,
|
285 |
chosen_redact_entities,
|
286 |
chosen_redact_comprehend_entities,
|
|
|
294 |
all_line_level_ocr_results_df,
|
295 |
all_decision_process_table,
|
296 |
pymupdf_doc,
|
297 |
+
pii_identification_method,
|
298 |
+
comprehend_query_number)
|
299 |
|
300 |
else:
|
301 |
out_message = "No redaction method selected"
|
302 |
print(out_message)
|
303 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
304 |
|
305 |
# If at last page, save to file
|
306 |
if current_loop_page >= number_of_pages:
|
|
|
396 |
|
397 |
# If textract requests made, write to logging file
|
398 |
if all_request_metadata:
|
399 |
+
all_request_metadata_str = '\n'.join(all_request_metadata).strip()
|
400 |
|
401 |
all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
|
402 |
|
|
|
416 |
out_file_paths = list(set(out_file_paths))
|
417 |
|
418 |
|
419 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
420 |
|
421 |
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
422 |
'''
|
|
|
773 |
all_decision_process_table = pd.DataFrame(),
|
774 |
pymupdf_doc = [],
|
775 |
pii_identification_method:str="Local",
|
776 |
+
comprehend_query_number:int=0,
|
777 |
page_break_val:int=int(page_break_value),
|
778 |
logging_file_paths:List=[],
|
779 |
+
max_time:int=int(max_time_value),
|
780 |
progress=Progress(track_tqdm=True)):
|
781 |
|
782 |
'''
|
|
|
801 |
- all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
|
802 |
- pymupdf_doc (List, optional): The document as a PyMupdf object.
|
803 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
804 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
805 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
806 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
807 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
808 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
809 |
|
810 |
The function returns a fully or partially-redacted PDF document.
|
|
|
812 |
file_name = get_file_path_end(file_path)
|
813 |
fill = (0, 0, 0) # Fill colour
|
814 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
815 |
+
comprehend_query_number_new = 0
|
816 |
|
817 |
#print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
|
818 |
|
|
|
843 |
if current_loop_page == 0: page_loop_start = 0
|
844 |
else: page_loop_start = current_loop_page
|
845 |
|
|
|
846 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
847 |
|
848 |
for page_no in progress_bar:
|
|
|
878 |
page_width, page_height = image.size
|
879 |
|
880 |
# Possibility to use different languages
|
881 |
+
if language == 'en': ocr_lang = 'eng'
|
|
|
882 |
else: ocr_lang = language
|
883 |
|
884 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
|
|
948 |
|
949 |
pii_identification_method= "AWS Comprehend" #"Local"
|
950 |
|
951 |
+
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
952 |
line_level_ocr_results,
|
953 |
line_level_ocr_results_with_children,
|
954 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
|
|
959 |
score_threshold=score_threshold
|
960 |
)
|
961 |
|
962 |
+
comprehend_query_number = comprehend_query_number_new
|
963 |
+
|
964 |
# redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
|
965 |
# line_level_ocr_results_with_children,
|
966 |
# language,
|
|
|
1070 |
|
1071 |
current_loop_page += 1
|
1072 |
|
1073 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1074 |
|
1075 |
if is_pdf(file_path) == False:
|
1076 |
images.append(image)
|
|
|
1086 |
progress.close(_tqdm=progress_bar)
|
1087 |
tqdm._instances.clear()
|
1088 |
|
1089 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1090 |
|
1091 |
+
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1092 |
|
1093 |
|
1094 |
###
|
|
|
1306 |
'''
|
1307 |
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
|
1308 |
'''
|
1309 |
+
comprehend_query_number = 0
|
1310 |
analyser_results = []
|
1311 |
|
1312 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
|
|
1330 |
LanguageCode=language # Specify the language of the text
|
1331 |
)
|
1332 |
|
1333 |
+
comprehend_query_number += 1
|
1334 |
+
|
1335 |
for result in response["Entities"]:
|
1336 |
|
1337 |
result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
|
|
|
1349 |
analyser_results = []
|
1350 |
|
1351 |
|
1352 |
+
return analyser_results, comprehend_query_number
|
1353 |
|
1354 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1355 |
decision_process_table = pd.DataFrame()
|
|
|
1406 |
all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
|
1407 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1408 |
pii_identification_method: str = "Local",
|
1409 |
+
comprehend_query_number:int = 0,
|
1410 |
page_break_val: int = int(page_break_value), # Value for page break
|
1411 |
max_time: int = int(max_time_value),
|
1412 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
1432 |
- all_decision_process_table: DataFrame for decision process table
|
1433 |
- pymupdf_doc: List of PyMuPDF documents
|
1434 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1435 |
+
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
1436 |
- page_break_val: Value for page break
|
1437 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1438 |
- progress: Progress tracking object
|
1439 |
'''
|
1440 |
|
1441 |
tic = time.perf_counter()
|
1442 |
+
comprehend_query_number_new = 0
|
1443 |
|
1444 |
# Open with Pikepdf to get text lines
|
1445 |
pikepdf_pdf = Pdf.open(filename)
|
|
|
1529 |
|
1530 |
if chosen_redact_entities:
|
1531 |
|
1532 |
+
text_line_analyser_result, comprehend_query_number_new = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
|
1533 |
+
|
1534 |
+
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1535 |
|
1536 |
else:
|
1537 |
text_line_analyser_result = []
|
|
|
1590 |
|
1591 |
current_loop_page += 1
|
1592 |
|
1593 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1594 |
|
1595 |
|
1596 |
annotations_all_pages.append(image_annotations)
|
|
|
1602 |
page_break_return = True
|
1603 |
progress.close(_tqdm=progress_bar)
|
1604 |
|
1605 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1606 |
|
1607 |
|
1608 |
+
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
tools/helper_functions.py
CHANGED
@@ -4,6 +4,9 @@ import gradio as gr
|
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
|
|
|
|
|
|
|
7 |
def get_or_create_env_var(var_name, default_value):
|
8 |
# Get the environment variable if it exists
|
9 |
value = os.environ.get(var_name)
|
@@ -183,64 +186,60 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
|
|
183 |
async def get_connection_params(request: gr.Request):
|
184 |
base_folder = ""
|
185 |
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
if CUSTOM_CLOUDFRONT_HEADER_var
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
print("Cognito ID found:", out_session_hash)
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
else:
|
242 |
-
print("No session parameters found.")
|
243 |
-
return "",""
|
244 |
|
245 |
|
246 |
def clean_unicode_text(text):
|
|
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
|
7 |
+
def reset_state_vars():
    """Return fresh initial values for the app's Gradio state variables.

    Returns:
        tuple: (annotations list, file-paths list, OCR results DataFrame,
        decision-process DataFrame, page counter, metadata string) — all
        empty/zeroed. Two *distinct* empty DataFrames are returned so that
        mutating one state value never affects the other.
    """
    empty_annotations = []
    empty_file_paths = []
    ocr_results = pd.DataFrame()
    decision_process = pd.DataFrame()
    return empty_annotations, empty_file_paths, ocr_results, decision_process, 0, ""
|
9 |
+
|
10 |
def get_or_create_env_var(var_name, default_value):
|
11 |
# Get the environment variable if it exists
|
12 |
value = os.environ.get(var_name)
|
|
|
186 |
async def get_connection_params(request: gr.Request):
    """Derive a session identifier and per-session output folder from a Gradio request.

    Identification priority:
        1. ``request.username`` — set on direct Cognito login.
        2. ``x-cognito-id`` header — passed through a Lambda authenticator.
        3. ``request.session_hash`` — anonymous Gradio session.

    If both ``CUSTOM_CLOUDFRONT_HEADER`` and ``CUSTOM_CLOUDFRONT_HEADER_VALUE``
    environment variables are set, an incoming header of that name is validated
    against the expected value.

    Args:
        request (gr.Request): The incoming Gradio request object.

    Returns:
        tuple: (session identifier, output folder path, session identifier).

    Raises:
        ValueError: If the custom CloudFront header is present but its value
            does not match the expected value.
    """
    base_folder = ""

    print("Session hash:", request.session_hash)

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')

    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
            if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
                print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
            else:
                # Bug fix: the original `raise(ValueError, "...")` raises a
                # *tuple*, which is a TypeError in Python 3 — the intended
                # ValueError was never actually raised.
                raise ValueError("Custom Cloudfront header value does not match expected value.")

    # Get output save folder from 1 - username passed in from direct Cognito login,
    # 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
    if request.username:
        out_session_hash = request.username
        base_folder = "user-files/"
        print("Request username found:", out_session_hash)
    elif 'x-cognito-id' in request.headers:
        out_session_hash = request.headers['x-cognito-id']
        base_folder = "user-files/"
        print("Cognito ID found:", out_session_hash)
    else:
        out_session_hash = request.session_hash
        base_folder = "temp-files/"
        # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)

    output_folder = base_folder + out_session_hash + "/"
    #if bucket_name:
    #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

    return out_session_hash, output_folder, out_session_hash
|
|
|
|
|
|
|
243 |
|
244 |
|
245 |
def clean_unicode_text(text):
|