seanpedrickcase committed
Commit 8652429 · Parent(s): 6ea0852

Optimised Textract and Tesseract workings

Files changed:
- README.md +5 -4
- app.py +70 -44
- tools/aws_textract.py +69 -37
- tools/custom_image_analyser_engine.py +571 -18
- tools/data_anonymise.py +6 -2
- tools/file_conversion.py +32 -4
- tools/file_redaction.py +410 -190
- tools/load_spacy_model_custom_recognisers.py +1 -1
README.md CHANGED

@@ -9,9 +9,10 @@ pinned: false
 license: mit
 ---
 
-#
-Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
+# Document redaction
 
-
+Redact personal information from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost, so please only use for more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
 
-
+NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
+
+This app accepts a maximum file size of 50mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
app.py CHANGED

@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
 from tools.aws_functions import upload_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
-from tools.file_conversion import
+from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
@@ -37,6 +37,9 @@ app = gr.Blocks(theme = gr.themes.Base())
 
 with app:
 
+###
+# STATE VARIABLES
+###
 prepared_pdf_state = gr.State([])
 output_image_files_state = gr.State([])
 output_file_list_state = gr.State([])
@@ -56,23 +59,38 @@ with app:
 access_logs_state = gr.State(access_logs_folder + 'log.csv')
 access_s3_logs_loc_state = gr.State(access_logs_folder)
 usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
+# Invisible elements effectively used as state variables
+session_hash_textbox = gr.Textbox(value="", visible=False) # Invisible text box to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
+textract_metadata_textbox = gr.Textbox(value="", visible=False)
+doc_file_name_textbox = gr.Textbox(value="", visible=False)
+data_file_name_textbox = gr.Textbox(value="", visible=False)
+s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
+
+
+###
+# UI DESIGN
+###
 
 gr.Markdown(
 """
 # Document redaction
 
-Redact personal information from documents, open text, or xlsx/csv
+Redact personal information from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost, so please only use for more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
 
-
+NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
-This app accepts a maximum file size of
+This app accepts a maximum file size of 50mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
 """)
 
+# PDF / IMAGES TAB
 with gr.Tab("PDFs/images"):
-
 with gr.Accordion("Redact document", open = True):
-
+in_doc_files = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
+in_redaction_method = gr.Radio(label="Choose document redaction method. Note that for AWS Textract, there will be a cost to the service from use of AWS services.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - AWS Textract, handwriting/signatures"])
+gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
 document_redact_btn = gr.Button("Redact document(s)", variant="primary")
 
 with gr.Row():
@@ -83,16 +101,14 @@ with app:
 with gr.Row():
 convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
 
+# Feedback elements are invisible until revealed by redaction action
 pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
 pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
 pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
 pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
-
-with gr.Row():
-s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-# This keeps track of the time taken to redact files for logging purposes.
-estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
+
 
+# TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
 gr.Markdown(
 """
@@ -115,19 +131,21 @@ with app:
 text_output_file = gr.File(label="Output files")
 text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
 
+# Feedback elements are invisible until revealed by redaction action
 data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
 data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
 choices=["The results were good", "The results were not good"], visible=False)
 data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
 data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
+# SETTINGS TAB
 with gr.Tab(label="Redaction settings"):
 gr.Markdown(
 """
 Define redaction settings that affect both document and open text redaction.
 """)
 with gr.Accordion("Settings for documents", open = True):
-
+
 with gr.Row():
 page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
@@ -140,53 +158,47 @@ with app:
 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
 with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
-#
+# Upload 'Allow list' for terms not to be redacted
 with gr.Row():
 in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
 gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 log_files_output = gr.File(label="Log file output", interactive=False)
 
-# Invisible text box to hold the session hash/username and Textract request metadata just for logging purposes
-session_hash_textbox = gr.Textbox(value="", visible=False)
-textract_metadata_textbox = gr.Textbox(value="", visible=False)
-
-# AWS options - placeholder for possibility of storing data on s3
-# with gr.Tab(label="Advanced options"):
-# with gr.Accordion(label = "AWS data access", open = True):
-# aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
-# with gr.Row():
-# in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
-# load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
-
-# aws_log_box = gr.Textbox(label="AWS data load status")
-
-# ### Loading AWS data ###
-# load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
-
 # If a custom allow list is uploaded
 in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
-
-
-
+###
+# PDF/IMAGE REDACTION
+###
+in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox])
+
+document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare_doc").\
+then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
 outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
 
 # If the output file count text box changes, keep going with redacting each document until done
-text_documents_done.change(fn =
-then(fn = choose_and_run_redactor, inputs=[
+text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
 outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
 then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-
-
+###
+# TABULAR DATA REDACTION
+###
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
+then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_textbox])
 
-tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="
+tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
 # If the output file count text box changes, keep going with redacting each data file until done
 text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
 then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
+###
+# APP LOAD AND LOGGING
+###
+
 # Get connection details on app load
 app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
@@ -198,8 +210,8 @@ with app:
 
 # User submitted feedback for pdf redactions
 pdf_callback = gr.CSVLogger()
-pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text,
-pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text,
+pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_doc_files], feedback_logs_folder)
+pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_doc_files], None, preprocess=False).\
 then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
 
 # User submitted feedback for data redactions
@@ -210,8 +222,8 @@ with app:
 
 # Log processing time/token usage when making a query
 usage_callback = gr.CSVLogger()
-usage_callback.setup([session_hash_textbox,
-estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox,
+usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
+estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
 then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # Launch the Gradio app
@@ -222,4 +234,18 @@ if __name__ == "__main__":
 if os.environ['COGNITO_AUTH'] == "1":
 app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
 else:
-app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
+app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
+
+
+# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
+# with gr.Tab(label="Advanced options"):
+# with gr.Accordion(label = "AWS data access", open = True):
+# aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
+# with gr.Row():
+# in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
+# load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
+
+# aws_log_box = gr.Textbox(label="AWS data load status")
+
+# ### Loading AWS data ###
+# load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
tools/aws_textract.py CHANGED

@@ -44,7 +44,7 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
 response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
 text_blocks = response['Blocks']
-request_metadata = extract_textract_metadata(response)
+request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
 
 # Write the response to a JSON file
 with open(json_file_path, 'w') as json_file:
@@ -92,56 +92,75 @@ def json_to_ocrresult(json_data, page_width, page_height):
 signatures = []
 handwriting = []
 
+combined_results = {}
+
 for text_block in json_data:
 
 is_signature = False
 is_handwriting = False
 
-
-
-# If a line, pull out the text type and confidence from the child words and get text, bounding box
-
-
+
 
+if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
+if text_block['BlockType'] == 'LINE':
+# Extract text and bounding box for the line
+line_text = text_block.get('Text', '')
+line_bbox = text_block["Geometry"]["BoundingBox"]
+line_left = int(line_bbox["Left"] * page_width)
+line_top = int(line_bbox["Top"] * page_height)
+line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
+line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
 
+words = []
 if 'Relationships' in text_block:
 for relationship in text_block['Relationships']:
 if relationship['Type'] == 'CHILD':
 for child_id in relationship['Ids']:
 child_block = next((block for block in json_data if block['Id'] == child_id), None)
-if child_block and '
-
-
+if child_block and child_block['BlockType'] == 'WORD':
+word_text = child_block.get('Text', '')
+word_bbox = child_block["Geometry"]["BoundingBox"]
+confidence = child_block.get('Confidence','')
+word_left = int(word_bbox["Left"] * page_width)
+word_top = int(word_bbox["Top"] * page_height)
+word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
+word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
+
+# Extract BoundingBox details
+width = word_bbox["Width"]
+height = word_bbox["Height"]
+
+# Convert proportional coordinates to absolute coordinates
+width_abs = int(width * page_width)
+height_abs = int(height * page_height)
+
+words.append({
+'text': word_text,
+'bounding_box': (word_left, word_top, word_right, word_bottom)
+})
+# Check for handwriting
+text_type = child_block.get("TextType", '')
+
+if text_type == "HANDWRITING":
+is_handwriting = True
-# If handwriting or signature, add to bounding box
-
-entity_name = "HANDWRITING"
-word_end = len(entity_name)
-recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
-handwriting.append(recogniser_result)
-print("Handwriting found:", handwriting[-1])
+entity_name = "HANDWRITING"
+word_end = len(entity_name)
+recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=width_abs, height=height_abs)
+handwriting.append(recogniser_result)
+print("Handwriting found:", handwriting[-1])
+
+combined_results[line_text] = {
+'bounding_box': (line_left, line_top, line_right, line_bottom),
+'words': words
+}
 
 
+
+# If handwriting or signature, add to bounding box
 
 elif (text_block['BlockType'] == 'SIGNATURE'):
-
+line_text = "SIGNATURE"
 
 is_signature = True
 entity_name = "SIGNATURE"
@@ -161,12 +180,25 @@ def json_to_ocrresult(json_data, page_width, page_height):
 width_abs = int(width * page_width)
 height_abs = int(height * page_height)
 
-recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text=
+recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
 signatures.append(recogniser_result)
 print("Signature found:", signatures[-1])
 
+# Extract BoundingBox details
+bbox = text_block["Geometry"]["BoundingBox"]
+left = bbox["Left"]
+top = bbox["Top"]
+width = bbox["Width"]
+height = bbox["Height"]
+
+# Convert proportional coordinates to absolute coordinates
+left_abs = int(left * page_width)
+top_abs = int(top * page_height)
+width_abs = int(width * page_width)
+height_abs = int(height * page_height)
+
 # Create OCRResult with absolute coordinates
-ocr_result = OCRResult(
+ocr_result = OCRResult(line_text, left_abs, top_abs, width_abs, height_abs)
 all_ocr_results.append(ocr_result)
 
 is_signature_or_handwriting = is_signature | is_handwriting
@@ -178,4 +210,4 @@ def json_to_ocrresult(json_data, page_width, page_height):
 if is_signature: signature_recogniser_results.append(recogniser_result)
 if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
 
-return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results
+return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, combined_results
tools/custom_image_analyser_engine.py
CHANGED
@@ -1,9 +1,14 @@
|
|
1 |
import pytesseract
|
2 |
-
from PIL import Image
|
3 |
import numpy as np
|
4 |
from presidio_analyzer import AnalyzerEngine, RecognizerResult
|
|
|
5 |
from typing import List, Dict, Optional, Union, Tuple
|
6 |
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
@dataclass
|
9 |
class OCRResult:
|
@@ -25,17 +30,399 @@ class CustomImageRecognizerResult:
|
|
25 |
height: int
|
26 |
text: str
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
class CustomImageAnalyzerEngine:
|
29 |
def __init__(
|
30 |
self,
|
31 |
analyzer_engine: Optional[AnalyzerEngine] = None,
|
32 |
-
tesseract_config: Optional[str] = None
|
|
|
33 |
):
|
34 |
if not analyzer_engine:
|
35 |
analyzer_engine = AnalyzerEngine()
|
36 |
self.analyzer_engine = analyzer_engine
|
37 |
self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
|
40 |
# Ensure image is a PIL Image
|
41 |
if isinstance(image, str):
|
@@ -43,18 +430,30 @@ class CustomImageAnalyzerEngine:
|
|
43 |
elif isinstance(image, np.ndarray):
|
44 |
image = Image.fromarray(image)
|
45 |
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
# Filter out empty strings and low confidence results
|
49 |
-
valid_indices = [i for i, text in enumerate(
|
50 |
|
51 |
return [
|
52 |
OCRResult(
|
53 |
-
text=
|
54 |
-
left=
|
55 |
-
top=
|
56 |
-
width=
|
57 |
-
height=
|
58 |
)
|
59 |
for i in valid_indices
|
60 |
]
|
@@ -86,7 +485,7 @@ class CustomImageAnalyzerEngine:
|
|
86 |
text=relevant_text,
|
87 |
left=ocr_result.left + self.estimate_x_offset(ocr_result.text, result.start),
|
88 |
top=ocr_result.top,
|
89 |
-
width=self.estimate_width(ocr_result, result.start, result.end),
|
90 |
height=ocr_result.height
|
91 |
)
|
92 |
|
@@ -132,28 +531,160 @@ class CustomImageAnalyzerEngine:
|
|
132 |
text_position = word_end + 1 # +1 for the space between words
|
133 |
|
134 |
return pii_bboxes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
@staticmethod
|
137 |
def estimate_x_offset(full_text: str, start: int) -> int:
|
138 |
# Estimate the x-offset based on character position
|
139 |
# This is a simple estimation and might need refinement for variable-width fonts
|
140 |
return int(start / len(full_text) * len(full_text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
-
@staticmethod
|
143 |
-
def estimate_width(ocr_result: OCRResult, start: int, end: int) -> int:
|
144 |
-
# Estimate the width of the relevant text portion
|
145 |
-
full_width = ocr_result.width
|
146 |
-
full_length = len(ocr_result.text)
|
147 |
-
return int((end - start) / full_length * full_width)
|
148 |
|
149 |
# Function to combine OCR results into line-level results
|
150 |
-
def combine_ocr_results(ocr_results, x_threshold
|
151 |
# Sort OCR results by 'top' to ensure line order
|
152 |
ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))
|
153 |
|
154 |
combined_results = []
|
|
|
155 |
current_line = []
|
156 |
current_bbox = None
|
|
|
157 |
|
158 |
for result in ocr_results:
|
159 |
if not current_line:
|
@@ -178,11 +709,33 @@ def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
|
|
178 |
else:
|
179 |
# Commit the current line and start a new one
|
180 |
combined_results.append(current_bbox)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
current_line = [result]
|
182 |
current_bbox = result
|
183 |
|
184 |
# Append the last line
|
185 |
if current_bbox:
|
186 |
combined_results.append(current_bbox)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
return combined_results
|
|
|
1 |
import pytesseract
|
|
|
2 |
import numpy as np
|
3 |
from presidio_analyzer import AnalyzerEngine, RecognizerResult
|
4 |
+
#from presidio_image_redactor import ImagePreprocessor
|
5 |
from typing import List, Dict, Optional, Union, Tuple
|
6 |
from dataclasses import dataclass
|
7 |
+
import cv2
|
8 |
+
import PIL
|
9 |
+
from PIL import ImageDraw, ImageFont, Image
|
10 |
+
from typing import Optional, Tuple, Union
|
11 |
+
from copy import deepcopy
|
12 |
|
13 |
@dataclass
|
14 |
class OCRResult:
|
|
|
30 |
height: int
|
31 |
text: str
|
32 |
|
33 |
+
class ImagePreprocessor:
|
34 |
+
"""ImagePreprocessor class.
|
35 |
+
|
36 |
+
Parent class for image preprocessing objects.
|
37 |
+
"""
|
38 |
+
|
39 |
+
def __init__(self, use_greyscale: bool = True) -> None:
|
40 |
+
"""Initialize the ImagePreprocessor class.
|
41 |
+
|
42 |
+
:param use_greyscale: Whether to convert the image to greyscale.
|
43 |
+
"""
|
44 |
+
self.use_greyscale = use_greyscale
|
45 |
+
|
46 |
+
def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
|
47 |
+
"""Preprocess the image to be analyzed.
|
48 |
+
|
49 |
+
:param image: Loaded PIL image.
|
50 |
+
|
51 |
+
:return: The processed image and any metadata regarding the
|
52 |
+
preprocessing approach.
|
53 |
+
"""
|
54 |
+
return image, {}
|
55 |
+
|
56 |
+
def convert_image_to_array(self, image: Image.Image) -> np.ndarray:
|
57 |
+
"""Convert PIL image to numpy array.
|
58 |
+
|
59 |
+
:param image: Loaded PIL image.
|
60 |
+
:param convert_to_greyscale: Whether to convert the image to greyscale.
|
61 |
+
|
62 |
+
:return: image pixels as a numpy array.
|
63 |
+
|
64 |
+
"""
|
65 |
+
|
66 |
+
if isinstance(image, np.ndarray):
|
67 |
+
img = image
|
68 |
+
else:
|
69 |
+
if self.use_greyscale:
|
70 |
+
image = image.convert("L")
|
71 |
+
img = np.asarray(image)
|
72 |
+
return img
|
73 |
+
|
74 |
+
@staticmethod
|
75 |
+
def _get_bg_color(
|
76 |
+
image: Image.Image, is_greyscale: bool, invert: bool = False
|
77 |
+
) -> Union[int, Tuple[int, int, int]]:
|
78 |
+
"""Select most common color as background color.
|
79 |
+
|
80 |
+
:param image: Loaded PIL image.
|
81 |
+
:param is_greyscale: Whether the image is greyscale.
|
82 |
+
:param invert: TRUE if you want to get the inverse of the bg color.
|
83 |
+
|
84 |
+
:return: Background color.
|
85 |
+
"""
|
86 |
+
# Invert colors if invert flag is True
|
87 |
+
if invert:
|
88 |
+
if image.mode == "RGBA":
|
89 |
+
# Handle transparency as needed
|
90 |
+
r, g, b, a = image.split()
|
91 |
+
rgb_image = Image.merge("RGB", (r, g, b))
|
92 |
+
inverted_image = PIL.ImageOps.invert(rgb_image)
|
93 |
+
r2, g2, b2 = inverted_image.split()
|
94 |
+
|
95 |
+
image = Image.merge("RGBA", (r2, g2, b2, a))
|
96 |
+
|
97 |
+
else:
|
98 |
+
image = PIL.ImageOps.invert(image)
|
99 |
+
|
100 |
+
# Get background color
|
101 |
+
if is_greyscale:
|
102 |
+
# Select most common color as color
|
103 |
+
bg_color = int(np.bincount(image.flatten()).argmax())
|
104 |
+
else:
|
105 |
+
# Reduce size of image to 1 pixel to get dominant color
|
106 |
+
tmp_image = image.copy()
|
107 |
+
tmp_image = tmp_image.resize((1, 1), resample=0)
|
108 |
+
bg_color = tmp_image.getpixel((0, 0))
|
109 |
+
|
110 |
+
return bg_color
|
111 |
+
|
112 |
+
@staticmethod
|
113 |
+
def _get_image_contrast(image: np.ndarray) -> Tuple[float, float]:
|
114 |
+
"""Compute the contrast level and mean intensity of an image.
|
115 |
+
|
116 |
+
:param image: Input image pixels (as a numpy array).
|
117 |
+
|
118 |
+
:return: A tuple containing the contrast level and mean intensity of the image.
|
119 |
+
"""
|
120 |
+
contrast = np.std(image)
|
121 |
+
mean_intensity = np.mean(image)
|
122 |
+
return contrast, mean_intensity
|
123 |
+
|
124 |
+
class BilateralFilter(ImagePreprocessor):
|
125 |
+
"""BilateralFilter class.
|
126 |
+
|
127 |
+
The class applies bilateral filtering to an image. and returns the filtered
|
128 |
+
image and metadata.
|
129 |
+
"""
|
130 |
+
|
131 |
+
def __init__(
|
132 |
+
self, diameter: int = 3, sigma_color: int = 40, sigma_space: int = 40
|
133 |
+
) -> None:
|
134 |
+
"""Initialize the BilateralFilter class.
|
135 |
+
|
136 |
+
:param diameter: Diameter of each pixel neighborhood.
|
137 |
+
:param sigma_color: value of sigma in the color space.
|
138 |
+
:param sigma_space: value of sigma in the coordinate space.
|
139 |
+
"""
|
140 |
+
super().__init__(use_greyscale=True)
|
141 |
+
|
142 |
+
self.diameter = diameter
|
143 |
+
self.sigma_color = sigma_color
|
144 |
+
self.sigma_space = sigma_space
|
145 |
+
|
146 |
+
def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
|
147 |
+
"""Preprocess the image to be analyzed.
|
148 |
+
|
149 |
+
:param image: Loaded PIL image.
|
150 |
+
|
151 |
+
:return: The processed image and metadata (diameter, sigma_color, sigma_space).
|
152 |
+
"""
|
153 |
+
image = self.convert_image_to_array(image)
|
154 |
+
|
155 |
+
# Apply bilateral filtering
|
156 |
+
filtered_image = cv2.bilateralFilter(
|
157 |
+
image,
|
158 |
+
self.diameter,
|
159 |
+
self.sigma_color,
|
160 |
+
self.sigma_space,
|
161 |
+
)
|
162 |
+
|
163 |
+
metadata = {
|
164 |
+
"diameter": self.diameter,
|
165 |
+
"sigma_color": self.sigma_color,
|
166 |
+
"sigma_space": self.sigma_space,
|
167 |
+
}
|
168 |
+
|
169 |
+
return Image.fromarray(filtered_image), metadata
|
170 |
+
|
171 |
+
|
172 |
+
class SegmentedAdaptiveThreshold(ImagePreprocessor):
|
173 |
+
"""SegmentedAdaptiveThreshold class.
|
174 |
+
|
175 |
+
The class applies adaptive thresholding to an image
|
176 |
+
and returns the thresholded image and metadata.
|
177 |
+
The parameters used to run the adaptivethresholding are selected based on
|
178 |
+
the contrast level of the image.
|
179 |
+
"""
|
180 |
+
|
181 |
+
def __init__(
|
182 |
+
self,
|
183 |
+
block_size: int = 5,
|
184 |
+
contrast_threshold: int = 40,
|
185 |
+
c_low_contrast: int = 10,
|
186 |
+
c_high_contrast: int = 40,
|
187 |
+
bg_threshold: int = 122,
|
188 |
+
) -> None:
|
189 |
+
"""Initialize the SegmentedAdaptiveThreshold class.
|
190 |
+
|
191 |
+
:param block_size: Size of the neighborhood area for threshold calculation.
|
192 |
+
:param contrast_threshold: Threshold for low contrast images.
|
193 |
+
:param C_low_contrast: Constant added to the mean for low contrast images.
|
194 |
+
:param C_high_contrast: Constant added to the mean for high contrast images.
|
195 |
+
:param bg_threshold: Threshold for background color.
|
196 |
+
"""
|
197 |
+
|
198 |
+
super().__init__(use_greyscale=True)
|
199 |
+
self.block_size = block_size
|
200 |
+
self.c_low_contrast = c_low_contrast
|
201 |
+
self.c_high_contrast = c_high_contrast
|
202 |
+
self.bg_threshold = bg_threshold
|
203 |
+
self.contrast_threshold = contrast_threshold
|
204 |
+
|
205 |
+
def preprocess_image(
|
206 |
+
self, image: Union[Image.Image, np.ndarray]
|
207 |
+
) -> Tuple[Image.Image, dict]:
|
208 |
+
"""Preprocess the image.
|
209 |
+
|
210 |
+
:param image: Loaded PIL image.
|
211 |
+
|
212 |
+
:return: The processed image and metadata (C, background_color, contrast).
|
213 |
+
"""
|
214 |
+
if not isinstance(image, np.ndarray):
|
215 |
+
image = self.convert_image_to_array(image)
|
216 |
+
|
217 |
+
# Determine background color
|
218 |
+
background_color = self._get_bg_color(image, True)
|
219 |
+
contrast, _ = self._get_image_contrast(image)
|
220 |
+
|
221 |
+
c = (
|
222 |
+
self.c_low_contrast
|
223 |
+
if contrast <= self.contrast_threshold
|
224 |
+
else self.c_high_contrast
|
225 |
+
)
|
226 |
+
|
227 |
+
if background_color < self.bg_threshold:
|
228 |
+
adaptive_threshold_image = cv2.adaptiveThreshold(
|
229 |
+
image,
|
230 |
+
255,
|
231 |
+
cv2.ADAPTIVE_THRESH_MEAN_C,
|
232 |
+
cv2.THRESH_BINARY_INV,
|
233 |
+
self.block_size,
|
234 |
+
-c,
|
235 |
+
)
|
236 |
+
else:
|
237 |
+
adaptive_threshold_image = cv2.adaptiveThreshold(
|
238 |
+
image,
|
239 |
+
255,
|
240 |
+
cv2.ADAPTIVE_THRESH_MEAN_C,
|
241 |
+
cv2.THRESH_BINARY,
|
242 |
+
self.block_size,
|
243 |
+
c,
|
244 |
+
)
|
245 |
+
|
246 |
+
metadata = {"C": c, "background_color": background_color, "contrast": contrast}
|
247 |
+
return Image.fromarray(adaptive_threshold_image), metadata
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
class ImageRescaling(ImagePreprocessor):
|
253 |
+
"""ImageRescaling class. Rescales images based on their size."""
|
254 |
+
|
255 |
+
def __init__(
|
256 |
+
self,
|
257 |
+
small_size: int = 1048576,
|
258 |
+
large_size: int = 4000000,
|
259 |
+
factor: int = 2,
|
260 |
+
interpolation: int = cv2.INTER_AREA,
|
261 |
+
) -> None:
|
262 |
+
"""Initialize the ImageRescaling class.
|
263 |
+
|
264 |
+
:param small_size: Threshold for small image size.
|
265 |
+
:param large_size: Threshold for large image size.
|
266 |
+
:param factor: Scaling factor for resizing.
|
267 |
+
:param interpolation: Interpolation method for resizing.
|
268 |
+
"""
|
269 |
+
super().__init__(use_greyscale=True)
|
270 |
+
|
271 |
+
self.small_size = small_size
|
272 |
+
self.large_size = large_size
|
273 |
+
self.factor = factor
|
274 |
+
self.interpolation = interpolation
|
275 |
+
|
276 |
+
def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
|
277 |
+
"""Preprocess the image to be analyzed.
|
278 |
+
|
279 |
+
:param image: Loaded PIL image.
|
280 |
+
|
281 |
+
:return: The processed image and metadata (scale_factor).
|
282 |
+
"""
|
283 |
+
|
284 |
+
scale_factor = 1
|
285 |
+
if image.size < self.small_size:
|
286 |
+
scale_factor = self.factor
|
287 |
+
elif image.size > self.large_size:
|
288 |
+
scale_factor = 1 / self.factor
|
289 |
+
|
290 |
+
width = int(image.shape[1] * scale_factor)
|
291 |
+
height = int(image.shape[0] * scale_factor)
|
292 |
+
dimensions = (width, height)
|
293 |
+
|
294 |
+
# resize image
|
295 |
+
rescaled_image = cv2.resize(image, dimensions, interpolation=self.interpolation)
|
296 |
+
metadata = {"scale_factor": scale_factor}
|
297 |
+
return Image.fromarray(rescaled_image), metadata
|
298 |
+
|
299 |
+
|
300 |
+
class ContrastSegmentedImageEnhancer(ImagePreprocessor):
|
301 |
+
"""Class containing all logic to perform contrastive segmentation.
|
302 |
+
|
303 |
+
Contrastive segmentation is a preprocessing step that aims to enhance the
|
304 |
+
text in an image by increasing the contrast between the text and the
|
305 |
+
background. The parameters used to run the preprocessing are selected based
|
306 |
+
on the contrast level of the image.
|
307 |
+
"""
|
308 |
+
|
309 |
+
def __init__(
|
310 |
+
self,
|
311 |
+
bilateral_filter: Optional[BilateralFilter] = None,
|
312 |
+
adaptive_threshold: Optional[SegmentedAdaptiveThreshold] = None,
|
313 |
+
image_rescaling: Optional[ImageRescaling] = None,
|
314 |
+
low_contrast_threshold: int = 40,
|
315 |
+
) -> None:
|
316 |
+
"""Initialize the class.
|
317 |
+
|
318 |
+
:param bilateral_filter: Optional BilateralFilter instance.
|
319 |
+
:param adaptive_threshold: Optional AdaptiveThreshold instance.
|
320 |
+
:param image_rescaling: Optional ImageRescaling instance.
|
321 |
+
:param low_contrast_threshold: Threshold for low contrast images.
|
322 |
+
"""
|
323 |
+
|
324 |
+
super().__init__(use_greyscale=True)
|
325 |
+
if not bilateral_filter:
|
326 |
+
self.bilateral_filter = BilateralFilter()
|
327 |
+
else:
|
328 |
+
self.bilateral_filter = bilateral_filter
|
329 |
+
|
330 |
+
if not adaptive_threshold:
|
331 |
+
self.adaptive_threshold = SegmentedAdaptiveThreshold()
|
332 |
+
else:
|
333 |
+
self.adaptive_threshold = adaptive_threshold
|
334 |
+
|
335 |
+
if not image_rescaling:
|
336 |
+
self.image_rescaling = ImageRescaling()
|
337 |
+
else:
|
338 |
+
self.image_rescaling = image_rescaling
|
339 |
+
|
340 |
+
self.low_contrast_threshold = low_contrast_threshold
|
341 |
+
|
342 |
+
def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
|
343 |
+
"""Preprocess the image to be analyzed.
|
344 |
+
|
345 |
+
:param image: Loaded PIL image.
|
346 |
+
|
347 |
+
:return: The processed image and metadata (background color, scale percentage,
|
348 |
+
contrast level, and C value).
|
349 |
+
"""
|
350 |
+
image = self.convert_image_to_array(image)
|
351 |
+
|
352 |
+
# Apply bilateral filtering
|
353 |
+
filtered_image, _ = self.bilateral_filter.preprocess_image(image)
|
354 |
+
|
355 |
+
# Convert to grayscale
|
356 |
+
pil_filtered_image = Image.fromarray(np.uint8(filtered_image))
|
357 |
+
pil_grayscale_image = pil_filtered_image.convert("L")
|
358 |
+
grayscale_image = np.asarray(pil_grayscale_image)
|
359 |
+
|
360 |
+
# Improve contrast
|
361 |
+
adjusted_image, _, adjusted_contrast = self._improve_contrast(grayscale_image)
|
362 |
+
|
363 |
+
# Adaptive Thresholding
|
364 |
+
adaptive_threshold_image, _ = self.adaptive_threshold.preprocess_image(
|
365 |
+
adjusted_image
|
366 |
+
)
|
367 |
+
# Increase contrast
|
368 |
+
_, threshold_image = cv2.threshold(
|
369 |
+
np.asarray(adaptive_threshold_image),
|
370 |
+
0,
|
371 |
+
255,
|
372 |
+
cv2.THRESH_BINARY | cv2.THRESH_OTSU,
|
373 |
+
)
|
374 |
+
|
375 |
+
# Rescale image
|
376 |
+
rescaled_image, scale_metadata = self.image_rescaling.preprocess_image(
|
377 |
+
threshold_image
|
378 |
+
)
|
379 |
+
|
380 |
+
return rescaled_image, scale_metadata
|
381 |
+
|
382 |
+
def _improve_contrast(self, image: np.ndarray) -> Tuple[np.ndarray, str, str]:
|
383 |
+
"""Improve the contrast of an image based on its initial contrast level.
|
384 |
+
|
385 |
+
:param image: Input image.
|
386 |
+
|
387 |
+
:return: A tuple containing the improved image, the initial contrast level,
|
388 |
+
and the adjusted contrast level.
|
389 |
+
"""
|
390 |
+
contrast, mean_intensity = self._get_image_contrast(image)
|
391 |
+
|
392 |
+
if contrast <= self.low_contrast_threshold:
|
393 |
+
alpha = 1.5
|
394 |
+
beta = -mean_intensity * alpha
|
395 |
+
adjusted_image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
|
396 |
+
adjusted_contrast, _ = self._get_image_contrast(adjusted_image)
|
397 |
+
else:
|
398 |
+
adjusted_image = image
|
399 |
+
adjusted_contrast = contrast
|
400 |
+
return adjusted_image, contrast, adjusted_contrast
|
401 |
+
|
402 |
class CustomImageAnalyzerEngine:
|
403 |
def __init__(
|
404 |
self,
|
405 |
analyzer_engine: Optional[AnalyzerEngine] = None,
|
406 |
+
tesseract_config: Optional[str] = None,
|
407 |
+
image_preprocessor: Optional[ImagePreprocessor] = None
|
408 |
):
|
409 |
if not analyzer_engine:
|
410 |
analyzer_engine = AnalyzerEngine()
|
411 |
self.analyzer_engine = analyzer_engine
|
412 |
self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
|
413 |
|
414 |
+
if not image_preprocessor:
|
415 |
+
# image_preprocessor = ImagePreprocessor(
|
416 |
+
# c_low_contrast=10,
|
417 |
+
# c_high_contrast=20,
|
418 |
+
# contrast_threshold=0.5,
|
419 |
+
# bg_threshold=128,
|
420 |
+
# block_size=11
|
421 |
+
# )
|
422 |
+
image_preprocessor = ContrastSegmentedImageEnhancer()
|
423 |
+
print(image_preprocessor)
|
424 |
+
self.image_preprocessor = image_preprocessor
|
425 |
+
|
426 |
def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
|
427 |
# Ensure image is a PIL Image
|
428 |
if isinstance(image, str):
|
|
|
430 |
elif isinstance(image, np.ndarray):
|
431 |
image = Image.fromarray(image)
|
432 |
|
433 |
+
image_processed, preprocessing_metadata = self.image_preprocessor.preprocess_image(image)
|
434 |
+
|
435 |
+
#print("pre-processing metadata:", preprocessing_metadata)
|
436 |
+
#image_processed.save("image_processed.png")
|
437 |
+
|
438 |
+
ocr_data = pytesseract.image_to_data(image_processed, output_type=pytesseract.Output.DICT, config=self.tesseract_config)
|
439 |
+
|
440 |
+
if preprocessing_metadata and ("scale_factor" in preprocessing_metadata):
|
441 |
+
ocr_result = self._scale_bbox_results(
|
442 |
+
ocr_data, preprocessing_metadata["scale_factor"]
|
443 |
+
)
|
444 |
+
|
445 |
+
ocr_result = self.remove_space_boxes(ocr_result)
|
446 |
|
447 |
# Filter out empty strings and low confidence results
|
448 |
+
valid_indices = [i for i, text in enumerate(ocr_result['text']) if text.strip() and int(ocr_result['conf'][i]) > 0]
|
449 |
|
450 |
return [
|
451 |
OCRResult(
|
452 |
+
text=ocr_result['text'][i],
|
453 |
+
left=ocr_result['left'][i],
|
454 |
+
top=ocr_result['top'][i],
|
455 |
+
width=ocr_result['width'][i],
|
456 |
+
height=ocr_result['height'][i]
|
457 |
)
|
458 |
for i in valid_indices
|
459 |
]
|
|
|
485 |
text=relevant_text,
|
486 |
left=ocr_result.left + self.estimate_x_offset(ocr_result.text, result.start),
|
487 |
top=ocr_result.top,
|
488 |
+
width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
|
489 |
height=ocr_result.height
|
490 |
)
|
491 |
|
|
|
                 text_position = word_end + 1 # +1 for the space between words

         return pii_bboxes
+
+    @staticmethod
+    def remove_space_boxes(ocr_result: dict) -> dict:
+        """Remove OCR bboxes that are for spaces.
+
+        :param ocr_result: OCR results (raw or thresholded).
+        :return: OCR results with empty words removed.
+        """
+        # Get indices of items with no text
+        idx = list()
+        for i, text in enumerate(ocr_result["text"]):
+            is_not_space = text.isspace() is False
+            if text != "" and is_not_space:
+                idx.append(i)
+
+        # Only retain items with text
+        filtered_ocr_result = {}
+        for key in list(ocr_result.keys()):
+            filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
+
+        return filtered_ocr_result
+
+    @staticmethod
+    def _scale_bbox_results(
+        ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
+    ) -> Dict[str, float]:
+        """Scale down the bounding box results based on a scale percentage.
+
+        :param ocr_result: OCR results (raw).
+        :param scale_percent: Scale percentage for resizing the bounding box.
+
+        :return: OCR results (scaled).
+        """
+        scaled_results = deepcopy(ocr_result)
+        coordinate_keys = ["left", "top"]
+        dimension_keys = ["width", "height"]
+
+        for coord_key in coordinate_keys:
+            scaled_results[coord_key] = [
+                int(np.ceil((x) / (scale_factor))) for x in scaled_results[coord_key]
+            ]
+
+        for dim_key in dimension_keys:
+            scaled_results[dim_key] = [
+                max(1, int(np.ceil(x / (scale_factor))))
+                for x in scaled_results[dim_key]
+            ]
+        return scaled_results

     @staticmethod
     def estimate_x_offset(full_text: str, start: int) -> int:
         # Estimate the x-offset based on character position
         # This is a simple estimation and might need refinement for variable-width fonts
         return int(start / len(full_text) * len(full_text))
+
+    def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+        # Extract the relevant text portion
+        relevant_text = ocr_result.text[start:end]
+
+        # If the relevant text is the same as the full text, return the full width
+        if relevant_text == ocr_result.text:
+            return ocr_result.width
+
+        # Estimate width based on the proportion of the relevant text length to the total text length
+        total_text_length = len(ocr_result.text)
+        relevant_text_length = len(relevant_text)
+
+        if total_text_length == 0:
+            return 0 # Avoid division by zero
+
+        # Proportion of the relevant text to the total text
+        proportion = relevant_text_length / total_text_length
+
+        # Estimate the width based on the proportion
+        estimated_width = int(proportion * ocr_result.width)
+
+        return estimated_width
+
+
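The new estimate_width above avoids rendering text to measure it: it assumes characters are roughly equal width, so a substring covering half the characters gets half of the word box. A small standalone illustration of the same idea (hypothetical numbers):

    def proportional_width(full_text: str, box_width: int, start: int, end: int) -> int:
        # Assume a roughly fixed character width across the recognised word/line
        if not full_text:
            return 0
        return int((end - start) / len(full_text) * box_width)

    # A 10-character line that is 200px wide: characters 0-4 span about 100px
    assert proportional_width("John Smith", 200, 0, 5) == 100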
+    # def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+    #     # Extract the relevant text portion
+    #     relevant_text = ocr_result.text[start:end]
+
+    #     # Check if the relevant text is the entire text of the OCR result
+    #     if relevant_text == ocr_result.text:
+    #         return ocr_result.width
+
+    #     # Estimate the font size based on the height of the bounding box
+    #     estimated_font_size = ocr_result.height + 4
+
+    #     # Create a blank image with enough width to measure the text
+    #     dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
+    #     draw = ImageDraw.Draw(dummy_image)
+
+    #     # Specify the font and size
+    #     try:
+    #         font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
+    #     except IOError:
+    #         font = ImageFont.load_default() # Fallback to default font if the specified font is not found
+
+    #     # Draw the relevant text on the image
+    #     draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
+
+    #     # Save the image for debugging purposes
+    #     dummy_image.save("debug_image.png")
+
+    #     # Use pytesseract to get the bounding box of the relevant text
+    #     bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
+
+    #     # Print the bbox for debugging
+    #     print("Bounding box:", bbox)
+
+    #     # Calculate the width from the bounding box
+    #     if bbox:
+    #         try:
+    #             # Initialize min_left and max_right with extreme values
+    #             min_left = float('inf')
+    #             max_right = float('-inf')
+
+    #             # Split the bbox string into lines
+    #             bbox_lines = bbox.splitlines()
+
+    #             for line in bbox_lines:
+    #                 parts = line.split()
+    #                 if len(parts) == 6:
+    #                     _, left, _, right, _, _ = parts
+    #                     left = int(left)
+    #                     right = int(right)
+    #                     min_left = min(min_left, left)
+    #                     max_right = max(max_right, right)
+
+    #             width = max_right - min_left
+    #         except ValueError as e:
+    #             print("Error parsing bounding box:", e)
+    #             width = 0
+    #     else:
+    #         width = 0
+
+    #     print("Estimated width:", width)
+
+    #     return width
+
 # Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold=20, y_threshold=3):
     # Sort OCR results by 'top' to ensure line order
     ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))

     combined_results = []
+    new_format_results = {}
     current_line = []
     current_bbox = None
+    line_counter = 1

     for result in ocr_results:
         if not current_line:

         else:
             # Commit the current line and start a new one
             combined_results.append(current_bbox)
+            new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
+                'bounding_box': (current_bbox.left, current_bbox.top,
+                                 current_bbox.left + current_bbox.width,
+                                 current_bbox.top + current_bbox.height),
+                'words': [{'text': word.text,
+                           'bounding_box': (word.left, word.top,
+                                            word.left + word.width,
+                                            word.top + word.height)}
+                          for word in current_line]
+            }
+            line_counter += 1
             current_line = [result]
             current_bbox = result

     # Append the last line
     if current_bbox:
         combined_results.append(current_bbox)
+        new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+
+    return combined_results, new_format_results
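combine_ocr_results now returns two things: the merged line-level OCRResult list and a dict keyed by line text that keeps the per-word child boxes, which merge_img_bboxes later uses to rebuild tight boxes around matched substrings. A rough usage sketch, assuming OCRResult and combine_ocr_results are imported from tools.custom_image_analyser_engine and that the thresholds merge these nearby words into lines:

    words = [
        OCRResult(text="John", left=100, top=50, width=60, height=20),
        OCRResult(text="Smith", left=170, top=52, width=80, height=20),
        OCRResult(text="Address:", left=100, top=90, width=110, height=20),
    ]
    lines, lines_with_words = combine_ocr_results(words, x_threshold=20, y_threshold=3)
    # lines            -> expected: one OCRResult per reconstructed line ("John Smith", "Address:")
    # lines_with_words -> expected: {"John Smith": {"bounding_box": (...), "words": [...]}, ...}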
tools/data_anonymise.py
CHANGED

@@ -195,7 +195,9 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     df_dict = df.to_dict(orient="list")

     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

@@ -371,7 +373,9 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos


     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     anon_df = pd.DataFrame()
     #out_file_paths = []
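The change above stops double-flattening: in_allow_list now arrives as a flat list of terms and is passed straight through, with an explicit empty-list fallback so downstream allow_list arguments always receive a list. A tolerant normaliser that accepts either shape might look like this (a sketch, not the app's code):

    def normalise_allow_list(in_allow_list) -> list:
        # Accept None, a flat list of terms, or a list of single-item rows from a CSV
        if not in_allow_list:
            return []
        if all(isinstance(item, (list, tuple)) for item in in_allow_list):
            return [term for row in in_allow_list for term in row]
        return list(in_allow_list)

    assert normalise_allow_list([["Acme Ltd"], ["London"]]) == ["Acme Ltd", "London"]
    assert normalise_allow_list(["Acme Ltd", "London"]) == ["Acme Ltd", "London"]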
tools/file_conversion.py
CHANGED

@@ -98,7 +98,33 @@ def process_file(file_path):

     return img_object

-def
+def get_input_file_names(file_input):
+    '''
+    Get list of input files to report to logs.
+    '''
+
+    all_relevant_files = []
+
+    for file in file_input:
+        file_path = file.name
+        print(file_path)
+        file_path_without_ext = get_file_path_end(file_path)
+
+        #print("file:", file_path)
+
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        # Check if the file is an image type
+        if file_extension in ['.jpg', '.jpeg', '.png', '.xlsx', '.csv', '.parquet']:
+            all_relevant_files.append(file_path_without_ext)
+
+    all_relevant_files_str = ", ".join(all_relevant_files)
+
+    print("all_relevant_files_str:", all_relevant_files_str)
+
+    return all_relevant_files_str
+
+def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
     in_allow_list: Optional[List[List[str]]] = None,

@@ -159,6 +185,8 @@ def prepare_image_or_text_pdf(

     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

+    progress(0.1, desc='Preparing file')
+
     file_paths_loop = [file_paths[int(latest_file_completed)]]
     #print("file_paths_loop:", str(file_paths_loop))

@@ -173,7 +201,7 @@ def prepare_image_or_text_pdf(

     # Check if the file is an image type
     if file_extension in ['.jpg', '.jpeg', '.png']:
-        in_redact_method = "
+        in_redact_method = "Quick image analysis - typed text"

     # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
     if file_extension in ['.json']:

@@ -191,7 +219,7 @@ def prepare_image_or_text_pdf(
         print(out_message)
         return out_message, out_file_paths

-    if in_redact_method == "
+    if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
         # Analyse and redact image-based pdf or image
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."

@@ -201,7 +229,7 @@ def prepare_image_or_text_pdf(
         out_file_path = process_file(file_path)
         #print("Out file path at image conversion step:", out_file_path)

-    elif in_redact_method == "
+    elif in_redact_method == "Simple text analysis - PDFs with selectable text":
         if is_pdf(file_path) == False:
             out_message = "Please upload a PDF file for text analysis."
             print(out_message)
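The new get_input_file_names helper simply collects the base names of uploaded files into a comma-separated string for the usage logs. Roughly what it expects and returns, assuming (as the code suggests) that get_file_path_end strips the directory and extension; the SimpleNamespace objects stand in for Gradio's uploaded-file objects, which expose a .name path:

    from types import SimpleNamespace

    uploads = [SimpleNamespace(name="/tmp/gradio/abc/scan_page_1.png"),
               SimpleNamespace(name="/tmp/gradio/abc/customers.csv")]

    names_for_log = get_input_file_names(uploads)
    # -> "scan_page_1, customers"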
tools/file_redaction.py
CHANGED

@@ -4,10 +4,10 @@ import json
 import io
 import os
 from PIL import Image, ImageChops, ImageDraw
-from typing import List
 import pandas as pd

-from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
 from pikepdf import Pdf, Dictionary, Name

@@ -20,15 +20,38 @@ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRRes
 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult

-def

     tic = time.perf_counter()
-    all_request_metadata = []
-    all_request_metadata_str = ""

     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:

@@ -48,36 +71,164 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
     if latest_file_completed >= len(file_paths):
         print("Last file reached")
-        # Set to a very high number so as not to
         latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
         #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
-        string: The input string.
     estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
     print("Estimated total processing time:", str(estimate_total_processing_time))

-    return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time,

     file_paths_loop = [file_paths[int(latest_file_completed)]]

@@ -87,7 +238,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     else:
         in_allow_list_flat = []

     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name

@@ -97,19 +247,20 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         if is_a_pdf == False:
             # If user has not submitted a pdf, assume it's an image
             print("File is not a pdf, assuming that image analysis needs to be used.")
-            in_redact_method = "
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state,

-        if in_redact_method == "
-            #
-            print("Redacting file" + file_path_without_ext + "as an image-based file")
-            pdf_images, output_logs, logging_file_paths,

             # Save file
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"

@@ -128,30 +279,29 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            if
-            print("Request metadata:",
-            all_request_metadata.append(

             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed))
                 latest_file_completed += 1

-        elif in_redact_method == "
             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)

             # Convert message
             convert_message="Converting PDF to image-based PDF to embed redactions."
-            #progress(0.8, desc=convert_message)
             print(convert_message)

             # Convert document to image-based document to 'embed' redactions

@@ -164,10 +314,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
-            #out_file_paths.append(out_text_file_path)
             out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
             out_message.append(out_message_new)

@@ -178,8 +324,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state,

     toc = time.perf_counter()
     out_time = f"in {toc - tic:0.1f} seconds."

@@ -188,48 +333,105 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time

     if all_request_metadata:
         all_request_metadata_str = '\n'.join(all_request_metadata)

-        all_request_metadata_file_path = output_folder + "textract_request_metadata.txt"

         with open(all_request_metadata_file_path, "w") as f:
             f.write(all_request_metadata_str)

     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
             print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)

         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:,
             bboxes.extend(signature_recogniser_results)

-    #
         grouped_bboxes[round(box.top / vertical_threshold)].append(box)

-    #
     for _, group in grouped_bboxes.items():
         group.sort(key=lambda box: box.left)

         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
-                #print("Merging a box")
                 # Calculate new dimensions for the merged box
-                #print("Merged box:", merged_box)
                 if merged_box.text == next_box.text:
                     new_text = merged_box.text
                 else:

@@ -247,9 +449,10 @@ def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_reco
             merged_box = next_box

         merged_bboxes.append(merged_box)
     return merged_bboxes

-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="
     '''
     Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''

@@ -259,7 +462,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     fill = (0, 0, 0) # Fill colour
     decision_process_output_str = ""
     images = []
-    request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

     if not image_paths:

@@ -297,11 +500,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     all_ocr_results = []
     all_decision_process = []

-    if analysis_type == "
-    elif analysis_type == "AWS Textract": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"

     for n in range(0, number_of_pages):
         handwriting_or_signature_boxes = []

         try:
             image = image_paths[0][n]#.copy()

@@ -339,17 +544,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         else: ocr_lang = language

         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
-        if analysis_type == "

             ocr_results = image_analyser.perform_ocr(image)

             # Combine OCR results
-            ocr_results = combine_ocr_results(ocr_results)

         # Import results from json and convert
-        if analysis_type == "AWS Textract":

             # Convert the image to bytes using an in-memory buffer
             image_buffer = io.BytesIO()
             image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed

@@ -358,8 +568,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"

             if not os.path.exists(json_file_path):
-                text_blocks,
                 logging_file_paths.append(json_file_path)
             else:
                 # Open the file and load the JSON data
                 print("Found existing Textract json results file for this page.")

@@ -367,7 +578,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 text_blocks = json.load(json_file)
                 text_blocks = text_blocks['Blocks']

-            ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results = json_to_ocrresult(text_blocks, page_width, page_height)

         # Step 2: Analyze text and identify PII
         bboxes = image_analyser.analyze_text(

@@ -376,10 +593,18 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             entities=chosen_redact_entities,
             allow_list=allow_list,
             score_threshold=score_threshold,
-        )

         # Merge close bounding boxes
-        merged_bboxes = merge_img_bboxes(bboxes, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

         # Export the decision making process
         if merged_bboxes:

@@ -434,82 +659,19 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
         return [], []

 # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
-# '''
-# Merge identified bounding boxes containing PII that are very close to one another
-# '''
-# analyzed_bounding_boxes = []
-# if len(analyzer_results) > 0 and len(characters) > 0:
-#     merged_bounding_boxes = []
-#     current_box = None
-#     current_y = None
-
-#     for i, result in enumerate(analyzer_results):
-#         print("Considering result", str(i))
-#         for char in characters[result.start : result.end]:
-#             if isinstance(char, LTChar):
-#                 char_box = list(char.bbox)
-#                 # Add vertical padding to the top of the box
-#                 char_box[3] += vertical_padding
-
-#                 if current_y is None or current_box is None:
-#                     current_box = char_box
-#                     current_y = char_box[1]
-#                 else:
-#                     vertical_diff_bboxes = abs(char_box[1] - current_y)
-#                     horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
-
-#                     if (
-#                         vertical_diff_bboxes <= 5
-#                         and horizontal_diff_bboxes <= combine_pixel_dist
-#                     ):
-#                         current_box[2] = char_box[2] # Extend the current box horizontally
-#                         current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
-#                     else:
-#                         merged_bounding_boxes.append(
-#                             {"boundingBox": current_box, "result": result})
-
-#                         # Reset current_box and current_y after appending
-#                         current_box = char_box
-#                         current_y = char_box[1]
-
-#         # After finishing with the current result, add the last box for this result
-#         if current_box:
-#             merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
-#             current_box = None
-#             current_y = None # Reset for the next result
-
-#     if not merged_bounding_boxes:
-#         analyzed_bounding_boxes.extend(
-#             {"boundingBox": char.bbox, "result": result}
-#             for result in analyzer_results
-#             for char in characters[result.start:result.end]
-#             if isinstance(char, LTChar)
-#         )
-#     else:
-#         analyzed_bounding_boxes.extend(merged_bounding_boxes)
-
-#     print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
-
-#     return analyzed_bounding_boxes
-
-def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
     '''
-    Merge identified bounding boxes containing PII
     '''
     analyzed_bounding_boxes = []
-    merged_bounding_boxes = []
-    current_box = None
-    current_y = None
-    # Handle PII and text bounding boxes first
     if len(analyzer_results) > 0 and len(characters) > 0:
-        #print("Characters:", characters)
                 if isinstance(char, LTChar):
                     char_box = list(char.bbox)
                     # Add vertical padding to the top of the box

@@ -535,58 +697,121 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, verti
                         # Reset current_box and current_y after appending
                         current_box = char_box
                         current_y = char_box[1]
         # After finishing with the current result, add the last box for this result
         if current_box:
             merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
             current_box = None
             current_y = None # Reset for the next result

-                horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
-
-                if (
-                    vertical_diff_bboxes <= 5
-                    and horizontal_diff_bboxes <= combine_pixel_dist
-                ):
-                    current_box[2] = sig_box[2] # Extend the current box horizontally
-                    current_box[3] = max(current_box[3], sig_box[3]) # Ensure the top is the highest
-                else:
-                    merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-
-                    # Reset current_box and current_y after appending
-                    current_box = sig_box
-                    current_y = sig_box[1]
-
-            # Add the last bounding box for the signature
-            if current_box:
-                merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-                current_box = None
-                current_y = None
-
-        # If no bounding boxes were merged, add individual character bounding boxes
-        if not merged_bounding_boxes:
-            analyzed_bounding_boxes.extend(
-                {"boundingBox": char.bbox, "result": result}
-                for result in analyzer_results
-                for char in characters[result.start:result.end]
-                if isinstance(char, LTChar)
-            )
-        else:
-            analyzed_bounding_boxes.extend(merged_bounding_boxes)

     return analyzed_bounding_boxes

 def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
     decision_process_table = pd.DataFrame()

@@ -625,14 +850,14 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
         annotations_on_page.append(annotation)
     return annotations_on_page

-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
     decision_process_table_all_pages = []

-    combine_pixel_dist =

     pdf = Pdf.open(filename)
     page_num = 0

@@ -674,7 +899,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             text_container_analyzed_bounding_boxes = []
             characters = []

-            if analysis_type == "
             for i, text_container in enumerate(page_layout):

                 text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)

@@ -686,11 +911,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
             page_analyzer_results.extend(text_container_analyzer_results)

-            # Merge bounding boxes if very close together
-            text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
-
-            page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
-            page_analyzer_results.extend(text_container_analyzer_results)

             decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
 import io
 import os
 from PIL import Image, ImageChops, ImageDraw
+from typing import List, Dict
 import pandas as pd

+#from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
 from pikepdf import Pdf, Dictionary, Name

 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult

+def sum_numbers_before_seconds(string:str):
+    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+    Args:
+        string: The input string.
+
+    Returns:
+        The sum of all numbers before 'seconds' in the string.
+    """
+
+    # Extract numbers before 'seconds' using regular expression
+    numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
+
+    # Extract the numbers from the matches
+    numbers = [float(num.split()[0]) for num in numbers]
+
+    # Sum up the extracted numbers
+    sum_of_numbers = round(sum(numbers),1)
+
+    return sum_of_numbers
+
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
|
49 |
+
'''
|
50 |
+
Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
|
51 |
+
'''
|
52 |
|
53 |
tic = time.perf_counter()
|
54 |
+
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
|
|
55 |
|
56 |
# If this is the first time around, set variables to 0/blank
|
57 |
if first_loop_state==True:
|
|
|
71 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
72 |
if latest_file_completed >= len(file_paths):
|
73 |
print("Last file reached")
|
74 |
+
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
75 |
latest_file_completed = 99
|
76 |
final_out_message = '\n'.join(out_message)
|
77 |
#final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
78 |
+
|
79 |
+
estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
|
80 |
+
print("Estimated total processing time:", str(estimate_total_processing_time))
|
81 |
+
|
82 |
+
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
|
83 |
+
|
84 |
+
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
85 |
+
|
86 |
+
if not in_allow_list.empty:
|
87 |
+
in_allow_list_flat = in_allow_list[0].tolist()
|
88 |
+
print("In allow list:", in_allow_list_flat)
|
89 |
+
else:
|
90 |
+
in_allow_list_flat = []
|
91 |
+
|
92 |
+
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
93 |
+
file_path = file.name
|
94 |
+
|
95 |
+
if file_path:
|
96 |
+
file_path_without_ext = get_file_path_end(file_path)
|
97 |
+
is_a_pdf = is_pdf(file_path) == True
|
98 |
+
if is_a_pdf == False:
|
99 |
+
# If user has not submitted a pdf, assume it's an image
|
100 |
+
print("File is not a pdf, assuming that image analysis needs to be used.")
|
101 |
+
in_redact_method = "Quick image analysis - typed text"
|
102 |
+
else:
|
103 |
+
out_message = "No file selected"
|
104 |
+
print(out_message)
|
105 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
106 |
+
|
107 |
+
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
|
108 |
+
#Analyse and redact image-based pdf or image
|
109 |
+
if is_pdf_or_image(file_path) == False:
|
110 |
+
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
111 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
112 |
+
|
113 |
+
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
114 |
+
pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
|
115 |
+
|
116 |
+
# Save file
|
117 |
+
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
118 |
+
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
119 |
+
|
120 |
+
out_file_paths.append(out_image_file_path)
|
121 |
+
if logging_file_paths:
|
122 |
+
log_files_output_paths.extend(logging_file_paths)
|
123 |
+
|
124 |
+
out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
125 |
+
|
126 |
+
# Save decision making process
|
127 |
+
output_logs_str = str(output_logs)
|
128 |
+
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
129 |
+
with open(logs_output_file_name, "w") as f:
|
130 |
+
f.write(output_logs_str)
|
131 |
+
log_files_output_paths.append(logs_output_file_name)
|
132 |
+
|
133 |
+
# Save Textract request metadata (if exists)
|
134 |
+
if new_request_metadata:
|
135 |
+
print("Request metadata:", new_request_metadata)
|
136 |
+
all_request_metadata.append(new_request_metadata)
|
137 |
+
|
138 |
+
# Increase latest file completed count unless we are at the last file
|
139 |
+
if latest_file_completed != len(file_paths):
|
140 |
+
print("Completed file number:", str(latest_file_completed))
|
141 |
+
latest_file_completed += 1
|
142 |
+
|
143 |
+
elif in_redact_method == "Simple text analysis - PDFs with selectable text":
|
144 |
+
|
145 |
+
if is_pdf(file_path) == False:
|
146 |
+
return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
|
147 |
+
|
148 |
+
# Analyse text-based pdf
|
149 |
+
print('Redacting file as text-based PDF')
|
150 |
+
import time
|
151 |
+
import re
|
152 |
+
import json
|
153 |
+
import io
|
154 |
+
import os
|
155 |
+
from PIL import Image, ImageChops, ImageDraw
|
156 |
+
from typing import List, Dict
|
157 |
+
import pandas as pd
|
158 |
|
159 |
+
#from presidio_image_redactor.entities import ImageRecognizerResult
|
160 |
+
from pdfminer.high_level import extract_pages
|
161 |
+
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
|
162 |
+
from pikepdf import Pdf, Dictionary, Name
|
163 |
+
import gradio as gr
|
164 |
+
from gradio import Progress
|
165 |
|
166 |
+
from collections import defaultdict # For efficient grouping
|
|
|
167 |
|
168 |
+
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
169 |
+
from tools.file_conversion import process_file
|
170 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
171 |
+
from tools.helper_functions import get_file_path_end, output_folder
|
172 |
+
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
|
173 |
+
from tools.data_anonymise import generate_decision_process_output
|
174 |
+
from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
|
175 |
|
176 |
+
def sum_numbers_before_seconds(string:str):
|
177 |
+
"""Extracts numbers that precede the word 'seconds' from a string and adds them up.
|
178 |
|
179 |
+
Args:
|
180 |
+
string: The input string.
|
181 |
|
182 |
+
Returns:
|
183 |
+
The sum of all numbers before 'seconds' in the string.
|
184 |
+
"""
|
185 |
|
186 |
+
# Extract numbers before 'seconds' using regular expression
|
187 |
+
numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
|
188 |
|
189 |
+
# Extract the numbers from the matches
|
190 |
+
numbers = [float(num.split()[0]) for num in numbers]
|
191 |
+
|
192 |
+
# Sum up the extracted numbers
|
193 |
+
sum_of_numbers = round(sum(numbers),1)
|
194 |
+
|
195 |
+
return sum_of_numbers
|
196 |
+
|
197 |
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
|
198 |
+
'''
|
199 |
+
Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
|
200 |
+
'''
|
201 |
+
|
202 |
+
tic = time.perf_counter()
|
203 |
+
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
204 |
+
|
205 |
+
# If this is the first time around, set variables to 0/blank
|
206 |
+
if first_loop_state==True:
|
207 |
+
latest_file_completed = 0
|
208 |
+
#out_message = []
|
209 |
+
out_file_paths = []
|
210 |
+
|
211 |
+
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
212 |
+
if isinstance(out_message, str):
|
213 |
+
out_message = [out_message]
|
214 |
+
|
215 |
+
if not out_file_paths:
|
216 |
+
out_file_paths = []
|
217 |
+
|
218 |
+
latest_file_completed = int(latest_file_completed)
|
219 |
+
|
220 |
+
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
221 |
+
if latest_file_completed >= len(file_paths):
|
222 |
+
print("Last file reached")
|
223 |
+
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
224 |
+
latest_file_completed = 99
|
225 |
+
final_out_message = '\n'.join(out_message)
|
226 |
+
#final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
227 |
+
|
228 |
estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
|
229 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
230 |
|
231 |
+
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
|
232 |
|
233 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
234 |
|
|
|
238 |
else:
|
239 |
in_allow_list_flat = []
|
240 |
|
|
|
241 |
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
242 |
file_path = file.name
|
243 |
|
|
|
247 |
if is_a_pdf == False:
|
248 |
# If user has not submitted a pdf, assume it's an image
|
249 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
250 |
+
in_redact_method = "Quick image analysis - typed text"
|
251 |
else:
|
252 |
out_message = "No file selected"
|
253 |
print(out_message)
|
254 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
255 |
|
256 |
+
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
|
257 |
+
#Analyse and redact image-based pdf or image
|
258 |
+
if is_pdf_or_image(file_path) == False:
|
259 |
+
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
260 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
261 |
|
262 |
+
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
263 |
+
pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
|
264 |
|
265 |
# Save file
|
266 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
|
|
279 |
f.write(output_logs_str)
|
280 |
log_files_output_paths.append(logs_output_file_name)
|
281 |
|
282 |
+
# Save Textract request metadata (if exists)
|
283 |
+
if new_request_metadata:
|
284 |
+
print("Request metadata:", new_request_metadata)
|
285 |
+
all_request_metadata.append(new_request_metadata)
|
286 |
|
287 |
# Increase latest file completed count unless we are at the last file
|
288 |
if latest_file_completed != len(file_paths):
|
289 |
print("Completed file number:", str(latest_file_completed))
|
290 |
latest_file_completed += 1
|
291 |
|
292 |
+
elif in_redact_method == "Simple text analysis - PDFs with selectable text":
|
293 |
|
294 |
if is_pdf(file_path) == False:
|
295 |
return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
|
296 |
|
297 |
# Analyse text-based pdf
|
298 |
print('Redacting file as text-based PDF')
|
299 |
+
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
|
300 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
301 |
pdf_text.save(out_text_file_path)
|
302 |
|
303 |
# Convert message
|
304 |
convert_message="Converting PDF to image-based PDF to embed redactions."
|
|
|
305 |
print(convert_message)
|
306 |
|
307 |
# Convert document to image-based document to 'embed' redactions
|
|
|
314 |
f.write(output_logs_str)
|
315 |
log_files_output_paths.append(logs_output_file_name)
|
316 |
|
|
|
|
|
|
|
|
|
317 |
out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
|
318 |
out_message.append(out_message_new)
|
319 |
|
|
|
324 |
else:
|
325 |
out_message = "No redaction method selected"
|
326 |
print(out_message)
|
327 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
|
|
328 |
|
329 |
toc = time.perf_counter()
|
330 |
out_time = f"in {toc - tic:0.1f} seconds."
|
|
|
333 |
out_message_out = '\n'.join(out_message)
|
334 |
out_message_out = out_message_out + " " + out_time
|
335 |
|
336 |
+
# If textract requests made, write to logging file
|
337 |
if all_request_metadata:
|
338 |
all_request_metadata_str = '\n'.join(all_request_metadata)
|
339 |
|
340 |
+
all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
|
|
|
341 |
|
342 |
with open(all_request_metadata_file_path, "w") as f:
|
343 |
f.write(all_request_metadata_str)
|
344 |
+
|
345 |
+
# Add the request metadata to the log outputs if not there already
|
346 |
+
if all_request_metadata_file_path not in log_files_output_paths:
|
347 |
+
log_files_output_paths.append(all_request_metadata_file_path)
|
348 |
+
|
349 |
|
350 |
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
|
351 |
|
352 |
+
|
353 |
+
|
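One pattern worth noting in the function above: Textract request metadata is carried between Gradio calls as a single newline-joined string, split back into a list at the top of each call, appended to, and re-joined before being written to the log file. A small sketch of that round trip (the metadata strings are illustrative only):

    def accumulate_metadata(previous: str, new_item: str) -> str:
        # The running list survives between calls as one newline-joined string
        items = previous.split('\n') if previous else []
        if new_item:
            items.append(new_item)
        return '\n'.join(items)

    state = ""
    state = accumulate_metadata(state, "RequestId: 1111, Pages: 1")
    state = accumulate_metadata(state, "RequestId: 2222, Pages: 1")
    # state now holds one line per Textract call, ready to be written out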
+def bounding_boxes_overlap(box1, box2):
+    """Check if two bounding boxes overlap."""
+    return (box1[0] < box2[2] and box2[0] < box1[2] and
+            box1[1] < box2[3] and box2[1] < box1[3])
+
+def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

+    # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
             print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)

         if "Redact all identified signatures" in handwrite_signature_checkbox:
+            print("Signature boxes exist at merge:", signature_recogniser_results)
             bboxes.extend(signature_recogniser_results)

+    # Reconstruct bounding boxes for substrings of interest
+    reconstructed_bboxes = []
+    for bbox in bboxes:
+        bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
+        for line_text, line_info in combined_results.items():
+            line_box = line_info['bounding_box']
+            if bounding_boxes_overlap(bbox_box, line_box):
+                if bbox.text in line_text:
+                    start_char = line_text.index(bbox.text)
+                    end_char = start_char + len(bbox.text)
+
+                    relevant_words = []
+                    current_char = 0
+                    for word in line_info['words']:
+                        word_end = current_char + len(word['text'])
+                        if current_char <= start_char < word_end or current_char < end_char <= word_end:
+                            relevant_words.append(word)
+                        if word_end >= end_char:
+                            break
+                        current_char = word_end # +1 for space
+                        if not word['text'].endswith(' '):
+                            current_char += 1 # +1 for space if the word doesn't already end with a space
+
+                    if relevant_words:
+                        print("Relevant words:", relevant_words)
+                        left = min(word['bounding_box'][0] for word in relevant_words)
+                        top = min(word['bounding_box'][1] for word in relevant_words)
+                        right = max(word['bounding_box'][2] for word in relevant_words)
+                        bottom = max(word['bounding_box'][3] for word in relevant_words)
+
+                        # Combine the text of the relevant words
+                        combined_text = " ".join(word['text'] for word in relevant_words)
+
+                        reconstructed_bbox = CustomImageRecognizerResult(
+                            bbox.entity_type,
+                            bbox.start,
+                            bbox.end,
+                            bbox.score,
+                            left,
+                            top,
+                            right - left, # width
+                            bottom - top, # height
+                            combined_text
+                        )
+                        reconstructed_bboxes.append(reconstructed_bbox)
+                        break
+        else:
+            # If the bbox text is not found in any line in combined_results, keep the original bbox
+            reconstructed_bboxes.append(bbox)
+
+    # Group reconstructed bboxes by approximate vertical proximity
+    for box in reconstructed_bboxes:
         grouped_bboxes[round(box.top / vertical_threshold)].append(box)

+    # Merge within each group
     for _, group in grouped_bboxes.items():
         group.sort(key=lambda box: box.left)

         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                 # Calculate new dimensions for the merged box
                 if merged_box.text == next_box.text:
                     new_text = merged_box.text
                 else:

             merged_box = next_box

         merged_bboxes.append(merged_box)
+
     return merged_bboxes
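The grouping trick above, shown in isolation: boxes whose tops fall within the same vertical_threshold-sized band share a bucket and become merge candidates, after which each bucket is sorted left-to-right and merged when the horizontal gap is small enough.

    from collections import defaultdict

    def group_by_line(boxes, vertical_threshold=25):
        grouped = defaultdict(list)
        for box in boxes:                      # box = (left, top, width, height)
            grouped[round(box[1] / vertical_threshold)].append(box)
        return {k: sorted(v) for k, v in grouped.items()}

    print(group_by_line([(300, 52, 40, 20), (100, 48, 60, 20), (110, 400, 80, 20)]))
    # -> two buckets: one holding the two boxes near top~50, one for the box at top=400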
455 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
|
456 |
'''
|
457 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
458 |
'''
|
|
|
462 |
fill = (0, 0, 0) # Fill colour
|
463 |
decision_process_output_str = ""
|
464 |
images = []
|
465 |
+
#request_metadata = {}
|
466 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
467 |
|
468 |
if not image_paths:
|
|
|
500 |
all_ocr_results = []
|
501 |
all_decision_process = []
|
502 |
|
503 |
+
if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
|
504 |
+
elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
|
505 |
|
506 |
for n in range(0, number_of_pages):
|
507 |
handwriting_or_signature_boxes = []
|
508 |
+
signature_recogniser_results = []
|
509 |
+
handwriting_recogniser_results = []
|
510 |
|
511 |
try:
|
512 |
image = image_paths[0][n]#.copy()
|
|
|
544 |
else: ocr_lang = language
|
545 |
|
546 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
547 |
+
if analysis_type == "Quick image analysis - typed text":
|
548 |
|
549 |
ocr_results = image_analyser.perform_ocr(image)
|
550 |
|
551 |
# Combine OCR results
|
552 |
+
ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
|
553 |
+
|
554 |
+
# Save decision making process
|
555 |
+
ocr_results_with_children_str = str(ocr_results_with_children)
|
556 |
+
logs_output_file_name = output_folder + "ocr_with_children.txt"
|
557 |
+
with open(logs_output_file_name, "w") as f:
|
558 |
+
f.write(ocr_results_with_children_str)
|
559 |
|
560 |
        # Import results from json and convert
+       if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":

            # Convert the image to bytes using an in-memory buffer
            image_buffer = io.BytesIO()
            image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed

            ...

            json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"

            if not os.path.exists(json_file_path):
+               text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
                logging_file_paths.append(json_file_path)
+               request_metadata = request_metadata + "\n" + new_request_metadata
            else:
                # Open the file and load the JSON data
                print("Found existing Textract json results file for this page.")
                ...
                    text_blocks = json.load(json_file)
                text_blocks = text_blocks['Blocks']

+           ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
+
+           # Save decision making process
+           ocr_results_with_children_str = str(ocr_results_with_children)
+           logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
+           with open(logs_output_file_name, "w") as f:
+               f.write(ocr_results_with_children_str)

        # Step 2: Analyze text and identify PII
        bboxes = image_analyser.analyze_text(
            ...
            entities=chosen_redact_entities,
            allow_list=allow_list,
            score_threshold=score_threshold,
+       )
+
+       if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+       elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+
+       # Save decision making process
+       bboxes_str = str(bboxes)
+       with open(interim_results_file_path, "w") as f:
+           f.write(bboxes_str)

        # Merge close bounding boxes
+       merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

        # Export the decision making process
        if merged_bboxes:

        ...

        return [], []

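The "Complex image analysis" branch above only calls Textract when no cached JSON exists for the page, and otherwise reloads the saved response, so re-running a redaction does not re-bill the API for pages already analysed; the page's JSON path is also appended to the logging outputs. Below is a hedged sketch of that cache-then-call pattern. The direct boto3 analyze_document call with the SIGNATURES feature and the helper name are assumptions for illustration, not the app's analyse_page_with_textract.

# Sketch: call Textract for a page image only if a cached JSON response is missing.
# Assumes AWS credentials are configured; names and feature choices are illustrative.
import json
import os
import boto3

def analyse_page_with_cache(image_bytes: bytes, json_file_path: str) -> dict:
    if os.path.exists(json_file_path):
        # Reuse the saved response rather than paying for another Textract call
        with open(json_file_path, "r") as f:
            return json.load(f)

    client = boto3.client("textract")
    # AnalyzeDocument with the SIGNATURES feature also returns signature blocks
    # alongside the usual LINE/WORD text blocks
    response = client.analyze_document(
        Document={"Bytes": image_bytes},
        FeatureTypes=["SIGNATURES"],
    )

    with open(json_file_path, "w") as f:
        json.dump(response, f)
    return response

The response's 'Blocks' list is what a converter like json_to_ocrresult above would then translate into OCR-style results; the os.path.exists check mirrors the caching test shown in the branch.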

# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
+def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
    '''
+   Merge identified bounding boxes containing PII that are very close to one another
    '''
    analyzed_bounding_boxes = []
    if len(analyzer_results) > 0 and len(characters) > 0:
+       merged_bounding_boxes = []
+       current_box = None
+       current_y = None

+       for i, result in enumerate(analyzer_results):
+           print("Considering result", str(i))
+           for char in characters[result.start : result.end]:
                if isinstance(char, LTChar):
                    char_box = list(char.bbox)
                    # Add vertical padding to the top of the box

                    ...

                        # Reset current_box and current_y after appending
                        current_box = char_box
                        current_y = char_box[1]
+
            # After finishing with the current result, add the last box for this result
            if current_box:
                merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
                current_box = None
                current_y = None  # Reset for the next result

+       if not merged_bounding_boxes:
+           analyzed_bounding_boxes.extend(
+               {"boundingBox": char.bbox, "result": result}
+               for result in analyzer_results
+               for char in characters[result.start:result.end]
+               if isinstance(char, LTChar)
+           )
+       else:
+           analyzed_bounding_boxes.extend(merged_bounding_boxes)

+   print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)

    return analyzed_bounding_boxes

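merge_bounding_boxes above walks the pdfminer characters that fall inside each Presidio result's start/end span and grows one rectangle until the vertical or horizontal gap exceeds the thresholds. The sketch below shows the simpler span-to-box idea it builds on, taking a plain union of the character boxes for one result; the helper name and the union (rather than gap-aware) merge are assumptions for clarity, not the function above.

# Sketch: union the pdfminer character boxes behind one Presidio result.
# Assumes pdfminer.six and presidio-analyzer are installed; 'characters' would
# normally come from iterating a text container in the page layout.
from pdfminer.layout import LTChar
from presidio_analyzer import AnalyzerEngine

def bounding_box_for_result(result, characters):
    """Return (x0, y0, x1, y1) covering the characters in result.start:result.end."""
    boxes = [char.bbox for char in characters[result.start:result.end]
             if isinstance(char, LTChar)]
    if not boxes:
        return None
    x0 = min(b[0] for b in boxes)
    y0 = min(b[1] for b in boxes)
    x1 = max(b[2] for b in boxes)
    y1 = max(b[3] for b in boxes)
    return (x0, y0, x1, y1)

# Example wiring (page_text and characters come from the PDF layout):
# analyzer = AnalyzerEngine()
# results = analyzer.analyze(text=page_text, language="en", entities=["PERSON"])
# boxes = [bounding_box_for_result(r, characters) for r in results]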
+
+# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+#     '''
+#     Merge identified bounding boxes containing PII or signatures that are very close to one another.
+#     '''
+#     analyzed_bounding_boxes = []
+#     merged_bounding_boxes = []
+#     current_box = None
+#     current_y = None
+
+#     # Handle PII and text bounding boxes first
+#     if len(analyzer_results) > 0 and len(characters) > 0:
+#         for i, result in enumerate(analyzer_results):
+#             #print("Considering result", str(i))
+#             #print("Result:", result)
+#             #print("Characters:", characters)
+
+#             for char in characters[result.start: result.end]:
+#                 if isinstance(char, LTChar):
+#                     char_box = list(char.bbox)
+#                     # Add vertical padding to the top of the box
+#                     char_box[3] += vertical_padding
+
+#                     if current_y is None or current_box is None:
+#                         current_box = char_box
+#                         current_y = char_box[1]
+#                     else:
+#                         vertical_diff_bboxes = abs(char_box[1] - current_y)
+#                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+
+#                         if (
+#                             vertical_diff_bboxes <= 5
+#                             and horizontal_diff_bboxes <= combine_pixel_dist
+#                         ):
+#                             current_box[2] = char_box[2]  # Extend the current box horizontally
+#                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
+#                         else:
+#                             merged_bounding_boxes.append(
+#                                 {"boundingBox": current_box, "result": result})
+
+#                             # Reset current_box and current_y after appending
+#                             current_box = char_box
+#                             current_y = char_box[1]
+
+#             # After finishing with the current result, add the last box for this result
+#             if current_box:
+#                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
+#                 current_box = None
+#                 current_y = None  # Reset for the next result
+
+#     # Handle signature bounding boxes (without specific characters)
+#     if signature_bounding_boxes is not None:
+#         for sig_box in signature_bounding_boxes:
+#             sig_box = list(sig_box)  # Ensure it's a list to modify the values
+#             if current_y is None or current_box is None:
+#                 current_box = sig_box
+#                 current_y = sig_box[1]
+#             else:
+#                 vertical_diff_bboxes = abs(sig_box[1] - current_y)
+#                 horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
+
+#                 if (
+#                     vertical_diff_bboxes <= 5
+#                     and horizontal_diff_bboxes <= combine_pixel_dist
+#                 ):
+#                     current_box[2] = sig_box[2]  # Extend the current box horizontally
+#                     current_box[3] = max(current_box[3], sig_box[3])  # Ensure the top is the highest
+#                 else:
+#                     merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+
+#                     # Reset current_box and current_y after appending
+#                     current_box = sig_box
+#                     current_y = sig_box[1]
+
+#         # Add the last bounding box for the signature
+#         if current_box:
+#             merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+#             current_box = None
+#             current_y = None
+
+#     # If no bounding boxes were merged, add individual character bounding boxes
+#     if not merged_bounding_boxes:
+#         analyzed_bounding_boxes.extend(
+#             {"boundingBox": char.bbox, "result": result}
+#             for result in analyzer_results
+#             for char in characters[result.start:result.end]
+#             if isinstance(char, LTChar)
+#         )
+#     else:
+#         analyzed_bounding_boxes.extend(merged_bounding_boxes)
+
+#     #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+
+#     return analyzed_bounding_boxes
+
def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
    decision_process_table = pd.DataFrame()

    ...

        annotations_on_page.append(annotation)
    return annotations_on_page

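create_text_redaction_process_results turns the merged boxes and their analyzer results into a reviewable table. A hedged sketch of that bookkeeping is below; the column names are assumed for illustration and are not necessarily the app's schema. It reuses the {"boundingBox": ..., "result": ...} dictionaries produced by merge_bounding_boxes above.

# Sketch with assumed columns: log each PII decision to a DataFrame for later review.
import pandas as pd

def build_decision_log(analyzed_bounding_boxes, page_num):
    rows = []
    for entry in analyzed_bounding_boxes:
        result = entry["result"]  # a presidio RecognizerResult
        rows.append({
            "page": page_num,
            "entity_type": result.entity_type,  # e.g. "PERSON", "EMAIL_ADDRESS"
            "start": result.start,
            "end": result.end,
            "score": result.score,
            "boundingBox": entry["boundingBox"],
        })
    return pd.DataFrame(rows)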
+def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
    '''
    Redact chosen entities from a PDF made up of multiple pages of selectable text (rather than images).
    '''
    annotations_all_pages = []
    decision_process_table_all_pages = []

+   combine_pixel_dist = 200 # PII bounding boxes whose horizontal gap is at or under this value are combined into one

    pdf = Pdf.open(filename)
    page_num = 0

    ...

        text_container_analyzed_bounding_boxes = []
        characters = []

+       if analysis_type == "Simple text analysis - PDFs with selectable text":
            for i, text_container in enumerate(page_layout):

                text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)

                ...

                page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
                page_analyzer_results.extend(text_container_analyzer_results)

        decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

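redact_text_pdf depends on a per-page page_layout whose text containers expose LTChar objects with on-page coordinates. The sketch below shows one way to obtain such a layout with pdfminer.six; the generator shape and the per-container character list are assumptions for illustration, not the app's analyze_text_container.

# Sketch: pull text and character boxes out of a selectable-text PDF with pdfminer.six.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

def iterate_pdf_characters(filename):
    for page_num, page_layout in enumerate(extract_pages(filename)):
        for text_container in page_layout:
            if isinstance(text_container, LTTextContainer):
                text = text_container.get_text()
                # Flatten the container into characters, keeping their bounding boxes
                characters = [char
                              for line in text_container
                              for char in line
                              if isinstance(char, LTChar)]
                yield page_num, text, characters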
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -18,7 +18,7 @@ score_threshold = 0.001
# Custom title recogniser
import re
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
-titles_regex = '\\b' + '
+titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
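The rebuilt titles_regex joins the escaped titles into a single word-boundary-delimited alternation, so titles containing a full stop (e.g. "Mr.", "Dr.") no longer inject unescaped regex metacharacters. A quick illustration of the pattern and of wiring it into a presidio PatternRecognizer, using a made-up sentence, is below.

# Illustration of the rebuilt titles regex; the sample sentence is invented.
import re
from presidio_analyzer import Pattern, PatternRecognizer

titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'

print(titles_regex)  # \bSir\b|\bMa'am\b|\bMadam\b|...|\bProfessor\b
print(re.findall(titles_regex, "Dear Dr Jones and Mrs Smith"))  # ['Dr', 'Mrs']

titles_pattern = Pattern(name="titles_pattern", regex=titles_regex, score=1)
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns=[titles_pattern])
print(titles_recogniser.analyze(text="Dear Dr Jones and Mrs Smith", entities=["TITLES"]))  # two TITLES matches ('Dr' and 'Mrs')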