seanpedrickcase committed on
Commit
8652429
·
1 Parent(s): 6ea0852

Optimised Textract and Tesseract workings

README.md CHANGED
@@ -9,9 +9,10 @@ pinned: false
9
  license: mit
10
  ---
11
 
12
- # Introduction
13
- Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
14
 
15
- WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
16
 
17
- Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
9
  license: mit
10
  ---
11
 
12
+ # Document redaction
 
13
 
14
+ Redact personal information from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Documents and images can be redacted with 'Quick' image analysis, which works well for typed text but not for handwriting or signatures. To redact those more complex elements, choose the 'Complex image analysis' option, which uses AWS Textract OCR (available on AWS deployments only; this service has a cost, so please reserve it for the more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and terms to exclude from redaction.
15
 
16
+ NOTE: In testing, the app identified only about 60% of the personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
17
+
18
+ This app accepts a maximum file size of 50MB. Please consider giving feedback on the quality of the results via the options that appear underneath the redact buttons; this will help to improve the app.
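The allow list mentioned above is a plain CSV file with one word or phrase per row (case sensitive); anything listed in it will not be redacted. A minimal sketch of building such a file with pandas (the file name and terms are illustrative, and whether a header row is expected depends on the app's `custom_regex_load` loader, which is not shown here):

```python
import pandas as pd

# One case-sensitive term per row, in a single column; these terms
# will be excluded from redaction when the file is imported.
allow_list = pd.DataFrame({"allow_list": ["Lambeth", "Lambeth 2030"]})
allow_list.to_csv("allow_list.csv", index=False, header=False)
```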
app.py CHANGED
@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
7
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
8
  from tools.aws_functions import upload_file_to_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
- from tools.file_conversion import prepare_image_or_text_pdf
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  #from tools.aws_functions import load_data_from_aws
@@ -37,6 +37,9 @@ app = gr.Blocks(theme = gr.themes.Base())
37
 
38
  with app:
39
 
40
  prepared_pdf_state = gr.State([])
41
  output_image_files_state = gr.State([])
42
  output_file_list_state = gr.State([])
@@ -56,23 +59,38 @@ with app:
56
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
57
  access_s3_logs_loc_state = gr.State(access_logs_folder)
58
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
59
- usage_s3_logs_loc_state = gr.State(usage_logs_folder)
60
 
61
  gr.Markdown(
62
  """
63
  # Document redaction
64
 
65
- Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction. If you are getting 0 redactions, it's possible that the text in the document is saved in image format instead of as selectable text. Select 'Image analysis' on the Settings page in this case.
66
 
67
- WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
68
 
69
- This app accepts a maximum file size of 10mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
70
  """)
71
 
 
72
  with gr.Tab("PDFs/images"):
73
-
74
  with gr.Accordion("Redact document", open = True):
75
- in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
 
 
76
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
77
 
78
  with gr.Row():
@@ -83,16 +101,14 @@ with app:
83
  with gr.Row():
84
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
85
 
 
86
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
87
  pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
88
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
89
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
90
-
91
- with gr.Row():
92
- s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
93
- # This keeps track of the time taken to redact files for logging purposes.
94
- estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
95
 
 
96
  with gr.Tab(label="Open text or Excel/csv files"):
97
  gr.Markdown(
98
  """
@@ -115,19 +131,21 @@ with app:
115
  text_output_file = gr.File(label="Output files")
116
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
117
 
 
118
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
119
  data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
120
  choices=["The results were good", "The results were not good"], visible=False)
121
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
122
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
123
 
 
124
  with gr.Tab(label="Redaction settings"):
125
  gr.Markdown(
126
  """
127
  Define redaction settings that affect both document and open text redaction.
128
  """)
129
  with gr.Accordion("Settings for documents", open = True):
130
- in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis", "AWS Textract"])
131
  with gr.Row():
132
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
133
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
@@ -140,53 +158,47 @@ with app:
140
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
141
  with gr.Row():
142
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
143
- #in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
144
  with gr.Row():
145
  in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
146
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
147
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
148
  log_files_output = gr.File(label="Log file output", interactive=False)
149
 
150
- # Invisible text box to hold the session hash/username and Textract request metadata just for logging purposes
151
- session_hash_textbox = gr.Textbox(value="", visible=False)
152
- textract_metadata_textbox = gr.Textbox(value="", visible=False)
153
-
154
- # AWS options - placeholder for possibility of storing data on s3
155
- # with gr.Tab(label="Advanced options"):
156
- # with gr.Accordion(label = "AWS data access", open = True):
157
- # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
158
- # with gr.Row():
159
- # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
160
- # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
161
-
162
- # aws_log_box = gr.Textbox(label="AWS data load status")
163
-
164
- # ### Loading AWS data ###
165
- # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
166
-
167
  # If a custom allow list is uploaded
168
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
169
 
170
- # Document redaction
171
- document_redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
172
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
173
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
174
 
175
  # If the output file count text box changes, keep going with redacting each document until done
176
- text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
177
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox],
178
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
179
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
180
 
181
- # Tabular data redaction
182
- in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
183
 
184
- tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_text")
185
 
186
  # If the output file count text box changes, keep going with redacting each data file until done
187
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
188
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
189
 
190
  # Get connection details on app load
191
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
192
 
@@ -198,8 +210,8 @@ with app:
198
 
199
  # User submitted feedback for pdf redactions
200
  pdf_callback = gr.CSVLogger()
201
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_logs_folder)
202
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
203
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
204
 
205
  # User submitted feedback for data redactions
@@ -210,8 +222,8 @@ with app:
210
 
211
  # Log processing time/token usage when making a query
212
  usage_callback = gr.CSVLogger()
213
- usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
214
- estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
215
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
216
 
217
  # Launch the Gradio app
@@ -222,4 +234,18 @@ if __name__ == "__main__":
222
  if os.environ['COGNITO_AUTH'] == "1":
223
  app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
224
  else:
225
- app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
7
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
8
  from tools.aws_functions import upload_file_to_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  #from tools.aws_functions import load_data_from_aws
 
37
 
38
  with app:
39
 
40
+ ###
41
+ # STATE VARIABLES
42
+ ###
43
  prepared_pdf_state = gr.State([])
44
  output_image_files_state = gr.State([])
45
  output_file_list_state = gr.State([])
 
59
  access_logs_state = gr.State(access_logs_folder + 'log.csv')
60
  access_s3_logs_loc_state = gr.State(access_logs_folder)
61
  usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
62
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)
63
+
64
+ # Invisible elements effectively used as state variables
65
+ session_hash_textbox = gr.Textbox(value="", visible=False) # Holds the session hash/username for logging purposes.
66
+ textract_metadata_textbox = gr.Textbox(value="", visible=False)
67
+ doc_file_name_textbox = gr.Textbox(value="", visible=False)
68
+ data_file_name_textbox = gr.Textbox(value="", visible=False)
69
+ s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
70
+ estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
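The block above mixes two state-holding approaches, and the distinction matters for logging. A minimal sketch of the difference (component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    # gr.State holds an arbitrary per-session Python object (lists, dicts, ...),
    # but it is not a regular component that can be fed to logging callbacks.
    prepared_docs = gr.State([])

    # An invisible Textbox only holds a string, but it behaves like any other
    # component, so it can be passed to gr.CSVLogger.setup()/.flag() below.
    session_id = gr.Textbox(value="", visible=False)
```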
71
+
72
+
73
+ ###
74
+ # UI DESIGN
75
+ ###
76
 
77
  gr.Markdown(
78
  """
79
  # Document redaction
80
 
81
+ Redact personal information from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Documents and images can be redacted with 'Quick' image analysis, which works well for typed text but not for handwriting or signatures. To redact those more complex elements, choose the 'Complex image analysis' option, which uses AWS Textract OCR (available on AWS deployments only; this service has a cost, so please reserve it for the more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and terms to exclude from redaction.
82
 
83
+ NOTE: In testing, the app identified only about 60% of the personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
84
 
85
+ This app accepts a maximum file size of 50MB. Please consider giving feedback on the quality of the results via the options that appear underneath the redact buttons; this will help to improve the app.
86
  """)
87
 
88
+ # PDF / IMAGES TAB
89
  with gr.Tab("PDFs/images"):
 
90
  with gr.Accordion("Redact document", open = True):
91
+ in_doc_files = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
92
+ in_redaction_method = gr.Radio(label="Choose the document redaction method. Note that AWS Textract incurs a cost from use of AWS services.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - AWS Textract, handwriting/signatures"])
93
+ gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
94
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
95
 
96
  with gr.Row():
 
101
  with gr.Row():
102
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
103
 
104
+ # Feedback elements are invisible until revealed by redaction action
105
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
106
  pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
107
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
108
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
109
+
110
 
111
+ # TEXT / TABULAR DATA TAB
112
  with gr.Tab(label="Open text or Excel/csv files"):
113
  gr.Markdown(
114
  """
 
131
  text_output_file = gr.File(label="Output files")
132
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
133
 
134
+ # Feedback elements are invisible until revealed by redaction action
135
  data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
136
  data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
137
  choices=["The results were good", "The results were not good"], visible=False)
138
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
139
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
140
 
141
+ # SETTINGS TAB
142
  with gr.Tab(label="Redaction settings"):
143
  gr.Markdown(
144
  """
145
  Define redaction settings that affect both document and open text redaction.
146
  """)
147
  with gr.Accordion("Settings for documents", open = True):
148
+
149
  with gr.Row():
150
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
151
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
158
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
159
  with gr.Row():
160
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
161
+ # Upload 'Allow list' for terms not to be redacted
162
  with gr.Row():
163
  in_allow_list = gr.UploadButton(label="Import allow list file.", file_count="multiple")
164
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
165
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
166
  log_files_output = gr.File(label="Log file output", interactive=False)
167
 
 
168
  # If a custom allow list is uploaded
169
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
170
 
171
+ ###
172
+ # PDF/IMAGE REDACTION
173
+ ###
174
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox])
175
+
176
+ document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare_doc").\
177
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
178
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
179
 
180
  # If the output file count text box changes, keep going with redacting each document until done
181
+ text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
182
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
183
  outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
184
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
185
 
186
+ ###
187
+ # TABULAR DATA REDACTION
188
+ ###
189
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
190
+ then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_textbox])
191
 
192
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
193
 
194
  # If the output file count text box changes, keep going with redacting each data file until done
195
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
196
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
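Both redaction flows use the same looping idiom: the button click processes one file and updates a hidden counter, and the counter's `.change` event re-invokes the handler until every file is done. A minimal self-contained sketch of the pattern (`process_next` and the component names are illustrative, not the app's own functions):

```python
import gradio as gr

def process_next(files, done, summary):
    # Handle one file per call. Returning an unchanged counter value means
    # the .change event does not fire again, which ends the loop.
    if files and done < len(files):
        summary = summary + f"\nProcessed file {int(done) + 1} of {len(files)}"
        done = done + 1
    return done, summary

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    files_done = gr.Number(value=0, visible=False)
    summary = gr.Textbox(value="")
    redact_btn = gr.Button("Redact")

    # The first click starts the loop; each counter change continues it.
    redact_btn.click(process_next, [in_files, files_done, summary], [files_done, summary])
    files_done.change(process_next, [in_files, files_done, summary], [files_done, summary])

demo.launch()
```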
197
 
198
+ ###
199
+ # APP LOAD AND LOGGING
200
+ ###
201
+
202
  # Get connection details on app load
203
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
204
 
 
210
 
211
  # User submitted feedback for pdf redactions
212
  pdf_callback = gr.CSVLogger()
213
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_doc_files], feedback_logs_folder)
214
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_doc_files], None, preprocess=False).\
215
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
216
 
217
  # User submitted feedback for data redactions
 
222
 
223
  # Log processing time/token usage when making a query
224
  usage_callback = gr.CSVLogger()
225
+ usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
226
+ estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
227
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
228
 
229
  # Launch the Gradio app
 
234
  if os.environ['COGNITO_AUTH'] == "1":
235
  app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
236
  else:
237
+ app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
238
+
239
+
240
+ # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
241
+ # with gr.Tab(label="Advanced options"):
242
+ # with gr.Accordion(label = "AWS data access", open = True):
243
+ # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
244
+ # with gr.Row():
245
+ # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
246
+ # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
247
+
248
+ # aws_log_box = gr.Textbox(label="AWS data load status")
249
+
250
+ # ### Loading AWS data ###
251
+ # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
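The usage-logging wiring near the end pairs `gr.CSVLogger` with the invisible components: `setup()` binds the logger to a component list and a folder, and any change to `estimated_time_taken_number` flushes one CSV row per redaction run. A stripped-down sketch of the same idiom (folder and component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    session_hash = gr.Textbox(value="", visible=False)
    time_taken = gr.Number(value=0.0, visible=False)

    usage_logger = gr.CSVLogger()
    usage_logger.setup([session_hash, time_taken], "usage_logs")  # rows land in usage_logs/log.csv

    # preprocess=False hands the raw component values straight to the logger.
    time_taken.change(
        lambda *args: usage_logger.flag(list(args)),
        [session_hash, time_taken],
        None,
        preprocess=False,
    )
```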
tools/aws_textract.py CHANGED
@@ -44,7 +44,7 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
44
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
45
 
46
  text_blocks = response['Blocks']
47
- request_metadata = extract_textract_metadata(response)
48
 
49
  # Write the response to a JSON file
50
  with open(json_file_path, 'w') as json_file:
@@ -92,56 +92,75 @@ def json_to_ocrresult(json_data, page_width, page_height):
92
  signatures = []
93
  handwriting = []
94
 
95
  for text_block in json_data:
96
 
97
  is_signature = False
98
  is_handwriting = False
99
 
100
- if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
101
 
102
- if (text_block['BlockType'] == 'LINE'):
103
-
104
- # If a line, pull out the text type and confidence from the child words and get text, bounding box
105
 
106
- if 'Text' in text_block:
107
- text = text_block['Text']
 
 
109
  if 'Relationships' in text_block:
110
  for relationship in text_block['Relationships']:
111
  if relationship['Type'] == 'CHILD':
112
  for child_id in relationship['Ids']:
113
  child_block = next((block for block in json_data if block['Id'] == child_id), None)
114
- if child_block and 'TextType' in child_block:
115
- text_type = child_block['TextType']
116
- confidence = text_block['Confidence']
117
- break
118
- break
119
-
120
- # Extract BoundingBox details
121
- bbox = text_block["Geometry"]["BoundingBox"]
122
- left = bbox["Left"]
123
- top = bbox["Top"]
124
- width = bbox["Width"]
125
- height = bbox["Height"]
126
-
127
- # Convert proportional coordinates to absolute coordinates
128
- left_abs = int(left * page_width)
129
- top_abs = int(top * page_height)
130
- width_abs = int(width * page_width)
131
- height_abs = int(height * page_height)
132
 
133
- # If handwriting or signature, add to bounding box
134
 
135
- if text_type == "HANDWRITING":
136
- is_handwriting = True
137
- entity_name = "HANDWRITING"
138
- word_end = len(entity_name)
139
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
140
- handwriting.append(recogniser_result)
141
- print("Handwriting found:", handwriting[-1])
142
 
143
  elif (text_block['BlockType'] == 'SIGNATURE'):
144
- text = "SIGNATURE"
145
 
146
  is_signature = True
147
  entity_name = "SIGNATURE"
@@ -161,12 +180,25 @@ def json_to_ocrresult(json_data, page_width, page_height):
161
  width_abs = int(width * page_width)
162
  height_abs = int(height * page_height)
163
 
164
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
165
  signatures.append(recogniser_result)
166
  print("Signature found:", signatures[-1])
167
 
168
  # Create OCRResult with absolute coordinates
169
- ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)
170
  all_ocr_results.append(ocr_result)
171
 
172
  is_signature_or_handwriting = is_signature | is_handwriting
@@ -178,4 +210,4 @@ def json_to_ocrresult(json_data, page_width, page_height):
178
  if is_signature: signature_recogniser_results.append(recogniser_result)
179
  if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
180
 
181
- return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results
 
44
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
45
 
46
  text_blocks = response['Blocks']
47
+ request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
48
 
49
  # Write the response to a JSON file
50
  with open(json_file_path, 'w') as json_file:
 
92
  signatures = []
93
  handwriting = []
94
 
95
+ combined_results = {}
96
+
97
  for text_block in json_data:
98
 
99
  is_signature = False
100
  is_handwriting = False
101
 
102
+
103
 
104
+ if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
 
105
 
106
+ if text_block['BlockType'] == 'LINE':
107
+ # Extract text and bounding box for the line
108
+ line_text = text_block.get('Text', '')
109
+ line_bbox = text_block["Geometry"]["BoundingBox"]
110
+ line_left = int(line_bbox["Left"] * page_width)
111
+ line_top = int(line_bbox["Top"] * page_height)
112
+ line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
113
+ line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
114
 
115
+ words = []
116
  if 'Relationships' in text_block:
117
  for relationship in text_block['Relationships']:
118
  if relationship['Type'] == 'CHILD':
119
  for child_id in relationship['Ids']:
120
  child_block = next((block for block in json_data if block['Id'] == child_id), None)
121
+ if child_block and child_block['BlockType'] == 'WORD':
122
+ word_text = child_block.get('Text', '')
123
+ word_bbox = child_block["Geometry"]["BoundingBox"]
124
+ confidence = child_block.get('Confidence', 0) # default to 0 rather than '' so the score is always numeric
125
+ word_left = int(word_bbox["Left"] * page_width)
126
+ word_top = int(word_bbox["Top"] * page_height)
127
+ word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
128
+ word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
129
+
130
+ # Extract BoundingBox details
131
+ width = word_bbox["Width"]
132
+ height = word_bbox["Height"]
133
+
134
+ # Convert proportional coordinates to absolute coordinates
135
+ width_abs = int(width * page_width)
136
+ height_abs = int(height * page_height)
137
+
138
+ words.append({
139
+ 'text': word_text,
140
+ 'bounding_box': (word_left, word_top, word_right, word_bottom)
141
+ })
142
+ # Check for handwriting
143
+ text_type = child_block.get("TextType", '')
144
+
145
+ if text_type == "HANDWRITING":
146
+ is_handwriting = True
147
+ entity_name = "HANDWRITING"
148
+ word_end = len(entity_name)
149
+ recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=width_abs, height=height_abs)
150
+ handwriting.append(recogniser_result)
151
+ print("Handwriting found:", handwriting[-1])
152
+
153
+ combined_results[line_text] = {
154
+ 'bounding_box': (line_left, line_top, line_right, line_bottom),
155
+ 'words': words
156
+ }
157
 
 
158
 
159
+
160
+ # If the block is a signature, add a bounding box covering the whole block
161
 
162
  elif (text_block['BlockType'] == 'SIGNATURE'):
163
+ line_text = "SIGNATURE"
164
 
165
  is_signature = True
166
  entity_name = "SIGNATURE"
 
180
  width_abs = int(width * page_width)
181
  height_abs = int(height * page_height)
182
 
183
+ recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
184
  signatures.append(recogniser_result)
185
  print("Signature found:", signatures[-1])
186
 
187
+ # Extract BoundingBox details
188
+ bbox = text_block["Geometry"]["BoundingBox"]
189
+ left = bbox["Left"]
190
+ top = bbox["Top"]
191
+ width = bbox["Width"]
192
+ height = bbox["Height"]
193
+
194
+ # Convert proportional coordinates to absolute coordinates
195
+ left_abs = int(left * page_width)
196
+ top_abs = int(top * page_height)
197
+ width_abs = int(width * page_width)
198
+ height_abs = int(height * page_height)
199
+
200
  # Create OCRResult with absolute coordinates
201
+ ocr_result = OCRResult(line_text, left_abs, top_abs, width_abs, height_abs)
202
  all_ocr_results.append(ocr_result)
203
 
204
  is_signature_or_handwriting = is_signature | is_handwriting
 
210
  if is_signature: signature_recogniser_results.append(recogniser_result)
211
  if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
212
 
213
+ return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, combined_results
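For orientation, `json_to_ocrresult` consumes the `Blocks` list produced by a call like the one in `analyse_page_with_textract`, converts Textract's proportional bounding boxes into absolute pixels, and now also returns `combined_results`, which maps each line's text to its pixel bounding box plus the per-word boxes collected above. A minimal sketch of the Textract call and the coordinate conversion (file name and page size are illustrative):

```python
import boto3

client = boto3.client("textract")

with open("page.png", "rb") as f:
    response = client.analyze_document(
        Document={"Bytes": f.read()}, FeatureTypes=["SIGNATURES"]
    )

# Textract reports boxes as proportions of the page, so multiply by the
# page dimensions to get pixel coordinates.
page_width, page_height = 2480, 3508  # e.g. A4 scanned at 300 dpi
for block in response["Blocks"]:
    if block["BlockType"] == "LINE":
        bbox = block["Geometry"]["BoundingBox"]
        left, top = int(bbox["Left"] * page_width), int(bbox["Top"] * page_height)
        width, height = int(bbox["Width"] * page_width), int(bbox["Height"] * page_height)
        print(block.get("Text", ""), (left, top, width, height))
```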
tools/custom_image_analyser_engine.py CHANGED
@@ -1,9 +1,14 @@
1
  import pytesseract
2
- from PIL import Image
3
  import numpy as np
4
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
5
  from typing import List, Dict, Optional, Union, Tuple
6
  from dataclasses import dataclass
 
7
 
8
  @dataclass
9
  class OCRResult:
@@ -25,17 +30,399 @@ class CustomImageRecognizerResult:
25
  height: int
26
  text: str
27
 
  class CustomImageAnalyzerEngine:
29
  def __init__(
30
  self,
31
  analyzer_engine: Optional[AnalyzerEngine] = None,
32
- tesseract_config: Optional[str] = None
 
33
  ):
34
  if not analyzer_engine:
35
  analyzer_engine = AnalyzerEngine()
36
  self.analyzer_engine = analyzer_engine
37
  self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
38
 
39
  def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
40
  # Ensure image is a PIL Image
41
  if isinstance(image, str):
@@ -43,18 +430,30 @@ class CustomImageAnalyzerEngine:
43
  elif isinstance(image, np.ndarray):
44
  image = Image.fromarray(image)
45
 
46
- ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT, config=self.tesseract_config)
47
 
48
  # Filter out empty strings and low confidence results
49
- valid_indices = [i for i, text in enumerate(ocr_data['text']) if text.strip() and int(ocr_data['conf'][i]) > 0]
50
 
51
  return [
52
  OCRResult(
53
- text=ocr_data['text'][i],
54
- left=ocr_data['left'][i],
55
- top=ocr_data['top'][i],
56
- width=ocr_data['width'][i],
57
- height=ocr_data['height'][i]
58
  )
59
  for i in valid_indices
60
  ]
@@ -86,7 +485,7 @@ class CustomImageAnalyzerEngine:
86
  text=relevant_text,
87
  left=ocr_result.left + self.estimate_x_offset(ocr_result, result.start),
88
  top=ocr_result.top,
89
- width=self.estimate_width(ocr_result, result.start, result.end),
90
  height=ocr_result.height
91
  )
92
 
@@ -132,28 +531,160 @@ class CustomImageAnalyzerEngine:
132
  text_position = word_end + 1 # +1 for the space between words
133
 
134
  return pii_bboxes
 
135
 
136
  @staticmethod
137
  def estimate_x_offset(ocr_result: OCRResult, start: int) -> int:
138
  # Estimate the x-offset in pixels, proportional to the character position within the text
139
  # This is a simple estimation and might need refinement for variable-width fonts
140
  return int(start / len(ocr_result.text) * ocr_result.width) if ocr_result.text else 0
141
 
142
- @staticmethod
143
- def estimate_width(ocr_result: OCRResult, start: int, end: int) -> int:
144
- # Estimate the width of the relevant text portion
145
- full_width = ocr_result.width
146
- full_length = len(ocr_result.text)
147
- return int((end - start) / full_length * full_width)
148
 
149
  # Function to combine OCR results into line-level results
150
- def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
151
  # Sort OCR results by 'top' to ensure line order
152
  ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))
153
 
154
  combined_results = []
 
155
  current_line = []
156
  current_bbox = None
 
157
 
158
  for result in ocr_results:
159
  if not current_line:
@@ -178,11 +709,33 @@ def combine_ocr_results(ocr_results, x_threshold = 20, y_threshold = 10):
178
  else:
179
  # Commit the current line and start a new one
180
  combined_results.append(current_bbox)
181
  current_line = [result]
182
  current_bbox = result
183
 
184
  # Append the last line
185
  if current_bbox:
186
  combined_results.append(current_bbox)
187
 
188
- return combined_results
 
1
  import pytesseract
 
2
  import numpy as np
3
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
4
+ #from presidio_image_redactor import ImagePreprocessor
5
  from typing import List, Dict, Optional, Union, Tuple
6
  from dataclasses import dataclass
7
+ import cv2
8
+ import PIL
9
+ from PIL import ImageDraw, ImageFont, Image, ImageOps # ImageOps is needed for the PIL.ImageOps.invert call below
10
+ from typing import Optional, Tuple, Union
11
+ from copy import deepcopy
12
 
13
  @dataclass
14
  class OCRResult:
 
30
  height: int
31
  text: str
32
 
33
+ class ImagePreprocessor:
34
+ """ImagePreprocessor class.
35
+
36
+ Parent class for image preprocessing objects.
37
+ """
38
+
39
+ def __init__(self, use_greyscale: bool = True) -> None:
40
+ """Initialize the ImagePreprocessor class.
41
+
42
+ :param use_greyscale: Whether to convert the image to greyscale.
43
+ """
44
+ self.use_greyscale = use_greyscale
45
+
46
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
47
+ """Preprocess the image to be analyzed.
48
+
49
+ :param image: Loaded PIL image.
50
+
51
+ :return: The processed image and any metadata regarding the
52
+ preprocessing approach.
53
+ """
54
+ return image, {}
55
+
56
+ def convert_image_to_array(self, image: Image.Image) -> np.ndarray:
57
+ """Convert PIL image to numpy array.
58
+
59
+ :param image: Loaded PIL image.
61
+
62
+ :return: image pixels as a numpy array.
63
+
64
+ """
65
+
66
+ if isinstance(image, np.ndarray):
67
+ img = image
68
+ else:
69
+ if self.use_greyscale:
70
+ image = image.convert("L")
71
+ img = np.asarray(image)
72
+ return img
73
+
74
+ @staticmethod
75
+ def _get_bg_color(
76
+ image: Image.Image, is_greyscale: bool, invert: bool = False
77
+ ) -> Union[int, Tuple[int, int, int]]:
78
+ """Select most common color as background color.
79
+
80
+ :param image: Loaded PIL image.
81
+ :param is_greyscale: Whether the image is greyscale.
82
+ :param invert: TRUE if you want to get the inverse of the bg color.
83
+
84
+ :return: Background color.
85
+ """
86
+ # Invert colors if invert flag is True
87
+ if invert:
88
+ if image.mode == "RGBA":
89
+ # Handle transparency as needed
90
+ r, g, b, a = image.split()
91
+ rgb_image = Image.merge("RGB", (r, g, b))
92
+ inverted_image = PIL.ImageOps.invert(rgb_image)
93
+ r2, g2, b2 = inverted_image.split()
94
+
95
+ image = Image.merge("RGBA", (r2, g2, b2, a))
96
+
97
+ else:
98
+ image = PIL.ImageOps.invert(image)
99
+
100
+ # Get background color
101
+ if is_greyscale:
102
+ # Select most common color as color
103
+ bg_color = int(np.bincount(image.flatten()).argmax())
104
+ else:
105
+ # Reduce size of image to 1 pixel to get dominant color
106
+ tmp_image = image.copy()
107
+ tmp_image = tmp_image.resize((1, 1), resample=0)
108
+ bg_color = tmp_image.getpixel((0, 0))
109
+
110
+ return bg_color
111
+
112
+ @staticmethod
113
+ def _get_image_contrast(image: np.ndarray) -> Tuple[float, float]:
114
+ """Compute the contrast level and mean intensity of an image.
115
+
116
+ :param image: Input image pixels (as a numpy array).
117
+
118
+ :return: A tuple containing the contrast level and mean intensity of the image.
119
+ """
120
+ contrast = np.std(image)
121
+ mean_intensity = np.mean(image)
122
+ return contrast, mean_intensity
123
+
124
+ class BilateralFilter(ImagePreprocessor):
125
+ """BilateralFilter class.
126
+
127
+ The class applies bilateral filtering to an image and returns the filtered
128
+ image and metadata.
129
+ """
130
+
131
+ def __init__(
132
+ self, diameter: int = 3, sigma_color: int = 40, sigma_space: int = 40
133
+ ) -> None:
134
+ """Initialize the BilateralFilter class.
135
+
136
+ :param diameter: Diameter of each pixel neighborhood.
137
+ :param sigma_color: value of sigma in the color space.
138
+ :param sigma_space: value of sigma in the coordinate space.
139
+ """
140
+ super().__init__(use_greyscale=True)
141
+
142
+ self.diameter = diameter
143
+ self.sigma_color = sigma_color
144
+ self.sigma_space = sigma_space
145
+
146
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
147
+ """Preprocess the image to be analyzed.
148
+
149
+ :param image: Loaded PIL image.
150
+
151
+ :return: The processed image and metadata (diameter, sigma_color, sigma_space).
152
+ """
153
+ image = self.convert_image_to_array(image)
154
+
155
+ # Apply bilateral filtering
156
+ filtered_image = cv2.bilateralFilter(
157
+ image,
158
+ self.diameter,
159
+ self.sigma_color,
160
+ self.sigma_space,
161
+ )
162
+
163
+ metadata = {
164
+ "diameter": self.diameter,
165
+ "sigma_color": self.sigma_color,
166
+ "sigma_space": self.sigma_space,
167
+ }
168
+
169
+ return Image.fromarray(filtered_image), metadata
170
+
171
+
172
+ class SegmentedAdaptiveThreshold(ImagePreprocessor):
173
+ """SegmentedAdaptiveThreshold class.
174
+
175
+ The class applies adaptive thresholding to an image
176
+ and returns the thresholded image and metadata.
177
+ The parameters used to run the adaptive thresholding are selected based on
178
+ the contrast level of the image.
179
+ """
180
+
181
+ def __init__(
182
+ self,
183
+ block_size: int = 5,
184
+ contrast_threshold: int = 40,
185
+ c_low_contrast: int = 10,
186
+ c_high_contrast: int = 40,
187
+ bg_threshold: int = 122,
188
+ ) -> None:
189
+ """Initialize the SegmentedAdaptiveThreshold class.
190
+
191
+ :param block_size: Size of the neighborhood area for threshold calculation.
192
+ :param contrast_threshold: Threshold for low contrast images.
193
+ :param c_low_contrast: Constant added to the mean for low contrast images.
194
+ :param c_high_contrast: Constant added to the mean for high contrast images.
195
+ :param bg_threshold: Threshold for background color.
196
+ """
197
+
198
+ super().__init__(use_greyscale=True)
199
+ self.block_size = block_size
200
+ self.c_low_contrast = c_low_contrast
201
+ self.c_high_contrast = c_high_contrast
202
+ self.bg_threshold = bg_threshold
203
+ self.contrast_threshold = contrast_threshold
204
+
205
+ def preprocess_image(
206
+ self, image: Union[Image.Image, np.ndarray]
207
+ ) -> Tuple[Image.Image, dict]:
208
+ """Preprocess the image.
209
+
210
+ :param image: Loaded PIL image.
211
+
212
+ :return: The processed image and metadata (C, background_color, contrast).
213
+ """
214
+ if not isinstance(image, np.ndarray):
215
+ image = self.convert_image_to_array(image)
216
+
217
+ # Determine background color
218
+ background_color = self._get_bg_color(image, True)
219
+ contrast, _ = self._get_image_contrast(image)
220
+
221
+ c = (
222
+ self.c_low_contrast
223
+ if contrast <= self.contrast_threshold
224
+ else self.c_high_contrast
225
+ )
226
+
227
+ if background_color < self.bg_threshold:
228
+ adaptive_threshold_image = cv2.adaptiveThreshold(
229
+ image,
230
+ 255,
231
+ cv2.ADAPTIVE_THRESH_MEAN_C,
232
+ cv2.THRESH_BINARY_INV,
233
+ self.block_size,
234
+ -c,
235
+ )
236
+ else:
237
+ adaptive_threshold_image = cv2.adaptiveThreshold(
238
+ image,
239
+ 255,
240
+ cv2.ADAPTIVE_THRESH_MEAN_C,
241
+ cv2.THRESH_BINARY,
242
+ self.block_size,
243
+ c,
244
+ )
245
+
246
+ metadata = {"C": c, "background_color": background_color, "contrast": contrast}
247
+ return Image.fromarray(adaptive_threshold_image), metadata
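A quick standalone check of the thresholder above, assuming a local scanned page (the file name is illustrative). Low-contrast images get the smaller constant `c`, and a dark background switches to the inverted threshold:

```python
from PIL import Image

thresholder = SegmentedAdaptiveThreshold(block_size=5, contrast_threshold=40)
page = Image.open("scan.png")

binary_image, info = thresholder.preprocess_image(page)
print(info)  # e.g. {"C": 10, "background_color": 255, "contrast": 35.2}
```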
248
+
249
+
250
+
251
+
252
+ class ImageRescaling(ImagePreprocessor):
253
+ """ImageRescaling class. Rescales images based on their size."""
254
+
255
+ def __init__(
256
+ self,
257
+ small_size: int = 1048576,
258
+ large_size: int = 4000000,
259
+ factor: int = 2,
260
+ interpolation: int = cv2.INTER_AREA,
261
+ ) -> None:
262
+ """Initialize the ImageRescaling class.
263
+
264
+ :param small_size: Threshold for small image size.
265
+ :param large_size: Threshold for large image size.
266
+ :param factor: Scaling factor for resizing.
267
+ :param interpolation: Interpolation method for resizing.
268
+ """
269
+ super().__init__(use_greyscale=True)
270
+
271
+ self.small_size = small_size
272
+ self.large_size = large_size
273
+ self.factor = factor
274
+ self.interpolation = interpolation
275
+
276
+ def preprocess_image(self, image: np.ndarray) -> Tuple[Image.Image, dict]:
277
+ """Preprocess the image to be analyzed.
278
+
279
+ :param image: Image pixels as a numpy array (its .size, the total pixel count, is compared against the size thresholds).
280
+
281
+ :return: The processed image and metadata (scale_factor).
282
+ """
283
+
284
+ scale_factor = 1
285
+ if image.size < self.small_size:
286
+ scale_factor = self.factor
287
+ elif image.size > self.large_size:
288
+ scale_factor = 1 / self.factor
289
+
290
+ width = int(image.shape[1] * scale_factor)
291
+ height = int(image.shape[0] * scale_factor)
292
+ dimensions = (width, height)
293
+
294
+ # resize image
295
+ rescaled_image = cv2.resize(image, dimensions, interpolation=self.interpolation)
296
+ metadata = {"scale_factor": scale_factor}
297
+ return Image.fromarray(rescaled_image), metadata
298
+
299
+
300
+ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
301
+ """Class containing all logic to perform contrastive segmentation.
302
+
303
+ Contrastive segmentation is a preprocessing step that aims to enhance the
304
+ text in an image by increasing the contrast between the text and the
305
+ background. The parameters used to run the preprocessing are selected based
306
+ on the contrast level of the image.
307
+ """
308
+
309
+ def __init__(
310
+ self,
311
+ bilateral_filter: Optional[BilateralFilter] = None,
312
+ adaptive_threshold: Optional[SegmentedAdaptiveThreshold] = None,
313
+ image_rescaling: Optional[ImageRescaling] = None,
314
+ low_contrast_threshold: int = 40,
315
+ ) -> None:
316
+ """Initialize the class.
317
+
318
+ :param bilateral_filter: Optional BilateralFilter instance.
319
+ :param adaptive_threshold: Optional AdaptiveThreshold instance.
320
+ :param image_rescaling: Optional ImageRescaling instance.
321
+ :param low_contrast_threshold: Threshold for low contrast images.
322
+ """
323
+
324
+ super().__init__(use_greyscale=True)
325
+ if not bilateral_filter:
326
+ self.bilateral_filter = BilateralFilter()
327
+ else:
328
+ self.bilateral_filter = bilateral_filter
329
+
330
+ if not adaptive_threshold:
331
+ self.adaptive_threshold = SegmentedAdaptiveThreshold()
332
+ else:
333
+ self.adaptive_threshold = adaptive_threshold
334
+
335
+ if not image_rescaling:
336
+ self.image_rescaling = ImageRescaling()
337
+ else:
338
+ self.image_rescaling = image_rescaling
339
+
340
+ self.low_contrast_threshold = low_contrast_threshold
341
+
342
+ def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
343
+ """Preprocess the image to be analyzed.
344
+
345
+ :param image: Loaded PIL image.
346
+
347
+ :return: The processed image and metadata (background color, scale percentage,
348
+ contrast level, and C value).
349
+ """
350
+ image = self.convert_image_to_array(image)
351
+
352
+ # Apply bilateral filtering
353
+ filtered_image, _ = self.bilateral_filter.preprocess_image(image)
354
+
355
+ # Convert to grayscale
356
+ pil_filtered_image = Image.fromarray(np.uint8(filtered_image))
357
+ pil_grayscale_image = pil_filtered_image.convert("L")
358
+ grayscale_image = np.asarray(pil_grayscale_image)
359
+
360
+ # Improve contrast
361
+ adjusted_image, _, adjusted_contrast = self._improve_contrast(grayscale_image)
362
+
363
+ # Adaptive Thresholding
364
+ adaptive_threshold_image, _ = self.adaptive_threshold.preprocess_image(
365
+ adjusted_image
366
+ )
367
+ # Increase contrast
368
+ _, threshold_image = cv2.threshold(
369
+ np.asarray(adaptive_threshold_image),
370
+ 0,
371
+ 255,
372
+ cv2.THRESH_BINARY | cv2.THRESH_OTSU,
373
+ )
374
+
375
+ # Rescale image
376
+ rescaled_image, scale_metadata = self.image_rescaling.preprocess_image(
377
+ threshold_image
378
+ )
379
+
380
+ return rescaled_image, scale_metadata
381
+
382
+ def _improve_contrast(self, image: np.ndarray) -> Tuple[np.ndarray, str, str]:
383
+ """Improve the contrast of an image based on its initial contrast level.
384
+
385
+ :param image: Input image.
386
+
387
+ :return: A tuple containing the improved image, the initial contrast level,
388
+ and the adjusted contrast level.
389
+ """
390
+ contrast, mean_intensity = self._get_image_contrast(image)
391
+
392
+ if contrast <= self.low_contrast_threshold:
393
+ alpha = 1.5
394
+ beta = -mean_intensity * alpha
395
+ adjusted_image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
396
+ adjusted_contrast, _ = self._get_image_contrast(adjusted_image)
397
+ else:
398
+ adjusted_image = image
399
+ adjusted_contrast = contrast
400
+ return adjusted_image, contrast, adjusted_contrast
401
+
402
  class CustomImageAnalyzerEngine:
403
  def __init__(
404
  self,
405
  analyzer_engine: Optional[AnalyzerEngine] = None,
406
+ tesseract_config: Optional[str] = None,
407
+ image_preprocessor: Optional[ImagePreprocessor] = None
408
  ):
409
  if not analyzer_engine:
410
  analyzer_engine = AnalyzerEngine()
411
  self.analyzer_engine = analyzer_engine
412
  self.tesseract_config = tesseract_config or '--oem 3 --psm 11'
413
 
414
+ if not image_preprocessor:
415
+ # image_preprocessor = ImagePreprocessor(
416
+ # c_low_contrast=10,
417
+ # c_high_contrast=20,
418
+ # contrast_threshold=0.5,
419
+ # bg_threshold=128,
420
+ # block_size=11
421
+ # )
422
+ image_preprocessor = ContrastSegmentedImageEnhancer()
423
+ print(image_preprocessor)
424
+ self.image_preprocessor = image_preprocessor
425
+
426
  def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
427
  # Ensure image is a PIL Image
428
  if isinstance(image, str):
 
430
  elif isinstance(image, np.ndarray):
431
  image = Image.fromarray(image)
432
 
433
+ image_processed, preprocessing_metadata = self.image_preprocessor.preprocess_image(image)
434
+
435
+ #print("pre-processing metadata:", preprocessing_metadata)
436
+ #image_processed.save("image_processed.png")
437
+
438
+ ocr_data = pytesseract.image_to_data(image_processed, output_type=pytesseract.Output.DICT, config=self.tesseract_config)
439
+
440
+ if preprocessing_metadata and ("scale_factor" in preprocessing_metadata):
441
+ ocr_result = self._scale_bbox_results(
442
+ ocr_data, preprocessing_metadata["scale_factor"]
443
+ )
+ else:
+ ocr_result = ocr_data # no rescaling was applied, so use the raw Tesseract output
444
+
445
+ ocr_result = self.remove_space_boxes(ocr_result)
446
 
447
  # Filter out empty strings and low confidence results
448
+ valid_indices = [i for i, text in enumerate(ocr_result['text']) if text.strip() and int(ocr_result['conf'][i]) > 0]
449
 
450
  return [
451
  OCRResult(
452
+ text=ocr_result['text'][i],
453
+ left=ocr_result['left'][i],
454
+ top=ocr_result['top'][i],
455
+ width=ocr_result['width'][i],
456
+ height=ocr_result['height'][i]
457
  )
458
  for i in valid_indices
459
  ]
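Putting the new `perform_ocr` path together: the page is enhanced, Tesseract reads the enhanced copy, and the resulting boxes are divided by `scale_factor` to land back on the original image. A minimal usage sketch, assuming the classes above are importable and a local page image (file name illustrative):

```python
from PIL import Image
import pytesseract

enhancer = ContrastSegmentedImageEnhancer()
page = Image.open("page.png")

# Bilateral filter -> contrast adjustment -> adaptive threshold -> rescale.
processed, metadata = enhancer.preprocess_image(page)

ocr_data = pytesseract.image_to_data(
    processed, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 11"
)

# The boxes were measured on the rescaled copy; undo the scaling.
scale = metadata.get("scale_factor", 1)
lefts = [int(x / scale) for x in ocr_data["left"]]
```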
 
485
  text=relevant_text,
486
  left=ocr_result.left + self.estimate_x_offset(ocr_result, result.start),
487
  top=ocr_result.top,
488
+ width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
489
  height=ocr_result.height
490
  )
491
 
 
531
  text_position = word_end + 1 # +1 for the space between words
532
 
533
  return pii_bboxes
534
+
535
+ @staticmethod
536
+ def remove_space_boxes(ocr_result: dict) -> dict:
537
+ """Remove OCR bboxes that are for spaces.
538
+
539
+ :param ocr_result: OCR results (raw or thresholded).
540
+ :return: OCR results with empty words removed.
541
+ """
542
+ # Get indices of items with no text
543
+ idx = list()
544
+ for i, text in enumerate(ocr_result["text"]):
545
+ is_not_space = text.isspace() is False
546
+ if text != "" and is_not_space:
547
+ idx.append(i)
548
+
549
+ # Only retain items with text
550
+ filtered_ocr_result = {}
551
+ for key in list(ocr_result.keys()):
552
+ filtered_ocr_result[key] = [ocr_result[key][i] for i in idx]
553
+
554
+ return filtered_ocr_result
555
+
556
+ @staticmethod
557
+ def _scale_bbox_results(
558
+ ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
559
+ ) -> Dict[str, List[Union[int, str]]]:
560
+ """Scale down the bounding box results based on a scale percentage.
561
+
562
+ :param ocr_result: OCR results (raw).
563
+ :param scale_factor: Scale factor applied to the image during preprocessing.
564
+
565
+ :return: OCR results (scaled).
566
+ """
567
+ scaled_results = deepcopy(ocr_result)
568
+ coordinate_keys = ["left", "top"]
569
+ dimension_keys = ["width", "height"]
570
+
571
+ for coord_key in coordinate_keys:
572
+ scaled_results[coord_key] = [
573
+ int(np.ceil((x) / (scale_factor))) for x in scaled_results[coord_key]
574
+ ]
575
+
576
+ for dim_key in dimension_keys:
577
+ scaled_results[dim_key] = [
578
+ max(1, int(np.ceil(x / (scale_factor))))
579
+ for x in scaled_results[dim_key]
580
+ ]
581
+ return scaled_results
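
The preprocessor may resize a page before OCR runs, and `_scale_bbox_results` then divides every reported coordinate back down by the recorded `scale_factor` so the boxes land on the original image. A minimal sketch of that round trip, with an invented two-word OCR dict and an assumed 2x upscale:

    import numpy as np
    from copy import deepcopy

    # Pretend Tesseract ran on an image the preprocessor had doubled in size
    ocr_data = {"text": ["Name:", "Jane"], "conf": ["96", "91"],
                "left": [100, 220], "top": [50, 50], "width": [110, 90], "height": [24, 24]}
    scale_factor = 2.0  # assumption: metadata recorded a 2x resize

    scaled = deepcopy(ocr_data)
    for key in ["left", "top"]:
        scaled[key] = [int(np.ceil(x / scale_factor)) for x in scaled[key]]
    for key in ["width", "height"]:
        scaled[key] = [max(1, int(np.ceil(x / scale_factor))) for x in scaled[key]]

    print(scaled["left"], scaled["width"])  # [50, 110] [55, 45] - back in page coordinates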

     @staticmethod
     def estimate_x_offset(full_text: str, start: int) -> int:
         # Estimate the x-offset based on character position
         # This is a simple estimation and might need refinement for variable-width fonts
         return int(start / len(full_text) * len(full_text))
+
+    def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+        # Extract the relevant text portion
+        relevant_text = ocr_result.text[start:end]
+
+        # If the relevant text is the same as the full text, return the full width
+        if relevant_text == ocr_result.text:
+            return ocr_result.width
+
+        # Estimate width based on the proportion of the relevant text length to the total text length
+        total_text_length = len(ocr_result.text)
+        relevant_text_length = len(relevant_text)
+
+        if total_text_length == 0:
+            return 0  # Avoid division by zero
+
+        # Proportion of the relevant text to the total text
+        proportion = relevant_text_length / total_text_length
+
+        # Estimate the width based on the proportion
+        estimated_width = int(proportion * ocr_result.width)
+
+        return estimated_width
+
+
+    # def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
+    #     # Extract the relevant text portion
+    #     relevant_text = ocr_result.text[start:end]
+
+    #     # Check if the relevant text is the entire text of the OCR result
+    #     if relevant_text == ocr_result.text:
+    #         return ocr_result.width
+
+    #     # Estimate the font size based on the height of the bounding box
+    #     estimated_font_size = ocr_result.height + 4
+
+    #     # Create a blank image with enough width to measure the text
+    #     dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
+    #     draw = ImageDraw.Draw(dummy_image)
+
+    #     # Specify the font and size
+    #     try:
+    #         font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
+    #     except IOError:
+    #         font = ImageFont.load_default() # Fallback to default font if the specified font is not found
+
+    #     # Draw the relevant text on the image
+    #     draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
+
+    #     # Save the image for debugging purposes
+    #     dummy_image.save("debug_image.png")
+
+    #     # Use pytesseract to get the bounding box of the relevant text
+    #     bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
+
+    #     # Print the bbox for debugging
+    #     print("Bounding box:", bbox)
+
+    #     # Calculate the width from the bounding box
+    #     if bbox:
+    #         try:
+    #             # Initialize min_left and max_right with extreme values
+    #             min_left = float('inf')
+    #             max_right = float('-inf')
+
+    #             # Split the bbox string into lines
+    #             bbox_lines = bbox.splitlines()
+
+    #             for line in bbox_lines:
+    #                 parts = line.split()
+    #                 if len(parts) == 6:
+    #                     _, left, _, right, _, _ = parts
+    #                     left = int(left)
+    #                     right = int(right)
+    #                     min_left = min(min_left, left)
+    #                     max_right = max(max_right, right)
+
+    #             width = max_right - min_left
+    #         except ValueError as e:
+    #             print("Error parsing bounding box:", e)
+    #             width = 0
+    #     else:
+    #         width = 0
+
+    #     print("Estimated width:", width)
+
+    #     return width
+

 # Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold=20, y_threshold=3):
     # Sort OCR results by 'top' to ensure line order
     ocr_results = sorted(ocr_results, key=lambda x: (x.top, x.left))

     combined_results = []
+    new_format_results = {}
     current_line = []
     current_bbox = None
+    line_counter = 1

     for result in ocr_results:
         if not current_line:

         else:
             # Commit the current line and start a new one
             combined_results.append(current_bbox)
+            new_format_results[current_bbox.text] = {  # f"combined_text_{line_counter}"
+                'bounding_box': (current_bbox.left, current_bbox.top,
+                                 current_bbox.left + current_bbox.width,
+                                 current_bbox.top + current_bbox.height),
+                'words': [{'text': word.text,
+                           'bounding_box': (word.left, word.top,
+                                            word.left + word.width,
+                                            word.top + word.height)}
+                          for word in current_line]
+            }
+            line_counter += 1
             current_line = [result]
             current_bbox = result

     # Append the last line
     if current_bbox:
         combined_results.append(current_bbox)
+        new_format_results[current_bbox.text] = {  # f"combined_text_{line_counter}"
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+
+    return combined_results, new_format_results
 
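One caveat worth noting: `new_format_results` is keyed by the combined line text (a `combined_text_{line_counter}` key is left as a comment), so two identical lines on the same page will overwrite one another in the dictionary. A hypothetical illustration of the returned shape for a one-line page, with invented coordinates:

    # combined_results would hold one line-level OCRResult covering "Name: Jane";
    # new_format_results maps that line's text to its line box plus per-word boxes:
    new_format_results = {
        'Name: Jane': {
            'bounding_box': (100, 50, 300, 74),   # (left, top, right, bottom)
            'words': [{'text': 'Name:', 'bounding_box': (100, 50, 210, 74)},
                      {'text': 'Jane', 'bounding_box': (220, 50, 300, 74)}],
        },
    }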
 
tools/data_anonymise.py CHANGED
@@ -195,7 +195,9 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     df_dict = df.to_dict(orient="list")

     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
@@ -371,7 +373,9 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos


     if in_allow_list:
-        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []

     anon_df = pd.DataFrame()
     #out_file_paths = []
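The rationale appears to be that the allow list now arrives already flat (for example, a single column read from the custom allow-list CSV), so the old list-of-lists flattening would instead iterate each string and explode it into characters. A quick, runnable illustration of the failure mode the new guard avoids (the names are made up):

    in_allow_list = ["Jane Doe", "ACME Ltd"]          # already flat, as now passed in

    old_flatten = [item for sublist in in_allow_list for item in sublist]
    print(old_flatten[:5])     # ['J', 'a', 'n', 'e', ' '] - iterating strings yields characters

    in_allow_list_flat = in_allow_list if in_allow_list else []
    print(in_allow_list_flat)  # ['Jane Doe', 'ACME Ltd']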
tools/file_conversion.py CHANGED
@@ -98,7 +98,33 @@ def process_file(file_path):

     return img_object

-def prepare_image_or_text_pdf(
+def get_input_file_names(file_input):
+    '''
+    Get list of input files to report to logs.
+    '''
+
+    all_relevant_files = []
+
+    for file in file_input:
+        file_path = file.name
+        print(file_path)
+        file_path_without_ext = get_file_path_end(file_path)
+
+        #print("file:", file_path)
+
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        # Check if the file is an image type
+        if file_extension in ['.jpg', '.jpeg', '.png', '.xlsx', '.csv', '.parquet']:
+            all_relevant_files.append(file_path_without_ext)
+
+    all_relevant_files_str = ", ".join(all_relevant_files)
+
+    print("all_relevant_files_str:", all_relevant_files_str)
+
+    return all_relevant_files_str
+
+def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
     in_allow_list: Optional[List[List[str]]] = None,
@@ -159,6 +185,8 @@ def prepare_image_or_text_pdf(

     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

+    progress(0.1, desc='Preparing file')
+
     file_paths_loop = [file_paths[int(latest_file_completed)]]
     #print("file_paths_loop:", str(file_paths_loop))

@@ -173,7 +201,7 @@ def prepare_image_or_text_pdf(

         # Check if the file is an image type
         if file_extension in ['.jpg', '.jpeg', '.png']:
-            in_redact_method = "Image analysis"
+            in_redact_method = "Quick image analysis - typed text"

         # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
         if file_extension in ['.json']:
@@ -191,7 +219,7 @@ def prepare_image_or_text_pdf(
             print(out_message)
             return out_message, out_file_paths

-        if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
             # Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -201,7 +229,7 @@ def prepare_image_or_text_pdf(
             out_file_path = process_file(file_path)
             #print("Out file path at image conversion step:", out_file_path)

-        elif in_redact_method == "Text analysis":
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":
             if is_pdf(file_path) == False:
                 out_message = "Please upload a PDF file for text analysis."
                 print(out_message)
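`get_input_file_names` expects the objects Gradio's file input produces (anything exposing the temporary file path as `.name`). A minimal sketch of how it behaves, using stand-in objects instead of real uploads and assuming `get_file_path_end` returns the file name without its extension; note that '.pdf' is absent from the extension list, so PDF names would not be reported:

    from types import SimpleNamespace
    from tools.file_conversion import get_input_file_names

    # Stand-ins for Gradio upload objects, which expose the temp file path as .name
    uploads = [SimpleNamespace(name="/tmp/scan_page1.png"),
               SimpleNamespace(name="/tmp/contacts.csv"),
               SimpleNamespace(name="/tmp/report.pdf")]   # hypothetical file names

    print(get_input_file_names(uploads))  # expected: "scan_page1, contacts"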
tools/file_redaction.py CHANGED
@@ -4,10 +4,10 @@ import json
 import io
 import os
 from PIL import Image, ImageChops, ImageDraw
-from typing import List
+from typing import List, Dict
 import pandas as pd

-from presidio_image_redactor.entities import ImageRecognizerResult
+#from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
 from pikepdf import Pdf, Dictionary, Name
@@ -20,15 +20,38 @@ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRRes
 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult

-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=gr.Progress(track_tqdm=True)):
+def sum_numbers_before_seconds(string:str):
+    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+    Args:
+        string: The input string.
+
+    Returns:
+        The sum of all numbers before 'seconds' in the string.
+    """
+
+    # Extract numbers before 'seconds' using regular expression
+    numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
+
+    # Extract the numbers from the matches
+    numbers = [float(num.split()[0]) for num in numbers]
+
+    # Sum up the extracted numbers
+    sum_of_numbers = round(sum(numbers),1)
+
+    return sum_of_numbers
+
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
+    '''
+    Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
+    '''

     tic = time.perf_counter()
-    all_request_metadata = []
-    all_request_metadata_str = ""
+    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []

     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -48,36 +71,164 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
     if latest_file_completed >= len(file_paths):
         print("Last file reached")
-        # Set to a very high number so as not to mess with subsequent file processing by the user
+        # Set to a very high number so as not to mix up with subsequent file processing by the user
         latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
         #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."

-    def sum_numbers_before_seconds(string):
-        """Extracts numbers that precede the word 'seconds' from a string and adds them up.
-
-        Args:
-            string: The input string.
-
-        Returns:
-            The sum of all numbers before 'seconds' in the string.
-        """
-
-        # Extract numbers before 'seconds' using regular expression
-        numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
-
-        # Extract the numbers from the matches
-        numbers = [float(num.split()[0]) for num in numbers]
-
-        # Sum up the extracted numbers
-        sum_of_numbers = round(sum(numbers),1)
-
-        return sum_of_numbers
+        estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
+        print("Estimated total processing time:", str(estimate_total_processing_time))
+
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
+
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
+
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list[0].tolist()
+        print("In allow list:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
+        file_path = file.name
+
+        if file_path:
+            file_path_without_ext = get_file_path_end(file_path)
+            is_a_pdf = is_pdf(file_path) == True
+            if is_a_pdf == False:
+                # If user has not submitted a pdf, assume it's an image
+                print("File is not a pdf, assuming that image analysis needs to be used.")
+                in_redact_method = "Quick image analysis - typed text"
+        else:
+            out_message = "No file selected"
+            print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+            #Analyse and redact image-based pdf or image
+            if is_pdf_or_image(file_path) == False:
+                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+
+            print("Redacting file " + file_path_without_ext + " as an image-based file")
+            pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+
+            # Save file
+            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+
+            out_file_paths.append(out_image_file_path)
+            if logging_file_paths:
+                log_files_output_paths.extend(logging_file_paths)
+
+            out_message.append("File '" + file_path_without_ext + "' successfully redacted")
+
+            # Save decision making process
+            output_logs_str = str(output_logs)
+            logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
+            with open(logs_output_file_name, "w") as f:
+                f.write(output_logs_str)
+            log_files_output_paths.append(logs_output_file_name)
+
+            # Save Textract request metadata (if exists)
+            if new_request_metadata:
+                print("Request metadata:", new_request_metadata)
+                all_request_metadata.append(new_request_metadata)
+
+            # Increase latest file completed count unless we are at the last file
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
+
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+
+            if is_pdf(file_path) == False:
+                return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
+
+            # Analyse text-based pdf
+            print('Redacting file as text-based PDF')
+import time
+import re
+import json
+import io
+import os
+from PIL import Image, ImageChops, ImageDraw
+from typing import List, Dict
+import pandas as pd
+
+#from presidio_image_redactor.entities import ImageRecognizerResult
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
+from pikepdf import Pdf, Dictionary, Name
+import gradio as gr
+from gradio import Progress
+
+from collections import defaultdict  # For efficient grouping
+
+from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
+from tools.file_conversion import process_file
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
+from tools.data_anonymise import generate_decision_process_output
+from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
+
+def sum_numbers_before_seconds(string:str):
+    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
+
+    Args:
+        string: The input string.
+
+    Returns:
+        The sum of all numbers before 'seconds' in the string.
+    """
+
+    # Extract numbers before 'seconds' using regular expression
+    numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
+
+    # Extract the numbers from the matches
+    numbers = [float(num.split()[0]) for num in numbers]
+
+    # Sum up the extracted numbers
+    sum_of_numbers = round(sum(numbers),1)
+
+    return sum_of_numbers
+
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
+    '''
+    Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
+    '''
+
+    tic = time.perf_counter()
+    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        #out_message = []
+        out_file_paths = []
+
+    # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached")
+        # Set to a very high number so as not to mix up with subsequent file processing by the user
+        latest_file_completed = 99
+        final_out_message = '\n'.join(out_message)
+        #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."

     estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
     print("Estimated total processing time:", str(estimate_total_processing_time))

-    return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str

     file_paths_loop = [file_paths[int(latest_file_completed)]]

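Because Gradio passes state between calls most easily as strings, `choose_and_run_redactor` now receives the accumulated Textract request metadata as one newline-joined string, splits it on entry, appends any new entries, and joins it again on return. A minimal sketch of that round trip, with invented metadata values:

    all_request_metadata_str = "req-id-0001\nreq-id-0002"   # carried over from the previous call

    all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
    all_request_metadata.append("req-id-0003")              # e.g. returned by redact_image_pdf

    all_request_metadata_str = '\n'.join(all_request_metadata)
    print(all_request_metadata_str)                         # three lines, ready for the next call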
@@ -87,7 +238,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     else:
         in_allow_list_flat = []

-
     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name

@@ -97,19 +247,20 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             if is_a_pdf == False:
                 # If user has not submitted a pdf, assume it's an image
                 print("File is not a pdf, assuming that image analysis needs to be used.")
-                in_redact_method = "Image analysis"
+                in_redact_method = "Quick image analysis - typed text"
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-        if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
-            # Analyse and redact image-based pdf or image
-            # if is_pdf_or_image(file_path) == False:
-            #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+            #Analyse and redact image-based pdf or image
+            if is_pdf_or_image(file_path) == False:
+                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-            print("Redacting file" + file_path_without_ext + "as an image-based file")
-            pdf_images, output_logs, logging_file_paths, request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+            print("Redacting file " + file_path_without_ext + " as an image-based file")
+            pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

             # Save file
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
@@ -128,30 +279,29 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            # Save Textract request metadata (if exists)
-            if request_metadata:
-                print("Request metadata:", all_request_metadata)
-                all_request_metadata.append(request_metadata)
+            # Save Textract request metadata (if exists)
+            if new_request_metadata:
+                print("Request metadata:", new_request_metadata)
+                all_request_metadata.append(new_request_metadata)

             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed))
                 latest_file_completed += 1

-        elif in_redact_method == "Text analysis":
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":

             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Text analysis")
+            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)

             # Convert message
             convert_message="Converting PDF to image-based PDF to embed redactions."
-            #progress(0.8, desc=convert_message)
             print(convert_message)

             # Convert document to image-based document to 'embed' redactions
@@ -164,10 +314,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
                 f.write(output_logs_str)
             log_files_output_paths.append(logs_output_file_name)

-            # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
-
-            #out_file_paths.append(out_text_file_path)
             out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
             out_message.append(out_message_new)

@@ -178,8 +324,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata
-
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

     toc = time.perf_counter()
     out_time = f"in {toc - tic:0.1f} seconds."

@@ -188,48 +333,105 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time

-    # If textract requests made, write to logging file
+    # If textract requests made, write to logging file
     if all_request_metadata:
         all_request_metadata_str = '\n'.join(all_request_metadata)

-        print("all_request_metadata_file_path")
-        all_request_metadata_file_path = output_folder + "textract_request_metadata.txt"
+        all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"

         with open(all_request_metadata_file_path, "w") as f:
             f.write(all_request_metadata_str)
-        log_files_output_paths.append(all_request_metadata_file_path)
+
+        # Add the request metadata to the log outputs if not there already
+        if all_request_metadata_file_path not in log_files_output_paths:
+            log_files_output_paths.append(all_request_metadata_file_path)

     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_recogniser_results = [], handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
+
+def bounding_boxes_overlap(box1, box2):
+    """Check if two bounding boxes overlap."""
+    return (box1[0] < box2[2] and box2[0] < box1[2] and
+            box1[1] < box2[3] and box2[1] < box1[3])
+
+def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

+    # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
-
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
             print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)
-

         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:", handwriting_recogniser_results)
+            print("Signature boxes exist at merge:", signature_recogniser_results)
             bboxes.extend(signature_recogniser_results)

-    # 1. Group by approximate vertical proximity
-    for box in bboxes:
+    # Reconstruct bounding boxes for substrings of interest
+    reconstructed_bboxes = []
+    for bbox in bboxes:
+        bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
+        for line_text, line_info in combined_results.items():
+            line_box = line_info['bounding_box']
+            if bounding_boxes_overlap(bbox_box, line_box):
+                if bbox.text in line_text:
+                    start_char = line_text.index(bbox.text)
+                    end_char = start_char + len(bbox.text)
+
+                    relevant_words = []
+                    current_char = 0
+                    for word in line_info['words']:
+                        word_end = current_char + len(word['text'])
+                        if current_char <= start_char < word_end or current_char < end_char <= word_end:
+                            relevant_words.append(word)
+                        if word_end >= end_char:
+                            break
+                        current_char = word_end
+                        if not word['text'].endswith(' '):
+                            current_char += 1  # +1 for the space if the word doesn't already end with one
+
+                    if relevant_words:
+                        print("Relevant words:", relevant_words)
+                        left = min(word['bounding_box'][0] for word in relevant_words)
+                        top = min(word['bounding_box'][1] for word in relevant_words)
+                        right = max(word['bounding_box'][2] for word in relevant_words)
+                        bottom = max(word['bounding_box'][3] for word in relevant_words)
+
+                        # Combine the text of the relevant words
+                        combined_text = " ".join(word['text'] for word in relevant_words)
+
+                        reconstructed_bbox = CustomImageRecognizerResult(
+                            bbox.entity_type,
+                            bbox.start,
+                            bbox.end,
+                            bbox.score,
+                            left,
+                            top,
+                            right - left,  # width
+                            bottom - top,  # height
+                            combined_text
+                        )
+                        reconstructed_bboxes.append(reconstructed_bbox)
+                        break
+        else:
+            # If the bbox text is not found in any line in combined_results, keep the original bbox
+            reconstructed_bboxes.append(bbox)
+
+    # Group reconstructed bboxes by approximate vertical proximity
+    for box in reconstructed_bboxes:
         grouped_bboxes[round(box.top / vertical_threshold)].append(box)

-    # 2. Merge within each group
+    # Merge within each group
     for _, group in grouped_bboxes.items():
         group.sort(key=lambda box: box.left)

         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
-                #print("Merging a box")
                 # Calculate new dimensions for the merged box
-                #print("Merged box:", merged_box)
                 if merged_box.text == next_box.text:
                     new_text = merged_box.text
                 else:
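The grouping step buckets boxes by `round(box.top / vertical_threshold)` and then merges boxes within a bucket whose horizontal gap is at most `horizontal_threshold`. A toy illustration of that bucketing arithmetic using the defaults from the signature above (the coordinates are invented):

    vertical_threshold = 25

    tops = [48, 52, 148]
    print([round(t / vertical_threshold) for t in tops])  # [2, 2, 6] - the first two share a bucket

    # Within a bucket, two boxes merge when the gap between them is small enough:
    merged_right = 100 + 80                  # merged_box.left + merged_box.width
    next_left = 250
    print(next_left - merged_right <= 150)   # True with the default horizontal_threshold of 150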
@@ -247,9 +449,10 @@ def merge_img_bboxes(bboxes, signature_recogniser_results = [], handwriting_reco
             merged_box = next_box

     merged_bboxes.append(merged_box)
+
     return merged_bboxes

-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Image analysis", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], progress=Progress(track_tqdm=True)):
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
     '''
     Take a path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
@@ -259,7 +462,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     fill = (0, 0, 0) # Fill colour
     decision_process_output_str = ""
     images = []
-    request_metadata = {}
+    #request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

     if not image_paths:
@@ -297,11 +500,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     all_ocr_results = []
     all_decision_process = []

-    if analysis_type == "Image analysis": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-    elif analysis_type == "AWS Textract": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+    elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"

     for n in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
+        signature_recogniser_results = []
+        handwriting_recogniser_results = []

         try:
             image = image_paths[0][n]#.copy()
@@ -339,17 +544,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             else: ocr_lang = language

             # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
-            if analysis_type == "Image analysis":
+            if analysis_type == "Quick image analysis - typed text":

                 ocr_results = image_analyser.perform_ocr(image)

                 # Combine OCR results
-                ocr_results = combine_ocr_results(ocr_results)
+                ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
+
+                # Save decision making process
+                ocr_results_with_children_str = str(ocr_results_with_children)
+                logs_output_file_name = output_folder + "ocr_with_children.txt"
+                with open(logs_output_file_name, "w") as f:
+                    f.write(ocr_results_with_children_str)

             # Import results from json and convert
-            if analysis_type == "AWS Textract":
-
+            if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":

                 # Convert the image to bytes using an in-memory buffer
                 image_buffer = io.BytesIO()
                 image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
@@ -358,8 +568,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"

                 if not os.path.exists(json_file_path):
-                    text_blocks, request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
+                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
+                    request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # Open the file and load the JSON data
                     print("Found existing Textract json results file for this page.")
@@ -367,7 +578,13 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                     text_blocks = json.load(json_file)
                     text_blocks = text_blocks['Blocks']

-                ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results = json_to_ocrresult(text_blocks, page_width, page_height)
+                ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
+
+                # Save decision making process
+                ocr_results_with_children_str = str(ocr_results_with_children)
+                logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
+                with open(logs_output_file_name, "w") as f:
+                    f.write(ocr_results_with_children_str)

             # Step 2: Analyze text and identify PII
             bboxes = image_analyser.analyze_text(
@@ -376,10 +593,18 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 entities=chosen_redact_entities,
                 allow_list=allow_list,
                 score_threshold=score_threshold,
-            )
+            )
+
+            if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+            elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+
+            # Save decision making process
+            bboxes_str = str(bboxes)
+            with open(interim_results_file_path, "w") as f:
+                f.write(bboxes_str)

             # Merge close bounding boxes
-            merged_bboxes = merge_img_bboxes(bboxes, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
+            merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

             # Export the decision making process
             if merged_bboxes:
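The Textract branch above caches each page's response as `<file>_page_<n>_textract.json`, so a re-run redacts from the saved blocks instead of issuing a new (billable) request. A minimal sketch of the caching pattern, with the analyse call passed in as a parameter rather than imported, since this is an illustration and not the app's own helper:

    import json, os

    def get_text_blocks(pdf_page_as_bytes, json_file_path, analyse_page_with_textract):
        # Only call the paid Textract API when no cached response exists for this page
        if not os.path.exists(json_file_path):
            text_blocks, metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path)
            return text_blocks
        print("Found existing Textract json results file for this page.")
        with open(json_file_path) as json_file:
            return json.load(json_file)['Blocks']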
@@ -434,82 +659,19 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
     return [], []

 # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
-# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
-#     '''
-#     Merge identified bounding boxes containing PII that are very close to one another
-#     '''
-#     analyzed_bounding_boxes = []
-#     if len(analyzer_results) > 0 and len(characters) > 0:
-#         merged_bounding_boxes = []
-#         current_box = None
-#         current_y = None
-
-#         for i, result in enumerate(analyzer_results):
-#             print("Considering result", str(i))
-#             for char in characters[result.start : result.end]:
-#                 if isinstance(char, LTChar):
-#                     char_box = list(char.bbox)
-#                     # Add vertical padding to the top of the box
-#                     char_box[3] += vertical_padding
-
-#                     if current_y is None or current_box is None:
-#                         current_box = char_box
-#                         current_y = char_box[1]
-#                     else:
-#                         vertical_diff_bboxes = abs(char_box[1] - current_y)
-#                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
-
-#                         if (
-#                             vertical_diff_bboxes <= 5
-#                             and horizontal_diff_bboxes <= combine_pixel_dist
-#                         ):
-#                             current_box[2] = char_box[2]  # Extend the current box horizontally
-#                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
-#                         else:
-#                             merged_bounding_boxes.append(
-#                                 {"boundingBox": current_box, "result": result})
-
-#                             # Reset current_box and current_y after appending
-#                             current_box = char_box
-#                             current_y = char_box[1]
-
-#             # After finishing with the current result, add the last box for this result
-#             if current_box:
-#                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
-#                 current_box = None
-#                 current_y = None  # Reset for the next result
-
-#         if not merged_bounding_boxes:
-#             analyzed_bounding_boxes.extend(
-#                 {"boundingBox": char.bbox, "result": result}
-#                 for result in analyzer_results
-#                 for char in characters[result.start:result.end]
-#                 if isinstance(char, LTChar)
-#             )
-#         else:
-#             analyzed_bounding_boxes.extend(merged_bounding_boxes)
-
-#     print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
-
-#     return analyzed_bounding_boxes
-
-def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
     '''
-    Merge identified bounding boxes containing PII or signatures that are very close to one another.
+    Merge identified bounding boxes containing PII that are very close to one another
     '''
     analyzed_bounding_boxes = []
-    merged_bounding_boxes = []
-    current_box = None
-    current_y = None
-
-    # Handle PII and text bounding boxes first
     if len(analyzer_results) > 0 and len(characters) > 0:
-        for i, result in enumerate(analyzer_results):
-            #print("Considering result", str(i))
-            #print("Result:", result)
-            #print("Characters:", characters)
+        merged_bounding_boxes = []
+        current_box = None
+        current_y = None

-            for char in characters[result.start: result.end]:
+        for i, result in enumerate(analyzer_results):
+            print("Considering result", str(i))
+            for char in characters[result.start : result.end]:
                 if isinstance(char, LTChar):
                     char_box = list(char.bbox)
                     # Add vertical padding to the top of the box
@@ -535,58 +697,121 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, verti
                         # Reset current_box and current_y after appending
                         current_box = char_box
                         current_y = char_box[1]
-
+
             # After finishing with the current result, add the last box for this result
             if current_box:
                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
                 current_box = None
                 current_y = None # Reset for the next result

-    # Handle signature bounding boxes (without specific characters)
-    if signature_bounding_boxes is not None:
-        for sig_box in signature_bounding_boxes:
-            sig_box = list(sig_box) # Ensure it's a list to modify the values
-            if current_y is None or current_box is None:
-                current_box = sig_box
-                current_y = sig_box[1]
-            else:
-                vertical_diff_bboxes = abs(sig_box[1] - current_y)
-                horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
-
-                if (
-                    vertical_diff_bboxes <= 5
-                    and horizontal_diff_bboxes <= combine_pixel_dist
-                ):
-                    current_box[2] = sig_box[2] # Extend the current box horizontally
-                    current_box[3] = max(current_box[3], sig_box[3]) # Ensure the top is the highest
-                else:
-                    merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-
-                    # Reset current_box and current_y after appending
-                    current_box = sig_box
-                    current_y = sig_box[1]
-
-        # Add the last bounding box for the signature
-        if current_box:
-            merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
-            current_box = None
-            current_y = None
-
-    # If no bounding boxes were merged, add individual character bounding boxes
-    if not merged_bounding_boxes:
-        analyzed_bounding_boxes.extend(
-            {"boundingBox": char.bbox, "result": result}
-            for result in analyzer_results
-            for char in characters[result.start:result.end]
-            if isinstance(char, LTChar)
-        )
-    else:
-        analyzed_bounding_boxes.extend(merged_bounding_boxes)
+        if not merged_bounding_boxes:
+            analyzed_bounding_boxes.extend(
+                {"boundingBox": char.bbox, "result": result}
+                for result in analyzer_results
+                for char in characters[result.start:result.end]
+                if isinstance(char, LTChar)
+            )
+        else:
+            analyzed_bounding_boxes.extend(merged_bounding_boxes)

-    #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+    print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)

     return analyzed_bounding_boxes

+# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+#     '''
+#     Merge identified bounding boxes containing PII or signatures that are very close to one another.
+#     '''
+#     analyzed_bounding_boxes = []
+#     merged_bounding_boxes = []
+#     current_box = None
+#     current_y = None
+
+#     # Handle PII and text bounding boxes first
+#     if len(analyzer_results) > 0 and len(characters) > 0:
+#         for i, result in enumerate(analyzer_results):
+#             #print("Considering result", str(i))
+#             #print("Result:", result)
+#             #print("Characters:", characters)
+
+#             for char in characters[result.start: result.end]:
+#                 if isinstance(char, LTChar):
+#                     char_box = list(char.bbox)
+#                     # Add vertical padding to the top of the box
+#                     char_box[3] += vertical_padding
+
+#                     if current_y is None or current_box is None:
+#                         current_box = char_box
+#                         current_y = char_box[1]
+#                     else:
+#                         vertical_diff_bboxes = abs(char_box[1] - current_y)
+#                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+
+#                         if (
+#                             vertical_diff_bboxes <= 5
+#                             and horizontal_diff_bboxes <= combine_pixel_dist
+#                         ):
+#                             current_box[2] = char_box[2] # Extend the current box horizontally

 def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
     decision_process_table = pd.DataFrame()

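The now-active `merge_bounding_boxes` extends a box while consecutive `LTChar` boxes stay within 5 points vertically and `combine_pixel_dist` points horizontally. A toy run of that rule on three invented character boxes in pdfminer-style (x0, y0, x1, y1) coordinates:

    combine_pixel_dist = 20
    char_boxes = [[10, 100, 18, 112], [19, 100, 27, 112], [120, 100, 128, 112]]

    merged, current = [], list(char_boxes[0])
    for box in char_boxes[1:]:
        if abs(box[1] - current[1]) <= 5 and abs(box[0] - current[2]) <= combine_pixel_dist:
            current[2] = box[2]                    # extend horizontally
            current[3] = max(current[3], box[3])   # keep the tallest top
        else:
            merged.append(current)
            current = list(box)
    merged.append(current)

    print(merged)  # [[10, 100, 27, 112], [120, 100, 128, 112]] - two spans, not three characters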
@@ -625,14 +850,14 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
         annotations_on_page.append(annotation)
     return annotations_on_page

-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Text analysis", progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
     decision_process_table_all_pages = []

-    combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one

     pdf = Pdf.open(filename)
     page_num = 0
@@ -674,7 +899,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     text_container_analyzed_bounding_boxes = []
     characters = []

-    if analysis_type == "Text analysis":
         for i, text_container in enumerate(page_layout):

             text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
@@ -686,11 +911,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
             page_analyzer_results.extend(text_container_analyzer_results)

-            # Merge bounding boxes if very close together
-            text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
-
-            page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
-            page_analyzer_results.extend(text_container_analyzer_results)

         decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

4
  import io
5
  import os
6
  from PIL import Image, ImageChops, ImageDraw
7
+ from typing import List, Dict
8
  import pandas as pd
9
 
10
+ #from presidio_image_redactor.entities import ImageRecognizerResult
11
  from pdfminer.high_level import extract_pages
12
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
13
  from pikepdf import Pdf, Dictionary, Name
 
20
  from tools.file_conversion import process_file
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
22
  from tools.helper_functions import get_file_path_end, output_folder
23
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
24
  from tools.data_anonymise import generate_decision_process_output
25
  from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
26
 
27
+ def sum_numbers_before_seconds(string:str):
28
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
29
+
30
+ Args:
31
+ string: The input string.
32
+
33
+ Returns:
34
+ The sum of all numbers before 'seconds' in the string.
35
+ """
36
+
37
+ # Extract numbers before 'seconds' using regular expression
38
+ numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
39
+
40
+ # Extract the numbers from the matches
41
+ numbers = [float(num.split()[0]) for num in numbers]
42
+
43
+ # Sum up the extracted numbers
44
+ sum_of_numbers = round(sum(numbers),1)
45
+
46
+ return sum_of_numbers
47
+
48
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
49
+ '''
50
+ Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
51
+ '''
52
 
53
  tic = time.perf_counter()
54
+ all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 
55
 
56
  # If this is the first time around, set variables to 0/blank
57
  if first_loop_state==True:
 
71
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
72
  if latest_file_completed >= len(file_paths):
73
  print("Last file reached")
74
+ # Set to a very high number so as not to mix up with subsequent file processing by the user
75
  latest_file_completed = 99
76
  final_out_message = '\n'.join(out_message)
77
  #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
78
+
79
+ estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
80
+ print("Estimated total processing time:", str(estimate_total_processing_time))
81
+
82
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
83
+
84
+ file_paths_loop = [file_paths[int(latest_file_completed)]]
85
+
86
+ if not in_allow_list.empty:
87
+ in_allow_list_flat = in_allow_list[0].tolist()
88
+ print("In allow list:", in_allow_list_flat)
89
+ else:
90
+ in_allow_list_flat = []
91
+
92
+ for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
93
+ file_path = file.name
94
+
95
+ if file_path:
96
+ file_path_without_ext = get_file_path_end(file_path)
97
+ is_a_pdf = is_pdf(file_path) == True
98
+ if is_a_pdf == False:
99
+ # If user has not submitted a pdf, assume it's an image
100
+ print("File is not a pdf, assuming that image analysis needs to be used.")
101
+ in_redact_method = "Quick image analysis - typed text"
102
+ else:
103
+ out_message = "No file selected"
104
+ print(out_message)
105
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
106
+
107
+ if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
108
+ #Analyse and redact image-based pdf or image
109
+ if is_pdf_or_image(file_path) == False:
110
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
111
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
112
+
113
+ print("Redacting file " + file_path_without_ext + " as an image-based file")
114
+ pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
115
+
116
+ # Save file
117
+ out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
118
+ pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
119
+
120
+ out_file_paths.append(out_image_file_path)
121
+ if logging_file_paths:
122
+ log_files_output_paths.extend(logging_file_paths)
123
+
124
+ out_message.append("File '" + file_path_without_ext + "' successfully redacted")
125
+
126
+ # Save decision making process
127
+ output_logs_str = str(output_logs)
128
+ logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
129
+ with open(logs_output_file_name, "w") as f:
130
+ f.write(output_logs_str)
131
+ log_files_output_paths.append(logs_output_file_name)
132
+
133
+ # Save Textract request metadata (if exists)
134
+ if new_request_metadata:
135
+ print("Request metadata:", new_request_metadata)
136
+ all_request_metadata.append(new_request_metadata)
137
+
138
+ # Increase latest file completed count unless we are at the last file
139
+ if latest_file_completed != len(file_paths):
140
+ print("Completed file number:", str(latest_file_completed))
141
+ latest_file_completed += 1
142
+
143
+ elif in_redact_method == "Simple text analysis - PDFs with selectable text":
144
+
145
+ if is_pdf(file_path) == False:
146
+ return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
147
+
148
+ # Analyse text-based pdf
149
+ print('Redacting file as text-based PDF')
150
+ import time
151
+ import re
152
+ import json
153
+ import io
154
+ import os
155
+ from PIL import Image, ImageChops, ImageDraw
156
+ from typing import List, Dict
157
+ import pandas as pd
158
 
159
+ #from presidio_image_redactor.entities import ImageRecognizerResult
160
+ from pdfminer.high_level import extract_pages
161
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
162
+ from pikepdf import Pdf, Dictionary, Name
163
+ import gradio as gr
164
+ from gradio import Progress
165
 
166
+ from collections import defaultdict # For efficient grouping
 
167
 
168
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
169
+ from tools.file_conversion import process_file
170
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
171
+ from tools.helper_functions import get_file_path_end, output_folder
172
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
173
+ from tools.data_anonymise import generate_decision_process_output
174
+ from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
175
 
176
+ def sum_numbers_before_seconds(string:str):
177
+ """Extracts numbers that precede the word 'seconds' from a string and adds them up.
178
 
179
+ Args:
180
+ string: The input string.
181
 
182
+ Returns:
183
+ The sum of all numbers before 'seconds' in the string.
184
+ """
185
 
186
+ # Extract numbers before 'seconds' using regular expression
187
+ numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
188
 
189
+ # Extract the numbers from the matches
190
+ numbers = [float(num.split()[0]) for num in numbers]
191
+
192
+ # Sum up the extracted numbers
193
+ sum_of_numbers = round(sum(numbers),1)
194
+
195
+ return sum_of_numbers
196
+
197
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
198
+ '''
199
+ Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
200
+ '''
201
+
202
+ tic = time.perf_counter()
203
+ all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
204
+
205
+ # If this is the first time around, set variables to 0/blank
206
+ if first_loop_state==True:
207
+ latest_file_completed = 0
208
+ #out_message = []
209
+ out_file_paths = []
210
+
211
+ # If out message is string or out_file_paths are blank, change to a list so it can be appended to
212
+ if isinstance(out_message, str):
213
+ out_message = [out_message]
214
+
215
+ if not out_file_paths:
216
+ out_file_paths = []
217
+
218
+ latest_file_completed = int(latest_file_completed)
219
+
220
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
221
+ if latest_file_completed >= len(file_paths):
222
+ print("Last file reached")
223
+ # Set to a very high number so as not to mix up with subsequent file processing by the user
224
+ latest_file_completed = 99
225
+ final_out_message = '\n'.join(out_message)
226
+ #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
227
+
228
  estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
229
  print("Estimated total processing time:", str(estimate_total_processing_time))
230
 
231
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
232
 
233
  file_paths_loop = [file_paths[int(latest_file_completed)]]
234
 
 
238
  else:
239
  in_allow_list_flat = []
240
 
 
241
  for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
242
  file_path = file.name
243
 
 
247
  if is_a_pdf == False:
248
  # If user has not submitted a pdf, assume it's an image
249
  print("File is not a pdf, assuming that image analysis needs to be used.")
250
+ in_redact_method = "Quick image analysis - typed text"
251
  else:
252
  out_message = "No file selected"
253
  print(out_message)
254
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
255
 
256
+ if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
257
+ #Analyse and redact image-based pdf or image
258
+ if is_pdf_or_image(file_path) == False:
259
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
260
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
261
 
262
+ print("Redacting file " + file_path_without_ext + " as an image-based file")
263
+ pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
264
 
265
  # Save file
266
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
 
279
  f.write(output_logs_str)
280
  log_files_output_paths.append(logs_output_file_name)
281
 
282
+ # Save Textract request metadata (if exists)
283
+ if new_request_metadata:
284
+ print("Request metadata:", new_request_metadata)
285
+ all_request_metadata.append(new_request_metadata)
286
 
287
  # Increase latest file completed count unless we are at the last file
288
  if latest_file_completed != len(file_paths):
289
  print("Completed file number:", str(latest_file_completed))
290
  latest_file_completed += 1
291
 
292
+ elif in_redact_method == "Simple text analysis - PDFs with selectable text":
293
 
294
  if is_pdf(file_path) == False:
295
  return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
296
 
297
  # Analyse text-based pdf
298
  print('Redacting file as text-based PDF')
299
+ pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
300
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
301
  pdf_text.save(out_text_file_path)
302
 
303
  # Convert message
304
  convert_message="Converting PDF to image-based PDF to embed redactions."
 
305
  print(convert_message)
306
 
307
  # Convert document to image-based document to 'embed' redactions
 
314
  f.write(output_logs_str)
315
  log_files_output_paths.append(logs_output_file_name)
316
 
 
 
 
 
317
  out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
318
  out_message.append(out_message_new)
319
 
 
324
  else:
325
  out_message = "No redaction method selected"
326
  print(out_message)
327
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
 
328
 
329
  toc = time.perf_counter()
330
  out_time = f"in {toc - tic:0.1f} seconds."
 
333
  out_message_out = '\n'.join(out_message)
334
  out_message_out = out_message_out + " " + out_time
335
 
336
+ # If textract requests made, write to logging file
337
  if all_request_metadata:
338
  all_request_metadata_str = '\n'.join(all_request_metadata)
339
 
340
+ all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
 
341
 
342
  with open(all_request_metadata_file_path, "w") as f:
343
  f.write(all_request_metadata_str)
344
+
345
+ # Add the request metadata to the log outputs if not there already
346
+ if all_request_metadata_file_path not in log_files_output_paths:
347
+ log_files_output_paths.append(all_request_metadata_file_path)
348
+
349
 
350
  return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
351
 
352
+
353
+
354
+ def bounding_boxes_overlap(box1, box2):
355
+ """Check if two bounding boxes overlap."""
356
+ return (box1[0] < box2[2] and box2[0] < box1[2] and
357
+ box1[1] < box2[3] and box2[1] < box1[3])
358
+
359
+ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
360
  merged_bboxes = []
361
  grouped_bboxes = defaultdict(list)
362
 
363
+ # Process signature and handwriting results
364
  if signature_recogniser_results or handwriting_recogniser_results:
 
365
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
366
  print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
367
  bboxes.extend(handwriting_recogniser_results)
 
368
 
369
  if "Redact all identified signatures" in handwrite_signature_checkbox:
370
+ print("Signature boxes exist at merge:", signature_recogniser_results)
371
  bboxes.extend(signature_recogniser_results)
372
 
373
+ # Reconstruct bounding boxes for substrings of interest
374
+ reconstructed_bboxes = []
375
+ for bbox in bboxes:
376
+ bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
377
+ for line_text, line_info in combined_results.items():
378
+ line_box = line_info['bounding_box']
379
+ if bounding_boxes_overlap(bbox_box, line_box):
380
+ if bbox.text in line_text:
381
+ start_char = line_text.index(bbox.text)
382
+ end_char = start_char + len(bbox.text)
383
+
384
+ relevant_words = []
385
+ current_char = 0
386
+ for word in line_info['words']:
387
+ word_end = current_char + len(word['text'])
388
+ if current_char <= start_char < word_end or current_char < end_char <= word_end:
389
+ relevant_words.append(word)
390
+ if word_end >= end_char:
391
+ break
392
+ current_char = word_end # +1 for space
393
+ if not word['text'].endswith(' '):
394
+ current_char += 1 # +1 for space if the word doesn't already end with a space
395
+
396
+ if relevant_words:
397
+ print("Relevant words:", relevant_words)
398
+ left = min(word['bounding_box'][0] for word in relevant_words)
399
+ top = min(word['bounding_box'][1] for word in relevant_words)
400
+ right = max(word['bounding_box'][2] for word in relevant_words)
401
+ bottom = max(word['bounding_box'][3] for word in relevant_words)
402
+
403
+ # Combine the text of the relevant words
404
+ combined_text = " ".join(word['text'] for word in relevant_words)
405
+
406
+ reconstructed_bbox = CustomImageRecognizerResult(
407
+ bbox.entity_type,
408
+ bbox.start,
409
+ bbox.end,
410
+ bbox.score,
411
+ left,
412
+ top,
413
+ right - left, # width
414
+ bottom - top, # height
415
+ combined_text
416
+ )
417
+ reconstructed_bboxes.append(reconstructed_bbox)
418
+ break
419
+ else:
420
+ # If the bbox text is not found in any line in combined_results, keep the original bbox
421
+ reconstructed_bboxes.append(bbox)
422
+
423
+ # Group reconstructed bboxes by approximate vertical proximity
424
+ for box in reconstructed_bboxes:
425
  grouped_bboxes[round(box.top / vertical_threshold)].append(box)
426
 
427
+ # Merge within each group
428
  for _, group in grouped_bboxes.items():
429
  group.sort(key=lambda box: box.left)
430
 
431
  merged_box = group[0]
432
  for next_box in group[1:]:
433
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
 
434
  # Calculate new dimensions for the merged box
 
435
  if merged_box.text == next_box.text:
436
  new_text = merged_box.text
437
  else:
 
449
  merged_box = next_box
450
 
451
  merged_bboxes.append(merged_box)
452
+
453
  return merged_bboxes
454
 
455
+ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
456
  '''
457
  Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
458
  '''
 
462
  fill = (0, 0, 0) # Fill colour
463
  decision_process_output_str = ""
464
  images = []
465
+ #request_metadata = {}
466
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
467
 
468
  if not image_paths:
 
500
  all_ocr_results = []
501
  all_decision_process = []
502
 
503
+ if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
504
+ elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
505
 
506
  for n in range(0, number_of_pages):
507
  handwriting_or_signature_boxes = []
508
+ signature_recogniser_results = []
509
+ handwriting_recogniser_results = []
510
 
511
  try:
512
  image = image_paths[0][n]#.copy()
 
544
  else: ocr_lang = language
545
 
546
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
547
+ if analysis_type == "Quick image analysis - typed text":
548
 
549
  ocr_results = image_analyser.perform_ocr(image)
550
 
551
  # Combine OCR results
552
+ ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
553
+
554
+ # Save decision making process
555
+ ocr_results_with_children_str = str(ocr_results_with_children)
556
+ logs_output_file_name = output_folder + "ocr_with_children.txt"
557
+ with open(logs_output_file_name, "w") as f:
558
+ f.write(ocr_results_with_children_str)
559
 
560
  # Import results from json and convert
561
+ if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":
562
 
 
563
  # Convert the image to bytes using an in-memory buffer
564
  image_buffer = io.BytesIO()
565
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
 
568
  json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
569
 
570
  if not os.path.exists(json_file_path):
571
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
572
  logging_file_paths.append(json_file_path)
573
+ request_metadata = request_metadata + "\n" + new_request_metadata
574
  else:
575
  # Open the file and load the JSON data
576
  print("Found existing Textract json results file for this page.")
 
578
  text_blocks = json.load(json_file)
579
  text_blocks = text_blocks['Blocks']
580
 
581
+ ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
582
+
583
+ # Save decision making process
584
+ ocr_results_with_children_str = str(ocr_results_with_children)
585
+ logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
586
+ with open(logs_output_file_name, "w") as f:
587
+ f.write(ocr_results_with_children_str)
588
 
      # Step 2: Analyze text and identify PII
      bboxes = image_analyser.analyze_text(

          entities=chosen_redact_entities,
          allow_list=allow_list,
          score_threshold=score_threshold,
+     )
+
+     if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+     elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+
+     # Save the interim analyser bounding boxes so the decision-making process can be reviewed
+     bboxes_str = str(bboxes)
+     with open(interim_results_file_path, "w") as f:
+         f.write(bboxes_str)
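analyze_text here is the app's custom engine, but the underlying PII detection is Presidio (the same library the custom recognisers below are registered with). A hedged sketch of the equivalent direct call, with the text and entity list purely illustrative:

    # Hedged sketch: detecting PII spans in OCR'd text with Presidio directly
    from presidio_analyzer import AnalyzerEngine

    analyzer = AnalyzerEngine()
    results = analyzer.analyze(
        text="Dr Jane Bloggs lives in London.",  # stand-in for OCR output
        language="en",
        entities=["PERSON", "LOCATION"],
        score_threshold=0.001,  # the same very low threshold this module uses
    )
    for r in results:
        print(r.entity_type, r.start, r.end, r.score)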
 
      # Merge close bounding boxes
+     merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

      # Export the decision-making process
      if merged_bboxes:

      return [], []

  # Inside the loop where analyzer_results are processed, merge bounding boxes that sit right next to each other:
+ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
      '''
+     Merge identified bounding boxes containing PII that are very close to one another.
      '''
      analyzed_bounding_boxes = []
      if len(analyzer_results) > 0 and len(characters) > 0:
+         merged_bounding_boxes = []
+         current_box = None
+         current_y = None

+         for i, result in enumerate(analyzer_results):
+             print("Considering result", str(i))
+             for char in characters[result.start : result.end]:
                  if isinstance(char, LTChar):
                      char_box = list(char.bbox)
                      # Add vertical padding to the top of the box

                              # Reset current_box and current_y after appending
                              current_box = char_box
                              current_y = char_box[1]
+
              # After finishing with the current result, add the last box for this result
              if current_box:
                  merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
                  current_box = None
                  current_y = None # Reset for the next result

+         if not merged_bounding_boxes:
+             analyzed_bounding_boxes.extend(
+                 {"boundingBox": char.bbox, "result": result}
+                 for result in analyzer_results
+                 for char in characters[result.start:result.end]
+                 if isinstance(char, LTChar)
+             )
+         else:
+             analyzed_bounding_boxes.extend(merged_bounding_boxes)

+     print("analyzed_bounding_boxes:\n\n", analyzed_bounding_boxes)

      return analyzed_bounding_boxes
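To make the merge criterion concrete: two boxes join the same run when they sit on roughly the same baseline and the horizontal gap between them is small. A self-contained illustration of that test, with thresholds mirroring the ones used in this function:

    def should_merge(current_box, next_box, combine_pixel_dist, max_vertical_diff=5):
        # Boxes are (x0, y0, x1, y1); y0 approximates the text baseline
        vertical_diff = abs(next_box[1] - current_box[1])
        horizontal_diff = abs(next_box[0] - current_box[2])  # gap from the previous box's right edge
        return vertical_diff <= max_vertical_diff and horizontal_diff <= combine_pixel_dist

    # Two boxes 10px apart on the same line merge when combine_pixel_dist is 200:
    print(should_merge((0, 100, 50, 112), (60, 100, 110, 112), combine_pixel_dist=200))  # True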
+ # def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
+ #     '''
+ #     Merge identified bounding boxes containing PII or signatures that are very close to one another.
+ #     '''
+ #     analyzed_bounding_boxes = []
+ #     merged_bounding_boxes = []
+ #     current_box = None
+ #     current_y = None
+
+ #     # Handle PII and text bounding boxes first
+ #     if len(analyzer_results) > 0 and len(characters) > 0:
+ #         for i, result in enumerate(analyzer_results):
+ #             #print("Considering result", str(i))
+ #             #print("Result:", result)
+ #             #print("Characters:", characters)
+
+ #             for char in characters[result.start: result.end]:
+ #                 if isinstance(char, LTChar):
+ #                     char_box = list(char.bbox)
+ #                     # Add vertical padding to the top of the box
+ #                     char_box[3] += vertical_padding
+
+ #                     if current_y is None or current_box is None:
+ #                         current_box = char_box
+ #                         current_y = char_box[1]
+ #                     else:
+ #                         vertical_diff_bboxes = abs(char_box[1] - current_y)
+ #                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+
+ #                         if (
+ #                             vertical_diff_bboxes <= 5
+ #                             and horizontal_diff_bboxes <= combine_pixel_dist
+ #                         ):
+ #                             current_box[2] = char_box[2]  # Extend the current box horizontally
+ #                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
+ #                         else:
+ #                             merged_bounding_boxes.append(
+ #                                 {"boundingBox": current_box, "result": result})
+
+ #                             # Reset current_box and current_y after appending
+ #                             current_box = char_box
+ #                             current_y = char_box[1]
+
+ #             # After finishing with the current result, add the last box for this result
+ #             if current_box:
+ #                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
+ #                 current_box = None
+ #                 current_y = None  # Reset for the next result
+
+ #     # Handle signature bounding boxes (without specific characters)
+ #     if signature_bounding_boxes is not None:
+ #         for sig_box in signature_bounding_boxes:
+ #             sig_box = list(sig_box)  # Ensure it's a list to modify the values
+ #             if current_y is None or current_box is None:
+ #                 current_box = sig_box
+ #                 current_y = sig_box[1]
+ #             else:
+ #                 vertical_diff_bboxes = abs(sig_box[1] - current_y)
+ #                 horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
+
+ #                 if (
+ #                     vertical_diff_bboxes <= 5
+ #                     and horizontal_diff_bboxes <= combine_pixel_dist
+ #                 ):
+ #                     current_box[2] = sig_box[2]  # Extend the current box horizontally
+ #                     current_box[3] = max(current_box[3], sig_box[3])  # Ensure the top is the highest
+ #                 else:
+ #                     merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+
+ #                     # Reset current_box and current_y after appending
+ #                     current_box = sig_box
+ #                     current_y = sig_box[1]
+
+ #         # Add the last bounding box for the signature
+ #         if current_box:
+ #             merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
+ #             current_box = None
+ #             current_y = None
+
+ #     # If no bounding boxes were merged, add individual character bounding boxes
+ #     if not merged_bounding_boxes:
+ #         analyzed_bounding_boxes.extend(
+ #             {"boundingBox": char.bbox, "result": result}
+ #             for result in analyzer_results
+ #             for char in characters[result.start:result.end]
+ #             if isinstance(char, LTChar)
+ #         )
+ #     else:
+ #         analyzed_bounding_boxes.extend(merged_bounding_boxes)
+
+ #     #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
+
+ #     return analyzed_bounding_boxes
+
  def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
      decision_process_table = pd.DataFrame()

      annotations_on_page.append(annotation)
      return annotations_on_page
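The annotation objects appended above are built elsewhere in this file; as a hedged sketch only (not the repo's actual helper), a black rectangle annotation over a merged bounding box can be constructed with pikepdf roughly like this:

    from pikepdf import Dictionary, Name, Array

    def make_redaction_annotation(bbox):
        # bbox = [x0, y0, x1, y1] in PDF points; a hypothetical helper
        return Dictionary(
            Type=Name("/Annot"),
            Subtype=Name("/Square"),  # drawn as a filled rectangle over the PII
            Rect=Array(bbox),
            IC=Array([0, 0, 0]),      # black interior colour
            CA=1,                     # fully opaque
        )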
 
+ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
      '''
      Redact chosen entities from a pdf that is made up of multiple pages that are not images.
      '''
      annotations_all_pages = []
      decision_process_table_all_pages = []

+     combine_pixel_dist = 200 # PII bounding boxes whose horizontal gap is at or under this many pixels are combined into one

      pdf = Pdf.open(filename)
      page_num = 0
 
      text_container_analyzed_bounding_boxes = []
      characters = []

+     if analysis_type == "Simple text analysis - PDFs with selectable text":
          for i, text_container in enumerate(page_layout):

              text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
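analyze_text_container operates on pdfminer.six layout objects, whose LTChar children supply the per-character boxes used later for merging. A hedged sketch of collecting those characters for one page (the helper name and structure are illustrative, not the repo's code):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer, LTChar

    def page_characters(filename: str, page_index: int = 0):
        for i, page_layout in enumerate(extract_pages(filename)):
            if i != page_index:
                continue
            chars = []
            for container in page_layout:
                if isinstance(container, LTTextContainer):
                    for line in container:       # text lines within the container
                        for obj in line:         # characters (and LTAnno spacers) within the line
                            if isinstance(obj, LTChar):
                                chars.append(obj)  # each LTChar carries .bbox and .get_text()
            return chars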
 
          page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
          page_analyzer_results.extend(text_container_analyzer_results)

      decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -18,7 +18,7 @@ score_threshold = 0.001
  # Custom title recogniser
  import re
  titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
- titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
+ titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
  titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
  titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
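This fix removes the stray spaces the old pattern baked into every alternative (e.g. \bMr \b, which only matched a title followed by a space and then a word character) and renames the street_type loop variable, apparently left over from an address recogniser. A quick check of the corrected pattern:

    import re

    titles_list = ["Mr", "Mr.", "Mrs", "Dr"]
    titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
    # titles_regex is now: \bMr\b|\bMr\.\b|\bMrs\b|\bDr\b
    print(re.findall(titles_regex, "Dr Jones met Mrs Smith."))  # ['Dr', 'Mrs']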