seanpedrickcase committed on
Commit e2aae24
1 Parent(s): e69ae00

Only shows AWS options when AWS functions are enabled. Previous review files can now be uploaded to continue a review later. Some review debugging.

app.py CHANGED
@@ -9,11 +9,11 @@ import pandas as pd
from datetime import datetime
from gradio_image_annotation import image_annotator

- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars, load_in_default_allow_list
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
- from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -45,12 +45,6 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'

- text_ocr_option = "Simple text analysis - PDFs with selectable text"
- tesseract_ocr_option = "Quick image analysis - typed text"
- textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
-
- local_pii_detector = "Local"
- aws_pii_detector = "AWS Comprehend"

if RUN_AWS_FUNCTIONS == "1":
    default_ocr_val = textract_option
@@ -104,7 +98,8 @@ with app:
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)

- doc_file_name_textbox = gr.Textbox(label = "doc_file_name_textbox", value="", visible=False)
+ doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+ doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_file_name_no_extension_textbox", value="", visible=False)
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)

@@ -127,6 +122,9 @@ with app:
zoom_true_bool = gr.State(True)
zoom_false_bool = gr.State(False)

+ clear_all_page_redactions = gr.State(True)
+ prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
+

###
# UI DESIGN
@@ -145,8 +143,12 @@ with app:
with gr.Tab("PDFs/images"):
with gr.Accordion("Redact document", open = True):
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
- in_redaction_method = gr.Radio(label="Choose text extract method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+ if RUN_AWS_FUNCTIONS == "1":
+     in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
+     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+ else:
+     in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
+     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)

gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
@@ -178,6 +180,8 @@ with app:
with gr.Row():
annotate_zoom_in = gr.Button("Zoom in")
annotate_zoom_out = gr.Button("Zoom out")
+ with gr.Row():
+     clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")

annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")

@@ -199,7 +203,8 @@ with app:
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)

- output_review_files = gr.File(label="Review output files")
+ output_review_files = gr.File(label="Review output files", file_count='multiple')
+ upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")

# TEXT / TABULAR DATA TAB
with gr.Tab(label="Open text or Excel/csv files"):
@@ -231,6 +236,8 @@ with app:
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)

+
+
# SETTINGS TAB
with gr.Tab(label="Redaction settings"):
gr.Markdown(
@@ -272,10 +279,10 @@ with app:
###
# PDF/IMAGE REDACTION
###
- in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])

document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
- then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
+ then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
@@ -283,10 +290,10 @@ with app:
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

# If a file has been completed, the function will continue onto the next document
- latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+ latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
# latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
# then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -299,12 +306,12 @@ with app:
# Page controls at top
annotate_current_page.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

# Zoom in and out on annotator
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -313,20 +320,28 @@ with app:
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])

- annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+ clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
- annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)

# Page controls at bottom
annotate_current_page_bottom.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+ # Upload previous files for modifying redactions
+ upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])

###
# TABULAR DATA REDACTION
@@ -364,8 +379,8 @@ with app:

# User submitted feedback for pdf redactions
pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], feedback_logs_folder)
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], None, preprocess=False).\
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])

# User submitted feedback for data redactions
@@ -376,8 +391,8 @@ with app:

# Log processing time/token usage when making a query
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
- usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# Launch the Gradio app
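Two small Gradio patterns in this diff are worth pulling out: the AWS-specific choices are built inside an if RUN_AWS_FUNCTIONS == "1": branch, and prepare_for_review_bool is an invisible gr.Checkbox used as a constant input to event handlers. A minimal sketch of both patterns, with illustrative component names rather than the app's:

    import os
    import gradio as gr

    # Assumption: the deployment signals AWS availability through an env var.
    RUN_AWS_FUNCTIONS = os.environ.get("RUN_AWS_FUNCTIONS", "0")

    with gr.Blocks() as demo:
        if RUN_AWS_FUNCTIONS == "1":
            # Paid AWS options are only offered when the deployment enables them.
            method = gr.Radio(choices=["Local", "AWS Textract"], value="AWS Textract")
        else:
            # Keep the component so the event wiring stays identical, but hide it.
            method = gr.Radio(choices=["Local"], value="Local", visible=False)

        # An invisible checkbox acts as a constant boolean input for handlers.
        prepare_for_review = gr.Checkbox(value=True, visible=False)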
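The new resume-review flow is three chained events: read the uploaded file names, re-run prepare_image_or_pdf with the prepare_for_review flag set, then redraw the annotator. A stripped-down sketch of the same .then() chaining; the handler bodies here are placeholders, not the app's functions:

    import gradio as gr

    def load_names(files):
        # Depending on the Gradio version, each item is a file path string or a tempfile wrapper.
        return ", ".join(getattr(f, "name", str(f)) for f in files)

    def prepare(files, for_review):
        # In the app this step reloads the PDF pages and the saved ...redactions.json.
        return f"Prepared {len(files)} file(s) (review mode: {for_review})"

    with gr.Blocks() as demo:
        review_files = gr.File(label="Review output files", file_count="multiple")
        for_review = gr.Checkbox(value=True, visible=False)
        names = gr.Textbox()
        status = gr.Textbox()
        resume_btn = gr.Button("Resume review")

        # Each .then() step only runs once the previous handler has returned.
        resume_btn.click(load_names, inputs=[review_files], outputs=[names]).\
            then(prepare, inputs=[review_files, for_review], outputs=[status])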
tools/aws_functions.py CHANGED
@@ -10,17 +10,13 @@ PandasDataFrame = Type[pd.DataFrame]
# Get AWS credentials
bucket_name=""

- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
print(f'The value of AWS_REGION is {AWS_REGION}')

- try:
-     comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
- except Exception as e:
-     print(e)
-     comprehend_client = ""
+

def get_assumed_role_info():
    sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
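get_or_create_env_var is imported from tools.helper_functions rather than defined in this file; a plausible sketch of what such a helper might do (an assumption about its behaviour, not the actual implementation):

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Return the existing value, or set and return the default so later
        # reads within the same process see a consistent value.
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value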
tools/aws_textract.py CHANGED
@@ -23,15 +23,16 @@ def extract_textract_metadata(response):
    #'NumberOfPages': number_of_pages
    })

- def analyse_page_with_textract(pdf_page_bytes, page_no):
+ def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
    '''
    Analyse page with AWS Textract
    '''
-     try:
-         client = boto3.client('textract')
-     except:
-         print("Cannot connect to AWS Textract")
-         return [], "" # Return an empty list and an empty string
+     if client == "":
+         try:
+             client = boto3.client('textract')
+         except:
+             print("Cannot connect to AWS Textract")
+             return [], "" # Return an empty list and an empty string

    print("Analysing page with AWS Textract")
tools/custom_image_analyser_engine.py CHANGED
@@ -11,7 +11,6 @@ from PIL import ImageDraw, ImageFont, Image
from typing import Optional, Tuple, Union
from copy import deepcopy
from tools.helper_functions import clean_unicode_text
- from tools.aws_functions import comprehend_client
from tools.presidio_analyzer_custom import recognizer_result_from_dict
from tools.load_spacy_model_custom_recognisers import custom_entities
#import string # Import string to get a list of common punctuation characters
@@ -464,7 +463,8 @@ class CustomImageAnalyzerEngine:
        line_level_ocr_results: List[OCRResult],
        ocr_results_with_children: Dict[str, Dict],
        chosen_redact_comprehend_entities:List[str],
-         pii_identification_method:str="Local",
+         pii_identification_method:str="Local",
+         comprehend_client="",
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:
        # Define English as default language, if not specified
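Dropping the module-level comprehend_client import means the analyser no longer tries to reach AWS at import time; a client is handed in only when AWS Comprehend is actually selected. A hedged sketch of how an injected client is typically used for PII detection (the engine's internal call is not shown in this diff):

    import boto3

    comprehend_client = boto3.client('comprehend', region_name='eu-west-2')

    # detect_pii_entities returns character offsets that can be mapped back
    # onto OCR bounding boxes by the image analyser.
    response = comprehend_client.detect_pii_entities(Text="Call me on 07700 900000", LanguageCode='en')
    for entity in response['Entities']:
        print(entity['Type'], entity['Score'], entity['BeginOffset'], entity['EndOffset'])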
tools/file_conversion.py CHANGED
@@ -1,9 +1,10 @@
from pdf2image import convert_from_path, pdfinfo_from_path
- from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
+ from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import os
+ import re
import gradio as gr
import time
import json
@@ -96,8 +97,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi):

    return images

-
- # %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
+ # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
def process_file(file_path):
    # Get the file extension
    file_extension = os.path.splitext(file_path)[1].lower()
@@ -127,11 +127,15 @@ def get_input_file_names(file_input):
    '''

    all_relevant_files = []
+     file_name_with_extension = ""
+     full_file_name = ""

    #print("file_input:", file_input)

    if isinstance(file_input, str):
        file_input_list = [file_input]
+     else:
+         file_input_list = file_input

    for file in file_input_list:
        if isinstance(file, str):
@@ -141,21 +145,19 @@ def get_input_file_names(file_input):

        file_path_without_ext = get_file_path_end(file_path)

-         #print("file:", file_path)
-
        file_extension = os.path.splitext(file_path)[1].lower()

-         file_name_with_extension = file_path_without_ext + file_extension
-
        # Check if the file is an image type
        if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
            all_relevant_files.append(file_path_without_ext)
+             file_name_with_extension = file_path_without_ext + file_extension
+             full_file_name = file_path

    all_relevant_files_str = ", ".join(all_relevant_files)

-     #print("all_relevant_files_str:", all_relevant_files_str)
+     print("all_relevant_files_str:", all_relevant_files_str)

-     return all_relevant_files_str, file_name_with_extension
+     return all_relevant_files_str, file_name_with_extension, full_file_name

def prepare_image_or_pdf(
    file_paths: List[str],
@@ -166,6 +168,8 @@ def prepare_image_or_pdf(
    first_loop_state: bool = False,
    number_of_pages:int = 1,
    current_loop_page_number:int=0,
+     all_annotations_object:List = [],
+     prepare_for_review:bool = False,
    progress: Progress = Progress(track_tqdm=True)
) -> tuple[List[str], List[str]]:
    """
@@ -182,7 +186,10 @@ def prepare_image_or_pdf(
        out_message (List[str]): List to store output messages.
        first_loop_state (bool): Flag indicating if this is the first iteration.
        number_of_pages (int): integer indicating the number of pages in the document
+         all_annotations_object (List of annotation objects): All annotations for the current document
+         prepare_for_review (bool): Is this preparation step preparing pdfs and json files to review current redactions?
        progress (Progress): Progress tracker for the operation.
+

    Returns:
        tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
@@ -194,7 +201,8 @@ def prepare_image_or_pdf(
    if first_loop_state==True:
        print("first_loop_state is True")
        latest_file_completed = 0
-         out_message = []
+         out_message = []
+         all_annotations_object = []
    else:
        print("Now attempting file:", str(latest_file_completed))

@@ -222,7 +230,7 @@ def prepare_image_or_pdf(
    else:
        file_path_number = len(file_paths)

-     print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
+     #print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
    print("Number of file paths:", file_path_number)
    print("Latest_file_completed:", latest_file_completed)

@@ -235,7 +243,7 @@ def prepare_image_or_pdf(
        final_out_message = '\n'.join(out_message)
    else:
        final_out_message = out_message
-     return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+     return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

    #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

@@ -245,13 +253,16 @@ def prepare_image_or_pdf(
        file_paths_list = [file_paths]
        file_paths_loop = file_paths_list
    else:
-         file_paths_list = file_paths
-         file_paths_loop = [file_paths_list[int(latest_file_completed)]]
-
-
-     #print("file_paths_loop:", str(file_paths_loop))
+         if prepare_for_review == False:
+             file_paths_list = file_paths
+             file_paths_loop = [file_paths_list[int(latest_file_completed)]]
+         else:
+             file_paths_list = file_paths
+             file_paths_loop = file_paths
+             # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
+             file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

-     #for file in progress.tqdm(file_paths, desc="Preparing files"):
+     # Loop through files to load in
    for file in file_paths_loop:
        if isinstance(file, str):
            file_path = file
@@ -259,50 +270,87 @@ def prepare_image_or_pdf(
            file_path = file.name
        file_path_without_ext = get_file_path_end(file_path)

-         #print("file:", file_path)
+         if not file_path:
+             out_message = "Please select a file."
+             print(out_message)
+             return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

        file_extension = os.path.splitext(file_path)[1].lower()

        # Check if the file is an image type
        if file_extension in ['.jpg', '.jpeg', '.png']:
-             in_redact_method = "Quick image analysis - typed text"
+             in_redact_method = tesseract_ocr_option

-         # If the file loaded in is json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-         if file_extension in ['.json']:
-             json_contents = json.load(file_path)
-             # Write the response to a JSON file
-             out_folder = output_folder + file_path
-             with open(file_path, 'w') as json_file:
-                 json.dump(json_contents, out_folder, indent=4) # indent=4 makes the JSON file pretty-printed
-             continue
-
-         #if file_path:
-         #    file_path_without_ext = get_file_path_end(file_path)
-         if not file_path:
-             out_message = "No file selected"
-             print(out_message)
-             return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
-
-         if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
-             # Analyse and redact image-based pdf or image
+         # If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
+         if file_path.endswith(".json"):
+
+             if prepare_for_review == True:
+                 if isinstance(file_path, str):
+                     with open(file_path, 'r') as json_file:
+                         all_annotations_object = json.load(json_file)
+                 else:
+                     # Assuming file_path is a NamedString or similar
+                     all_annotations_object = json.loads(file_path) # Use loads for string content
+
+                 # Get list of page numbers
+                 image_file_paths_pages = [
+                     int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
+                     for s in image_file_paths
+                     if re.search(r'_(\d+)\.png$', os.path.basename(s))
+                 ]
+                 image_file_paths_pages = [int(i) for i in image_file_paths_pages]
+
+                 # If PDF pages have been converted to image files, replace the current image paths in the json with these
+                 if image_file_paths:
+                     for i, annotation in enumerate(all_annotations_object):
+                         annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+
+                         # Check if the annotation page number exists in the image file paths pages
+                         if annotation_page_number in image_file_paths_pages:
+                             # Set the correct image page directly since we know it's in the list
+                             correct_image_page = annotation_page_number
+                             annotation["image"] = image_file_paths[correct_image_page]
+                         else:
+                             print("Page not found.")
+
+                 #print("all_annotations_object:", all_annotations_object)
+
+                 # Write the response to a JSON file in the output folder
+                 out_folder = output_folder + file_path_without_ext + file_extension
+                 with open(out_folder, 'w') as json_file:
+                     json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                 continue
+
+             else:
+                 # If the file name ends with textract.json, assume this is a textract response object. Save it to the output folder so it can be found later during redaction, and go to the next file.
+                 json_contents = json.load(file_path)
+                 # Write the response to a JSON file in the output folder
+                 out_folder = output_folder + file_path_without_ext + file_extension
+                 with open(out_folder, 'w') as json_file:
+                     json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                 continue
+
+         # Convert pdf/image file to correct format for redaction
+         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
            if is_pdf_or_image(file_path) == False:
                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                print(out_message)
-                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

            converted_file_path = process_file(file_path)
            image_file_path = converted_file_path
-             #print("Out file path at image conversion step:", converted_file_path)

-         elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+         elif in_redact_method == text_ocr_option:
            if is_pdf(file_path) == False:
                out_message = "Please upload a PDF file for text analysis."
                print(out_message)
-                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                 return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

            converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
            image_file_path = process_file(file_path)
-

        converted_file_paths.append(converted_file_path)
        image_file_paths.extend(image_file_path)
@@ -310,7 +358,7 @@ def prepare_image_or_pdf(
        # If a pdf, load as a pymupdf document
        if is_pdf(file_path):
            pymupdf_doc = pymupdf.open(file_path)
-             #print("pymupdf_doc:", pymupdf_doc)
+
        elif is_pdf_or_image(file_path): # Alternatively, if it's an image
            # Convert image to a pymupdf document
            pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -318,9 +366,7 @@ def prepare_image_or_pdf(
            rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
            page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
            page.insert_image(rect, filename=file_path) # Insert the image into the page
-             # Ensure to save the document after processing
-             #pymupdf_doc.save(output_path) # Uncomment and specify output_path if needed
-             #pymupdf_doc.close() # Close the PDF document
+

        toc = time.perf_counter()
        out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -332,9 +378,8 @@ def prepare_image_or_pdf(

    number_of_pages = len(image_file_paths)

-     print("At end of prepare_image_or_pdf function - current_loop_page_number:", current_loop_page_number)
-
-     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+
+     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
    file_path_without_ext = get_file_path_end(in_file_path)
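One detail worth noting in the annotation-reload code above: it indexes image_file_paths directly with the page number parsed from the annotation's file name, which assumes list position and page number coincide. An equivalent sketch using an explicit page-to-path lookup (the function name is illustrative, not part of the commit):

    import os
    import re

    def remap_annotation_images(all_annotations_object, image_file_paths):
        # Index each freshly converted page image (e.g. "doc_3.png") by its page number.
        page_to_path = {}
        for path in image_file_paths:
            match = re.search(r'_(\d+)\.png$', os.path.basename(path))
            if match:
                page_to_path[int(match.group(1))] = path

        # Point each annotation at the new image for the same page.
        for annotation in all_annotations_object:
            match = re.search(r'_(\d+)\.png$', os.path.basename(annotation["image"]))
            if match and int(match.group(1)) in page_to_path:
                annotation["image"] = page_to_path[int(match.group(1))]
            else:
                print("Page not found.")

        return all_annotations_object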
tools/file_redaction.py CHANGED
@@ -8,7 +8,6 @@ import boto3
8
  from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
10
  ImageFile.LOAD_TRUNCATED_IMAGES = True
11
-
12
  from typing import List, Dict, Tuple
13
  import pandas as pd
14
 
@@ -19,32 +18,27 @@ from pikepdf import Pdf, Dictionary, Name
19
  import pymupdf
20
  from pymupdf import Rect
21
  from fitz import Document, Page
22
-
23
  import gradio as gr
24
  from gradio import Progress
25
  from collections import defaultdict # For efficient grouping
26
 
27
  from presidio_analyzer import RecognizerResult
28
-
29
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30
  from tools.file_conversion import process_file, image_dpi
31
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
32
- from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34
- # from tools.data_anonymise import generate_decision_process_output
35
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
36
- from tools.aws_functions import comprehend_client
37
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
38
 
39
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
40
-
41
  page_break_value = get_or_create_env_var('page_break_value', '500')
42
  print(f'The value of page_break_value is {page_break_value}')
43
 
44
  max_time_value = get_or_create_env_var('max_time_value', '105')
45
  print(f'The value of max_time_value is {max_time_value}')
46
 
47
-
48
  def sum_numbers_before_seconds(string:str):
49
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
50
 
@@ -192,8 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
192
  else:
193
  in_allow_list_flat = []
194
 
195
- progress(0.5, desc="Redacting file")
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  if isinstance(file_paths, str):
199
  file_paths_list = [file_paths]
@@ -217,28 +236,21 @@ def choose_and_run_redactor(file_paths:List[str],
217
  if is_a_pdf == False:
218
  # If user has not submitted a pdf, assume it's an image
219
  print("File is not a pdf, assuming that image analysis needs to be used.")
220
- in_redact_method = "Quick image analysis - typed text"
221
  else:
222
  out_message = "No file selected"
223
  print(out_message)
224
 
225
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
226
 
227
- if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
228
 
229
- if in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
230
- # Try accessing Textract through boto3
231
- try:
232
- boto3.client('textract')
233
- except:
234
- out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
235
- print(out_message)
236
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
237
 
238
  #Analyse and redact image-based pdf or image
239
  if is_pdf_or_image(file_path) == False:
240
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
241
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
242
 
243
  print("Redacting file " + file_path_without_ext + " as an image-based file")
244
 
@@ -262,14 +274,16 @@ def choose_and_run_redactor(file_paths:List[str],
262
  all_decision_process_table,
263
  pymupdf_doc,
264
  pii_identification_method,
265
- comprehend_query_number)
266
 
267
  # Save Textract request metadata (if exists)
268
  if new_request_metadata:
269
  print("Request metadata:", new_request_metadata)
270
  all_request_metadata.append(new_request_metadata)
271
 
272
- elif in_redact_method == "Simple text analysis - PDFs with selectable text":
273
 
274
  logging_file_paths = ""
275
 
@@ -287,7 +301,7 @@ def choose_and_run_redactor(file_paths:List[str],
287
  in_allow_list_flat,
288
  page_min,
289
  page_max,
290
- "Simple text analysis - PDFs with selectable text",
291
  current_loop_page,
292
  page_break_return,
293
  annotations_all_pages,
@@ -295,7 +309,8 @@ def choose_and_run_redactor(file_paths:List[str],
295
  all_decision_process_table,
296
  pymupdf_doc,
297
  pii_identification_method,
298
- comprehend_query_number)
 
299
 
300
  else:
301
  out_message = "No redaction method selected"
@@ -328,14 +343,21 @@ def choose_and_run_redactor(file_paths:List[str],
328
 
329
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
330
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
331
- #log_files_output_paths.append(logs_output_file_name)
332
  out_file_paths.append(logs_output_file_name)
333
 
334
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
335
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
336
- #log_files_output_paths.append(all_text_output_file_name)
337
  out_file_paths.append(all_text_output_file_name)
338
 
339
  # Make a combined message for the file
340
  if isinstance(out_message, list):
341
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
@@ -351,38 +373,6 @@ def choose_and_run_redactor(file_paths:List[str],
351
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
352
  print("Estimated total processing time:", str(estimate_total_processing_time))
353
 
354
- #out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
355
- #combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
356
-
357
- # Increase latest file completed count unless we are at the last file
358
- # if latest_file_completed != len(file_paths):
359
- # print("Completed file number:", str(latest_file_completed), "more files to do")
360
-
361
- # if current_loop_page >= number_of_pages:
362
-
363
- # print("Current page loop", current_loop_page, "is greater than or equal to number of pages:", number_of_pages)
364
- # latest_file_completed += 1
365
-
366
- # # Set to 999 to be a big number not to interrupt processing of large files by user
367
- # current_loop_page = 999
368
-
369
- # out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
370
- # pymupdf_doc.save(out_text_file_path)
371
- # out_file_paths.append(out_text_file_path)
372
-
373
- # # Write logs to file
374
- # decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
375
- # all_decision_process_table.to_csv(decision_logs_output_file_name)
376
- # log_files_output_paths.append(decision_logs_output_file_name)
377
-
378
- # all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
379
- # all_line_level_ocr_results_df.to_csv(all_text_output_file_name)
380
- # log_files_output_paths.append(all_text_output_file_name)
381
-
382
- # out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
383
-
384
- # if isinstance(out_message, list):
385
- # out_message.append(out_message_new) # Ensure out_message is a list of strings
386
  else:
387
  toc = time.perf_counter()
388
  time_taken = toc - tic
@@ -501,27 +491,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
501
 
502
  return x1, new_y1, x2, new_y2
503
 
504
- # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
505
- # '''
506
- # Converts coordinates from pymupdf format to image coordinates.
507
- # '''
508
-
509
- # rect_height = pymupdf_page.rect.height
510
- # rect_width = pymupdf_page.rect.width
511
-
512
- # image_page_width, image_page_height = image.size
513
-
514
- # # Calculate scaling factors between pymupdf and PIL image
515
- # scale_width = image_page_width / rect_width
516
- # scale_height = image_page_height / rect_height
517
-
518
- # x1_image = x1 * scale_width
519
- # y1_image = ((rect_height - y2) * scale_height)
520
- # x2_image = x2 * scale_width
521
- # y2_image = ((rect_height - y1) * scale_height)
522
-
523
- # return x1_image, y1_image, x2_image, y2_image
524
-
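The commented-out conversion deleted above already spells out the core arithmetic; restated as a cleaned-up sketch (the replacement that follows may refine edge cases not visible in this hunk):

def convert_pymupdf_to_image_coords_sketch(pymupdf_page, x1, y1, x2, y2, image):
    # Scale factors between the PDF page rectangle and the rendered image
    scale_width = image.size[0] / pymupdf_page.rect.width
    scale_height = image.size[1] / pymupdf_page.rect.height
    rect_height = pymupdf_page.rect.height

    # x scales directly; y is flipped because the two coordinate systems
    # place their origins on opposite edges of the page
    return (x1 * scale_width,
            (rect_height - y2) * scale_height,
            x2 * scale_width,
            (rect_height - y1) * scale_height)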
525
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
526
  '''
527
  Converts coordinates from pymupdf format to image coordinates,
@@ -625,10 +594,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
625
  # Should already be in correct format if img_annotator_box is an input
626
  if isinstance(annot, dict):
627
  img_annotation_box = annot
628
- #try:
629
- # img_annotation_box["label"] = annot["label"]
630
- #except:
631
- # img_annotation_box["label"] = "Redaction"
632
 
633
  x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
634
 
@@ -823,7 +788,7 @@ def redact_image_pdf(file_path:str,
823
  is_a_pdf:bool=True,
824
  page_min:int=0,
825
  page_max:int=999,
826
- analysis_type:str="Quick image analysis - typed text",
827
  handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
828
  request_metadata:str="", current_loop_page:int=0,
829
  page_break_return:bool=False,
@@ -834,6 +799,8 @@ def redact_image_pdf(file_path:str,
834
  pymupdf_doc = [],
835
  pii_identification_method:str="Local",
836
  comprehend_query_number:int=0,
837
  page_break_val:int=int(page_break_value),
838
  logging_file_paths:List=[],
839
  max_time:int=int(max_time_value),
@@ -851,7 +818,7 @@ def redact_image_pdf(file_path:str,
851
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
852
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
853
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
854
- - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
855
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
856
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
857
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
@@ -862,6 +829,8 @@ def redact_image_pdf(file_path:str,
862
  - pymupdf_doc (List, optional): The document as a PyMuPDF object.
863
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
864
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
865
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
866
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
867
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -874,7 +843,15 @@ def redact_image_pdf(file_path:str,
874
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
875
  comprehend_query_number_new = 0
876
 
877
- #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
878
 
879
  tic = time.perf_counter()
880
 
@@ -897,8 +874,8 @@ def redact_image_pdf(file_path:str,
897
  print("Page range:", str(page_min + 1), "to", str(page_max))
898
  #print("Current_loop_page:", current_loop_page)
899
 
900
- if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
901
- elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
902
 
903
  if current_loop_page == 0: page_loop_start = 0
904
  else: page_loop_start = current_loop_page
@@ -942,7 +919,7 @@ def redact_image_pdf(file_path:str,
942
  else: ocr_lang = language
943
 
944
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
945
- if analysis_type == "Quick image analysis - typed text":
946
 
947
  word_level_ocr_results = image_analyser.perform_ocr(image)
948
 
@@ -951,7 +928,7 @@ def redact_image_pdf(file_path:str,
951
 
952
 
953
  # Import results from json and convert
954
- if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
955
 
956
  # Convert the image to bytes using an in-memory buffer
957
  image_buffer = io.BytesIO()
@@ -962,7 +939,7 @@ def redact_image_pdf(file_path:str,
962
  json_file_path = output_folder + file_name + "_textract.json"
963
 
964
  if not os.path.exists(json_file_path):
965
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
966
  logging_file_paths.append(json_file_path)
967
  request_metadata = request_metadata + "\n" + new_request_metadata
968
 
@@ -1010,7 +987,8 @@ def redact_image_pdf(file_path:str,
1010
  line_level_ocr_results,
1011
  line_level_ocr_results_with_children,
1012
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1013
- pii_identification_method = pii_identification_method,
 
1014
  language=language,
1015
  entities=chosen_redact_entities,
1016
  allow_list=allow_list,
@@ -1018,21 +996,13 @@ def redact_image_pdf(file_path:str,
1018
  )
1019
 
1020
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1021
-
1022
- # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
1023
- # line_level_ocr_results_with_children,
1024
- # language,
1025
- # chosen_redact_entities,
1026
- # allow_list,
1027
- # score_threshold,
1028
- # pii_identification_method)
1029
 
1030
  else:
1031
  redaction_bboxes = []
1032
 
1033
 
1034
- if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1035
- elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1036
 
1037
  # Save decision making process
1038
  bboxes_str = str(redaction_bboxes)
@@ -1409,7 +1379,7 @@ def redact_text_pdf(
1409
  allow_list: List[str] = None, # Optional list of allowed entities
1410
  page_min: int = 0, # Minimum page number to start redaction
1411
  page_max: int = 999, # Maximum page number to end redaction
1412
- analysis_type: str = "Simple text analysis - PDFs with selectable text", # Type of analysis to perform
1413
  current_loop_page: int = 0, # Current page being processed in the loop
1414
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1415
  annotations_all_pages: List = [], # List of annotations across all pages
@@ -1418,6 +1388,7 @@ def redact_text_pdf(
1418
  pymupdf_doc: List = [], # List of PyMuPDF documents
1419
  pii_identification_method: str = "Local",
1420
  comprehend_query_number:int = 0,
 
1421
  page_break_val: int = int(page_break_value), # Value for page break
1422
  max_time: int = int(max_time_value),
1423
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1443,12 +1414,18 @@ def redact_text_pdf(
1443
  - all_decision_process_table: DataFrame for decision process table
1444
  - pymupdf_doc: List of PyMuPDF documents
1445
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1446
- - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
1447
  - page_break_val: Value for page break
1448
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1449
  - progress: Progress tracking object
1450
  '''
1451
 
 
1452
  tic = time.perf_counter()
1453
 
1454
  # Open with Pikepdf to get text lines
@@ -1500,7 +1477,7 @@ def redact_text_pdf(
1500
  decision_process_table_on_page = pd.DataFrame()
1501
  page_text_outputs = pd.DataFrame()
1502
 
1503
- if analysis_type == "Simple text analysis - PDFs with selectable text":
1504
  for n, text_container in enumerate(page_layout):
1505
 
1506
  text_container_analyser_results = []
 
8
  from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
10
  ImageFile.LOAD_TRUNCATED_IMAGES = True
 
11
  from typing import List, Dict, Tuple
12
  import pandas as pd
13
 
 
18
  import pymupdf
19
  from pymupdf import Rect
20
  from fitz import Document, Page
 
21
  import gradio as gr
22
  from gradio import Progress
23
  from collections import defaultdict # For efficient grouping
24
 
25
  from presidio_analyzer import RecognizerResult
26
+ from tools.aws_functions import RUN_AWS_FUNCTIONS
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file, image_dpi
29
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
30
+ from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
31
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 
32
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 
33
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
34
 
35
  # Number of pages to loop through before breaking. Currently set very high, as the functions break on elapsed time (e.g. every 105 seconds) rather than on the number of pages redacted.
 
36
  page_break_value = get_or_create_env_var('page_break_value', '500')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
  max_time_value = get_or_create_env_var('max_time_value', '105')
40
  print(f'The value of max_time_value is {max_time_value}')
41
 
 
42
  def sum_numbers_before_seconds(string:str):
43
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
44
 
 
186
  else:
187
  in_allow_list_flat = []
188
 
 
189
 
190
+ # Try to connect to AWS services only if the RUN_AWS_FUNCTIONS environment variable is "1"
191
+ if pii_identification_method == "AWS Comprehend":
192
+ print("Trying to connect to AWS Comprehend service")
193
+ if RUN_AWS_FUNCTIONS == "1":
194
+ comprehend_client = boto3.client('comprehend')
195
+ else:
196
+ comprehend_client = ""
197
+ out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
198
+ print(out_message)
199
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
200
+ else:
201
+ comprehend_client = ""
202
+
203
+ if in_redact_method == textract_option:
204
+ print("Trying to connect to AWS Comprehend service")
205
+ if RUN_AWS_FUNCTIONS == "1":
206
+ textract_client = boto3.client('textract')
207
+ else:
208
+ textract_client = ""
209
+ out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
210
+ print(out_message)
211
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
212
+ else:
213
+ textract_client = ""
214
+
215
+ progress(0.5, desc="Redacting file")
216
 
217
  if isinstance(file_paths, str):
218
  file_paths_list = [file_paths]
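The two branches above repeat one pattern: build a boto3 client only when RUN_AWS_FUNCTIONS is "1", and otherwise fall back to an empty-string sentinel that downstream functions check. A hedged sketch of that pattern as a shared helper (make_aws_client is hypothetical, not in the source):

import boto3

def make_aws_client(service_name: str, run_aws_functions: str):
    # Returns a real client only when AWS functions are enabled;
    # "" is the sentinel later tested with e.g. comprehend_client == ""
    if run_aws_functions == "1":
        return boto3.client(service_name)
    return ""

# comprehend_client = make_aws_client("comprehend", RUN_AWS_FUNCTIONS)
# textract_client = make_aws_client("textract", RUN_AWS_FUNCTIONS)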
 
236
  if is_a_pdf == False:
237
  # If user has not submitted a pdf, assume it's an image
238
  print("File is not a pdf, assuming that image analysis needs to be used.")
239
+ in_redact_method = tesseract_ocr_option
240
  else:
241
  out_message = "No file selected"
242
  print(out_message)
243
 
244
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
245
 
246
+ if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
247
 
248
+
249
 
250
  # Analyse and redact image-based PDF or image
251
  if is_pdf_or_image(file_path) == False:
252
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
253
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
254
 
255
  print("Redacting file " + file_path_without_ext + " as an image-based file")
256
 
 
274
  all_decision_process_table,
275
  pymupdf_doc,
276
  pii_identification_method,
277
+ comprehend_query_number,
278
+ comprehend_client,
279
+ textract_client)
280
 
281
  # Save Textract request metadata (if exists)
282
  if new_request_metadata:
283
  print("Request metadata:", new_request_metadata)
284
  all_request_metadata.append(new_request_metadata)
285
 
286
+ elif in_redact_method == text_ocr_option:
287
 
288
  logging_file_paths = ""
289
 
 
301
  in_allow_list_flat,
302
  page_min,
303
  page_max,
304
+ text_ocr_option,
305
  current_loop_page,
306
  page_break_return,
307
  annotations_all_pages,
 
309
  all_decision_process_table,
310
  pymupdf_doc,
311
  pii_identification_method,
312
+ comprehend_query_number,
313
+ comprehend_client)
314
 
315
  else:
316
  out_message = "No redaction method selected"
 
343
 
344
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
345
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
 
346
  out_file_paths.append(logs_output_file_name)
347
 
348
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
349
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
 
350
  out_file_paths.append(all_text_output_file_name)
351
 
352
+ # Save the gradio_annotation_boxes to a JSON file
353
+ try:
354
+ out_annotation_file_path = out_image_file_path + '_redactions.json'
355
+ with open(out_annotation_file_path, 'w') as f:
356
+ json.dump(annotations_all_pages, f)
357
+ out_file_paths.append(out_annotation_file_path)
358
+ except Exception as e:
359
+ print("Could not save annotations to json file.")
360
+
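The saved _redactions.json is what the commit lets users upload later to resume a review. Each entry pairs a page image with its boxes; a sketch of reloading one (the path and the exact field layout are illustrative of the gradio_image_annotation format, not guaranteed):

import json

with open("output/example_redactions.json") as f:  # hypothetical path
    annotations_all_pages = json.load(f)

# Expected shape per page, roughly:
# {"image": "output/page_1.png",
#  "boxes": [{"xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40, "label": "Redaction"}]}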
361
  # Make a combined message for the file
362
  if isinstance(out_message, list):
363
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
 
373
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
374
  print("Estimated total processing time:", str(estimate_total_processing_time))
375
 
376
  else:
377
  toc = time.perf_counter()
378
  time_taken = toc - tic
 
491
 
492
  return x1, new_y1, x2, new_y2
493
 
494
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
495
  '''
496
  Converts coordinates from pymupdf format to image coordinates,
 
594
  # Should already be in correct format if img_annotator_box is an input
595
  if isinstance(annot, dict):
596
  img_annotation_box = annot
597
 
598
  x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
599
 
 
788
  is_a_pdf:bool=True,
789
  page_min:int=0,
790
  page_max:int=999,
791
+ analysis_type:str=tesseract_ocr_option,
792
  handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
793
  request_metadata:str="", current_loop_page:int=0,
794
  page_break_return:bool=False,
 
799
  pymupdf_doc = [],
800
  pii_identification_method:str="Local",
801
  comprehend_query_number:int=0,
802
+ comprehend_client="",
803
+ textract_client="",
804
  page_break_val:int=int(page_break_value),
805
  logging_file_paths:List=[],
806
  max_time:int=int(max_time_value),
 
818
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
819
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
820
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
821
+ - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
822
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
823
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
824
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
 
829
  - pymupdf_doc (List, optional): The document as a PyMuPDF object.
830
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
831
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
832
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
833
+ - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
834
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
835
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
836
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
843
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
844
  comprehend_query_number_new = 0
845
 
846
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
847
+ print("Connection to AWS Comprehend service unsuccessful.")
848
+
849
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
850
+
851
+ if analysis_type == textract_option and textract_client == "":
852
+ print("Connection to AWS Textract service unsuccessful.")
853
+
854
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
855
 
856
  tic = time.perf_counter()
857
 
 
874
  print("Page range:", str(page_min + 1), "to", str(page_max))
875
  #print("Current_loop_page:", current_loop_page)
876
 
877
+ if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
878
+ elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
879
 
880
  if current_loop_page == 0: page_loop_start = 0
881
  else: page_loop_start = current_loop_page
 
919
  else: ocr_lang = language
920
 
921
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
922
+ if analysis_type == tesseract_ocr_option:
923
 
924
  word_level_ocr_results = image_analyser.perform_ocr(image)
925
 
 
928
 
929
 
930
  # Import results from json and convert
931
+ if analysis_type == textract_option:
932
 
933
  # Convert the image to bytes using an in-memory buffer
934
  image_buffer = io.BytesIO()
 
939
  json_file_path = output_folder + file_name + "_textract.json"
940
 
941
  if not os.path.exists(json_file_path):
942
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client) # Analyse page with Textract
943
  logging_file_paths.append(json_file_path)
944
  request_metadata = request_metadata + "\n" + new_request_metadata
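analyse_page_with_textract now receives the client explicitly. A sketch of what such a wrapper plausibly does with it, using the synchronous Textract API (the real helper in tools/aws_textract may use analyze_document and richer metadata instead):

def analyse_page_with_textract_sketch(pdf_page_as_bytes, page_no, textract_client):
    # detect_document_text accepts a single page image as raw bytes
    response = textract_client.detect_document_text(Document={"Bytes": pdf_page_as_bytes})
    text_blocks = response["Blocks"]
    request_metadata = f"page {page_no}: {response['ResponseMetadata'].get('RequestId', '')}"
    return text_blocks, request_metadata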
945
 
 
987
  line_level_ocr_results,
988
  line_level_ocr_results_with_children,
989
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
990
+ pii_identification_method = pii_identification_method,
991
+ comprehend_client=comprehend_client,
992
  language=language,
993
  entities=chosen_redact_entities,
994
  allow_list=allow_list,
 
996
  )
997
 
998
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
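Each increment of comprehend_query_number corresponds to a PII query of this general shape (comprehend_client comes from the setup earlier; the sample text is made up):

line_text = "Contact jane.doe@example.com for details."  # hypothetical OCR line
response = comprehend_client.detect_pii_entities(Text=line_text, LanguageCode="en")
for entity in response["Entities"]:
    # Each entity reports a Type (e.g. "EMAIL"), a confidence Score, and character offsets
    print(entity["Type"], entity["Score"], entity["BeginOffset"], entity["EndOffset"])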
999
 
1000
  else:
1001
  redaction_bboxes = []
1002
 
1003
 
1004
+ if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1005
+ elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1006
 
1007
  # Save decision making process
1008
  bboxes_str = str(redaction_bboxes)
 
1379
  allow_list: List[str] = None, # Optional list of allowed entities
1380
  page_min: int = 0, # Minimum page number to start redaction
1381
  page_max: int = 999, # Maximum page number to end redaction
1382
+ analysis_type: str = text_ocr_option, # Type of analysis to perform
1383
  current_loop_page: int = 0, # Current page being processed in the loop
1384
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1385
  annotations_all_pages: List = [], # List of annotations across all pages
 
1388
  pymupdf_doc: List = [], # List of PyMuPDF documents
1389
  pii_identification_method: str = "Local",
1390
  comprehend_query_number:int = 0,
1391
+ comprehend_client="",
1392
  page_break_val: int = int(page_break_value), # Value for page break
1393
  max_time: int = int(max_time_value),
1394
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1414
  - all_decision_process_table: DataFrame for decision process table
1415
  - pymupdf_doc: List of PyMuPDF documents
1416
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1417
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1418
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1419
  - page_break_val: Value for page break
1420
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1421
  - progress: Progress tracking object
1422
  '''
1423
 
1424
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1425
+ print("Connection to AWS Comprehend service not found.")
1426
+
1427
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1428
+
1429
  tic = time.perf_counter()
1430
 
1431
  # Open with Pikepdf to get text lines
 
1477
  decision_process_table_on_page = pd.DataFrame()
1478
  page_text_outputs = pd.DataFrame()
1479
 
1480
+ if analysis_type == text_ocr_option:
1481
  for n, text_container in enumerate(page_layout):
1482
 
1483
  text_container_analyser_results = []
tools/helper_functions.py CHANGED
@@ -29,6 +29,16 @@ def get_or_create_env_var(var_name, default_value):
29
 
30
  return value
31
 
32
  # Retrieving or setting output folder
33
  env_var_name = 'GRADIO_OUTPUT_FOLDER'
34
  default_value = 'output/'
 
29
 
30
  return value
31
 
32
+
33
+ # Names for options labels
34
+ text_ocr_option = "Simple text analysis - docs with selectable text"
35
+ tesseract_ocr_option = "OCR analysis for documents without selectable text - best for typed text"
36
+ textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
37
+
38
+ local_pii_detector = "Local"
39
+ aws_pii_detector = "AWS Comprehend"
40
+
41
+
42
  # Retrieving or setting output folder
43
  env_var_name = 'GRADIO_OUTPUT_FOLDER'
44
  default_value = 'output/'
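For context, get_or_create_env_var (whose signature and return appear at the top of this file) most plausibly reads the variable and falls back to the default, along these lines (a sketch, not the verbatim body):

import os

def get_or_create_env_var(var_name, default_value):
    # Use the existing environment value if present, otherwise set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value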
tools/redaction_review.py CHANGED
@@ -47,24 +47,32 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
47
 
48
  return current_zoom_level, annotate_current_page
49
 
50
-
51
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
52
- # print("\nImage annotator object:", image_annotator_object)
53
 
54
  zoom_str = str(zoom) + '%'
55
 
56
  if not image_annotator_object:
57
- return image_annotator(
58
  label="Modify redaction boxes",
59
  #label_list=["Redaction"],
60
  #label_colors=[(0, 0, 0)],
61
  show_label=False,
62
- sources=["upload"],
63
  show_clear_button=False,
64
  show_share_button=False,
65
  show_remove_button=False,
66
- interactive=False
67
- ), gr.Number(label = "Page (press enter to change)", value=1, precision=0)
68
 
69
  if page_num is None:
70
  page_num = 0
@@ -72,8 +80,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
72
  # Check bounding values for current page and page max
73
  if page_num > 0:
74
  page_num_reported = page_num
75
- #page_num = page_num - 1
76
  elif page_num == 0: page_num_reported = 1
 
77
  else:
78
  page_num = 0
79
  page_num_reported = 1
@@ -83,7 +92,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
83
  if page_num_reported > page_max_reported:
84
  page_num_reported = page_max_reported
85
 
86
- out_image_annotator = image_annotator(value = image_annotator_object[page_num_reported - 1],
87
  boxes_alpha=0.1,
88
  box_thickness=1,
89
  #label_list=["Redaction"],
@@ -104,30 +115,26 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
104
 
105
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
106
 
107
- return out_image_annotator, number_reported, number_reported
108
 
109
- def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
110
  '''
111
  Overwrite current image annotations with modifications
112
  '''
113
-
114
  if not current_page:
115
  current_page = 1
116
 
117
  #If no previous page or is 0, i.e. first time run, then rewrite current page
118
- if not previous_page:
119
- previous_page = current_page
120
- #return all_image_annotations, current_page, current_page
121
-
122
- #print("all_image_annotations before:",all_image_annotations)
123
 
124
  image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
125
 
126
- #print("image_annotated:", image_annotated)
127
-
128
- all_image_annotations[previous_page - 1] = image_annotated
129
-
130
- #print("all_image_annotations after:",all_image_annotations)
131
 
132
  return all_image_annotations, current_page, current_page
133
 
@@ -178,7 +185,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
178
 
179
  draw.rectangle(coords, fill=fill)
180
 
181
- image.save(output_folder + file_base + "_redacted_mod.png")
182
 
183
  doc = [image]
184
 
@@ -213,13 +220,13 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
213
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
214
 
215
  #try:
216
- out_pdf_file_path = output_folder + file_base + "_redacted_mod.pdf"
217
  unredacted_doc.save(out_pdf_file_path)
218
  output_files.append(out_pdf_file_path)
219
 
220
  # Save the gradio_annotation_boxes to a JSON file
221
  try:
222
- out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
223
  with open(out_annotation_file_path, 'w') as f:
224
  json.dump(all_image_annotations, f)
225
  output_files.append(out_annotation_file_path)
 
47
 
48
  return current_zoom_level, annotate_current_page
49
 
 
50
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
51
+ '''
52
+ Update a gradio_image_annotation object with new annotation data
53
+ '''
54
 
55
  zoom_str = str(zoom) + '%'
56
 
57
  if not image_annotator_object:
58
+ out_image_annotator = image_annotator(
59
  label="Modify redaction boxes",
60
  #label_list=["Redaction"],
61
  #label_colors=[(0, 0, 0)],
62
+ height=zoom_str,
63
+ width=zoom_str,
64
  show_label=False,
65
+ sources=None,
66
  show_clear_button=False,
67
  show_share_button=False,
68
  show_remove_button=False,
69
+ interactive=False)
70
+
71
+ number_reported = gr.Number(label = "Page (press enter to change)", value=1, precision=0)
72
+
73
+ return out_image_annotator, number_reported, number_reported
74
+
75
+ print("page_num at start of update_annotator function:", page_num)
76
 
77
  if page_num is None:
78
  page_num = 0
 
80
  # Check bounding values for current page and page max
81
  if page_num > 0:
82
  page_num_reported = page_num
83
+
84
  elif page_num == 0: page_num_reported = 1
85
+
86
  else:
87
  page_num = 0
88
  page_num_reported = 1
 
92
  if page_num_reported > page_max_reported:
93
  page_num_reported = page_max_reported
94
 
95
+
96
+ out_image_annotator = image_annotator(
97
+ value = image_annotator_object[page_num_reported - 1],
98
  boxes_alpha=0.1,
99
  box_thickness=1,
100
  #label_list=["Redaction"],
 
115
 
116
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
117
 
118
+ return out_image_annotator, number_reported, number_reported, page_num_reported
119
 
120
+ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
121
  '''
122
  Overwrite current image annotations with modifications
123
  '''
124
+
125
  if not current_page:
126
  current_page = 1
127
 
128
  #If no previous page or is 0, i.e. first time run, then rewrite current page
129
+ #if not previous_page:
130
+ # previous_page = current_page
131
 
132
  image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
133
 
134
+ if clear_all == False:
135
+ all_image_annotations[previous_page - 1] = image_annotated
136
+ else:
137
+ all_image_annotations[previous_page - 1]["boxes"] = []
 
138
 
139
  return all_image_annotations, current_page, current_page
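The new clear_all flag lets one handler serve both "apply my edits" and "clear this page". A sketch of wiring the clearing path in Gradio (component names are hypothetical; gr.State(True) simply pins clear_all to True for this event):

clear_btn.click(
    modify_existing_page_redactions,
    inputs=[annotator, current_page_num, previous_page_num, all_annotations_state, gr.State(True)],
    outputs=[all_annotations_state, current_page_num, previous_page_num],
)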
140
 
 
185
 
186
  draw.rectangle(coords, fill=fill)
187
 
188
+ image.save(output_folder + file_base + "_redacted.png")
189
 
190
  doc = [image]
191
 
 
220
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
221
 
222
  #try:
223
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
224
  unredacted_doc.save(out_pdf_file_path)
225
  output_files.append(out_pdf_file_path)
226
 
227
  # Save the gradio_annotation_boxes to a JSON file
228
  try:
229
+ out_annotation_file_path = output_folder + file_base + '_redactions.json'
230
  with open(out_annotation_file_path, 'w') as f:
231
  json.dump(all_image_annotations, f)
232
  output_files.append(out_annotation_file_path)