seanpedrickcase committed on
Commit
4276db1
·
1 Parent(s): b805ec6

Added a workaround for an issue with selectdata on filtered dataframes. Rearranged some components.

Browse files
app.py CHANGED
@@ -4,11 +4,11 @@ import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
 
6
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
7
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken
8
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
11
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df
12
  from tools.data_anonymise import anonymise_data_files
13
  from tools.auth import authenticate_user
14
  from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -60,7 +60,7 @@ with app:
60
  pdf_doc_state = gr.State([])
61
  all_image_annotations_state = gr.State([])
62
 
63
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas", wrap=True)
64
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
65
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
66
 
@@ -110,6 +110,7 @@ with app:
110
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
111
  doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
112
  latest_review_file_path = gr.Textbox(label = "latest_review_file_path", value="", visible=False) # Latest review file path output from redaction
 
113
 
114
  data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
115
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
@@ -152,8 +153,10 @@ with app:
152
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
153
  enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
154
 
155
- # Base dataframe for recognisers that is not modified subsequent to load
156
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
 
 
157
 
158
  # Duplicate page detection
159
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
@@ -198,11 +201,11 @@ with app:
198
 
199
  text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
200
 
201
- with gr.Row(equal_height=True):
202
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
203
 
204
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
205
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
206
 
207
  if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
208
  with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
@@ -216,22 +219,23 @@ with app:
216
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
217
 
218
  if SHOW_COSTS == "True":
219
- with gr.Accordion("Estimated costs and time taken", open = False, visible=True):
220
  with gr.Row(equal_height=True):
221
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
222
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
223
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
224
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
225
-
226
- gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
227
 
228
- document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
229
-
230
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
231
  with gr.Accordion("Apply cost code", open = True, visible=True):
232
- with gr.Row(equal_height=True):
233
- cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='search', visible=True, wrap=True, max_height=200)
234
- cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
 
 
 
 
 
235
 
236
  with gr.Row():
237
  output_summary = gr.Textbox(label="Output summary", scale=1)
@@ -253,7 +257,7 @@ with app:
253
  with gr.Tab("Review redactions", id="tab_object_annotation"):
254
 
255
  with gr.Accordion(label = "Review PDF redactions", open=True):
256
- output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
257
  upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="secondary")
258
  with gr.Row():
259
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
@@ -269,7 +273,8 @@ with app:
269
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
270
  annotation_next_page_button = gr.Button("Next page", scale = 4)
271
  with gr.Column(scale=1):
272
- annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
 
273
 
274
  with gr.Row():
275
  with gr.Column(scale=2):
@@ -293,21 +298,27 @@ with app:
293
  interactive=False
294
  )
295
  with gr.Column(scale=1):
296
- with gr.Row(equal_height=True):
297
- recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
298
- page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
299
- text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
300
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
 
 
301
 
302
- with gr.Row(equal_height=True):
303
- exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
304
- exclude_selected_btn = gr.Button(value="Exclude all items in table from redactions")
305
- with gr.Row(equal_height=True):
306
- reset_dropdowns_btn = gr.Button(value="Reset filters")
 
 
307
 
308
- undo_last_removal_btn = gr.Button(value="Undo last element removal")
309
- update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
310
- selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
 
 
311
 
312
  with gr.Row():
313
  with gr.Column(scale=2):
@@ -317,7 +328,7 @@ with app:
317
  annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
318
  annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
319
  with gr.Column(scale=1):
320
- blank_markdown_bot = gr.Markdown(value="", label="")
321
 
322
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
323
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -447,30 +458,34 @@ with app:
447
  # Allow user to select items from cost code dataframe for cost code
448
  if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
449
  cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
 
450
 
451
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
452
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
453
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
454
 
455
  # Run redaction function
456
- document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
457
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
458
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
459
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number], api_name="redact_doc").\
460
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
461
 
462
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
463
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
464
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number]).\
465
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
466
 
467
  # If a file has been completed, the function will continue onto the next document
468
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
469
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number]).\
470
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
471
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
472
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
473
 
 
 
 
474
  ###
475
  # REVIEW PDF REDACTIONS
476
  ###
@@ -478,7 +493,7 @@ with app:
478
  # Upload previous files for modifying redactions
479
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
480
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
481
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state], api_name="prepare_doc").\
482
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
483
 
484
  # Page number controls
@@ -531,14 +546,18 @@ with app:
531
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
532
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
533
 
 
 
 
 
534
  # Convert review file to xfdf Adobe format
535
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
536
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
537
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
538
 
539
  # Convert xfdf Adobe file back to review_file.csv
540
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
541
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
542
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
543
 
544
  ###
@@ -599,11 +618,11 @@ with app:
599
  if GET_COST_CODES == "True" and COST_CODES_PATH:
600
  if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
601
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
602
- success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
603
  print("Successfully loaded cost codes from S3")
604
  elif os.path.exists(COST_CODES_PATH):
605
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
606
- app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
607
  else: print("Could not load in cost code data")
608
 
609
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
@@ -650,7 +669,7 @@ if __name__ == "__main__":
650
 
651
  main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
652
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
653
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
654
 
655
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
656
  # with gr.Tab(label="Advanced options"):
 
4
  from gradio_image_annotation import image_annotator
5
 
6
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
7
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
8
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
11
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
12
  from tools.data_anonymise import anonymise_data_files
13
  from tools.auth import authenticate_user
14
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
60
  pdf_doc_state = gr.State([])
61
  all_image_annotations_state = gr.State([])
62
 
63
+
64
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
65
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
66
 
 
110
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
111
  doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
112
  latest_review_file_path = gr.Textbox(label = "latest_review_file_path", value="", visible=False) # Latest review file path output from redaction
113
+ latest_ocr_file_path = gr.Textbox(label = "latest_ocr_file_path", value="", visible=False) # Latest ocr file path output from text extraction
114
 
115
  data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
116
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
 
153
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
154
  enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
155
 
156
+ # Base tables that are not modified subsequent to load
157
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
158
+ all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
159
+ cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
160
 
161
  # Duplicate page detection
162
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
 
201
 
202
  text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
203
 
204
+ with gr.Row(equal_height=True):
205
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
206
 
207
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
208
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
209
 
210
  if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
211
  with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
 
219
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
220
 
221
  if SHOW_COSTS == "True":
222
+ with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
223
  with gr.Row(equal_height=True):
224
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
225
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
226
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
227
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
 
 
228
 
 
 
229
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
230
  with gr.Accordion("Apply cost code", open = True, visible=True):
231
+ with gr.Row():
232
+ cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
233
+ with gr.Column():
234
+ reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
235
+ cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
236
+
237
+ gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
238
+ document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
239
 
240
  with gr.Row():
241
  output_summary = gr.Textbox(label="Output summary", scale=1)
 
257
  with gr.Tab("Review redactions", id="tab_object_annotation"):
258
 
259
  with gr.Accordion(label = "Review PDF redactions", open=True):
260
+ output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=file_input_height)
261
  upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="secondary")
262
  with gr.Row():
263
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
 
273
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
274
  annotation_next_page_button = gr.Button("Next page", scale = 4)
275
  with gr.Column(scale=1):
276
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
277
+
278
 
279
  with gr.Row():
280
  with gr.Column(scale=2):
 
298
  interactive=False
299
  )
300
  with gr.Column(scale=1):
301
+ update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
302
+ with gr.Accordion("Search suggested redactions", open=True):
303
+ with gr.Row(equal_height=True):
304
+ recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
305
+ page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
306
+ text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
307
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
308
 
309
+ with gr.Row(equal_height=True):
310
+ exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
311
+ exclude_selected_btn = gr.Button(value="Exclude all items in table from redactions")
312
+ with gr.Row(equal_height=True):
313
+ reset_dropdowns_btn = gr.Button(value="Reset filters")
314
+
315
+ undo_last_removal_btn = gr.Button(value="Undo last element removal")
316
 
317
+ selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
318
+
319
+ with gr.Accordion("Search all extracted text", open=True):
320
+ all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
321
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
322
 
323
  with gr.Row():
324
  with gr.Column(scale=2):
 
328
  annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
329
  annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
330
  with gr.Column(scale=1):
331
+ blank_markdown_bot = gr.Markdown(value="", label="")
332
 
333
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
334
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
458
  # Allow the user to choose a cost code by selecting a row in the cost code dataframe
459
  if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
460
  cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
461
+ reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
462
 
463
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
464
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
465
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
466
 
467
  # Run redaction function
468
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
469
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
470
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
471
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
472
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
473
 
474
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
475
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
476
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
477
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
478
 
479
  # If a file has been completed, the function will continue onto the next document
480
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
481
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
482
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
483
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
484
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
485
 
486
+ # If the line-level OCR results change, either because the user loads a file or because a new redaction task runs, replace the OCR results displayed in the table
487
+ all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
488
+
489
  ###
490
  # REVIEW PDF REDACTIONS
491
  ###
 
493
  # Upload previous files for modifying redactions
494
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
495
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
496
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
497
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
498
 
499
  # Page number controls
 
546
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
547
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
548
 
549
+ # Review OCR text button
550
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
551
+ reset_all_ocr_results_btn.click(reset_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
552
+
553
  # Convert review file to xfdf Adobe format
554
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
555
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
556
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
557
 
558
  # Convert xfdf Adobe file back to review_file.csv
559
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
560
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
561
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
562
 
563
  ###
 
618
  if GET_COST_CODES == "True" and COST_CODES_PATH:
619
  if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
620
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
621
+ success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
622
  print("Successfully loaded cost codes from S3")
623
  elif os.path.exists(COST_CODES_PATH):
624
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
625
+ app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
626
  else: print("Could not load in cost code data")
627
 
628
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
 
669
 
670
  main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
671
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
672
+ current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
673
 
674
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
675
  # with gr.Tab(label="Advanced options"):
tools/config.py CHANGED
@@ -161,7 +161,7 @@ COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CE
161
 
162
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
 
164
- ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, are they compulsory?
165
 
166
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
  if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
 
161
 
162
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
 
164
+ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
165
 
166
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
  if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
tools/file_conversion.py CHANGED
@@ -251,7 +251,7 @@ def get_input_file_names(file_input:List[str]):
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
  # Check if the file is in acceptable types
254
- if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
255
  all_relevant_files.append(file_path_without_ext)
256
  file_name_with_extension = file_path_without_ext + file_extension
257
  full_file_name = file_path
@@ -480,6 +480,7 @@ def prepare_image_or_pdf(
480
  pymupdf_doc = []
481
  all_img_details = []
482
  review_file_csv = pd.DataFrame()
 
483
 
484
  if isinstance(in_fully_redacted_list, pd.DataFrame):
485
  if not in_fully_redacted_list.empty:
@@ -512,7 +513,7 @@ def prepare_image_or_pdf(
512
  final_out_message = '\n'.join(out_message)
513
  else:
514
  final_out_message = out_message
515
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
516
 
517
  progress(0.1, desc='Preparing file')
518
 
@@ -600,11 +601,17 @@ def prepare_image_or_pdf(
600
  pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
601
 
602
  elif file_extension in ['.csv']:
603
- review_file_csv = read_file(file)
604
- all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
605
- json_from_csv = True
606
- print("Converted CSV review file to json")
607
-
 
 
 
 
 
 
608
  # If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
609
  if (file_extension in ['.json']) | (json_from_csv == True):
610
 
@@ -623,11 +630,10 @@ def prepare_image_or_pdf(
623
 
624
  # Use shutil to copy the file directly
625
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
626
-
627
- textract_output_found = True
628
-
629
  continue
630
 
 
631
  # If you have an annotations object from the above code
632
  if all_annotations_object:
633
 
@@ -669,7 +675,6 @@ def prepare_image_or_pdf(
669
  print("Page", annotation_page_number, "image file not found.")
670
 
671
  all_annotations_object[i] = annotation
672
-
673
 
674
  if isinstance(in_fully_redacted_list, list):
675
  in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
@@ -717,6 +722,9 @@ def prepare_image_or_pdf(
717
  else:
718
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
719
 
 
 
 
720
  # Must be something else, return with error message
721
  else:
722
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
@@ -744,7 +752,7 @@ def prepare_image_or_pdf(
744
 
745
  number_of_pages = len(image_file_paths)
746
 
747
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
748
 
749
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
750
  file_path_without_ext = get_file_name_without_type(in_file_path)
@@ -1196,7 +1204,7 @@ def create_annotation_dicts_from_annotation_df(
1196
 
1197
  # Check if the DataFrame is empty or lacks necessary columns
1198
  if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1199
- print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1200
  return list(image_dict.values()) # Return based on page_sizes only
1201
 
1202
  # 2. Define columns to extract for boxes and check availability
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
  # Check if the file is in acceptable types
254
+ if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext):
255
  all_relevant_files.append(file_path_without_ext)
256
  file_name_with_extension = file_path_without_ext + file_extension
257
  full_file_name = file_path
 
480
  pymupdf_doc = []
481
  all_img_details = []
482
  review_file_csv = pd.DataFrame()
483
+ all_line_level_ocr_results_df = pd.DataFrame()
484
 
485
  if isinstance(in_fully_redacted_list, pd.DataFrame):
486
  if not in_fully_redacted_list.empty:
 
513
  final_out_message = '\n'.join(out_message)
514
  else:
515
  final_out_message = out_message
516
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
517
 
518
  progress(0.1, desc='Preparing file')
519
 
 
601
  pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
602
 
603
  elif file_extension in ['.csv']:
604
+ if '_review_file' in file_path_without_ext:
605
+ #print("file_path:", file_path)
606
+ review_file_csv = read_file(file_path)
607
+ all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
608
+ json_from_csv = True
609
+ print("Converted CSV review file to image annotation object")
610
+ elif '_ocr_output' in file_path_without_ext:
611
+ all_line_level_ocr_results_df = read_file(file_path)
612
+ json_from_csv = False
613
+
614
+ # NEW IF STATEMENT
615
  # If the file name ends with redactions.json, assume it is an annotations object, overwrite the current variable
616
  if (file_extension in ['.json']) | (json_from_csv == True):
617
 
 
630
 
631
  # Use shutil to copy the file directly
632
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
633
+ textract_output_found = True
 
 
634
  continue
635
 
636
+ # NEW IF STATEMENT
637
  # If you have an annotations object from the above code
638
  if all_annotations_object:
639
 
 
675
  print("Page", annotation_page_number, "image file not found.")
676
 
677
  all_annotations_object[i] = annotation
 
678
 
679
  if isinstance(in_fully_redacted_list, list):
680
  in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
 
722
  else:
723
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
724
 
725
+ elif file_extension in ['.csv'] and "ocr_output" in file_path:
726
+ continue
727
+
728
  # Must be something else, return with error message
729
  else:
730
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 
752
 
753
  number_of_pages = len(image_file_paths)
754
 
755
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
756
 
757
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
758
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
1204
 
1205
  # Check if the DataFrame is empty or lacks necessary columns
1206
  if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1207
+ #print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1208
  return list(image_dict.values()) # Return based on page_sizes only
1209
 
1210
  # 2. Define columns to extract for boxes and check availability
tools/file_redaction.py CHANGED
@@ -100,6 +100,7 @@ def choose_and_run_redactor(file_paths:List[str],
100
  review_file_path:str="",
101
  input_folder:str=INPUT_FOLDER,
102
  textract_query_number:int=0,
 
103
  prepare_images:bool=True,
104
  progress=gr.Progress(track_tqdm=True)):
105
  '''
@@ -148,6 +149,7 @@ def choose_and_run_redactor(file_paths:List[str],
148
  - review_file_path (str, optional): The latest review file path created by the app
149
  - input_folder (str, optional): The custom input path, if provided
150
  - textract_query_number (int, optional): The number of textract queries up until this point.
 
151
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
152
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
153
 
@@ -211,9 +213,9 @@ def choose_and_run_redactor(file_paths:List[str],
211
  print("Completed last file")
212
  current_loop_page = 0
213
 
214
- if isinstance(out_message, list):
215
  combined_out_message = combined_out_message + '\n'.join(out_message)
216
- else:
217
  combined_out_message = combined_out_message + '\n' + out_message
218
 
219
  # Only send across review file if redaction has been done
@@ -226,7 +228,7 @@ def choose_and_run_redactor(file_paths:List[str],
226
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
227
  print("Estimated total processing time:", str(estimate_total_processing_time))
228
 
229
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
230
 
231
  #if first_loop_state == False:
232
  # Prepare documents and images as required if they don't already exist
@@ -257,7 +259,7 @@ def choose_and_run_redactor(file_paths:List[str],
257
  # Call prepare_image_or_pdf only if needed
258
  if prepare_images_flag is not None:# and first_loop_state==True:
259
  #print("Calling preparation function. prepare_images_flag:", prepare_images_flag)
260
- out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state = prepare_image_or_pdf(
261
  file_paths_loop, text_extraction_method, 0, out_message, True,
262
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
263
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
@@ -279,7 +281,8 @@ def choose_and_run_redactor(file_paths:List[str],
279
 
280
  # Set to a very high number so as not to mix up with subsequent file processing by the user
281
  current_loop_page = 999
282
- combined_out_message = combined_out_message + "\n" + out_message
 
283
 
284
  # Only send across review file if redaction has been done
285
  if pii_identification_method != no_redaction_option:
@@ -288,7 +291,7 @@ def choose_and_run_redactor(file_paths:List[str],
288
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
289
  if review_file_path: review_out_file_paths.append(review_file_path)
290
 
291
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
292
 
293
  # Load/create allow list
294
  # If string, assume file path
@@ -513,14 +516,14 @@ def choose_and_run_redactor(file_paths:List[str],
513
  all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
514
  else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
515
 
516
- all_text_output_file_name = orig_pdf_file_path + "_ocr_output.csv"
517
 
518
  all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
519
 
520
- all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
521
- out_file_paths.append(all_text_output_file_name)
522
 
523
- duplication_file_path_outputs.append(all_text_output_file_name)
524
 
525
  # Convert the gradio annotation boxes to relative coordinates
526
  # Convert annotations_all_pages to a consistent relative coordinate format output
@@ -543,9 +546,10 @@ def choose_and_run_redactor(file_paths:List[str],
543
  out_file_paths.append(review_file_path)
544
 
545
  # Make a combined message for the file
546
- if isinstance(out_message, list):
547
  combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
548
- else: combined_out_message = combined_out_message + '\n' + out_message
 
549
 
550
  toc = time.perf_counter()
551
  time_taken = toc - tic
@@ -588,7 +592,7 @@ def choose_and_run_redactor(file_paths:List[str],
588
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
589
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
590
 
591
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
592
 
593
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
594
  '''
 
100
  review_file_path:str="",
101
  input_folder:str=INPUT_FOLDER,
102
  textract_query_number:int=0,
103
+ ocr_file_path:str="",
104
  prepare_images:bool=True,
105
  progress=gr.Progress(track_tqdm=True)):
106
  '''
 
149
  - review_file_path (str, optional): The latest review file path created by the app
150
  - input_folder (str, optional): The custom input path, if provided
151
  - textract_query_number (int, optional): The number of textract queries up until this point.
152
+ - ocr_file_path (str, optional): The latest ocr file path created by the app
153
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
154
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
155
 
 
213
  print("Completed last file")
214
  current_loop_page = 0
215
 
216
+ if isinstance(out_message, list) and out_message:
217
  combined_out_message = combined_out_message + '\n'.join(out_message)
218
+ elif out_message:
219
  combined_out_message = combined_out_message + '\n' + out_message
220
 
221
  # Only send across review file if redaction has been done
 
228
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
229
  print("Estimated total processing time:", str(estimate_total_processing_time))
230
 
231
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
232
 
233
  #if first_loop_state == False:
234
  # Prepare documents and images as required if they don't already exist
 
259
  # Call prepare_image_or_pdf only if needed
260
  if prepare_images_flag is not None:# and first_loop_state==True:
261
  #print("Calling preparation function. prepare_images_flag:", prepare_images_flag)
262
+ out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df = prepare_image_or_pdf(
263
  file_paths_loop, text_extraction_method, 0, out_message, True,
264
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
265
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
 
281
 
282
  # Set to a very high number so as not to mix up with subsequent file processing by the user
283
  current_loop_page = 999
284
+ if out_message:
285
+ combined_out_message = combined_out_message + "\n" + out_message
286
 
287
  # Only send across review file if redaction has been done
288
  if pii_identification_method != no_redaction_option:
 
291
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
292
  if review_file_path: review_out_file_paths.append(review_file_path)
293
 
294
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
295
 
296
  # Load/create allow list
297
  # If string, assume file path
 
516
  all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
517
  else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
518
 
519
+ ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
520
 
521
  all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
522
 
523
+ all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
524
+ out_file_paths.append(ocr_file_path)
525
 
526
+ duplication_file_path_outputs.append(ocr_file_path)
527
 
528
  # Convert the gradio annotation boxes to relative coordinates
529
  # Convert annotations_all_pages to a consistent relative coordinate format output
 
546
  out_file_paths.append(review_file_path)
547
 
548
  # Make a combined message for the file
549
+ if isinstance(out_message, list) and out_message:
550
  combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
551
+ elif out_message:
552
+ combined_out_message = combined_out_message + '\n' + out_message
553
 
554
  toc = time.perf_counter()
555
  time_taken = toc - tic
 
592
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
593
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
594
 
595
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
596
 
597
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
598
  '''
tools/helper_functions.py CHANGED
@@ -53,7 +53,7 @@ def load_in_default_cost_codes(cost_codes_path:str):
53
 
54
  out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
 
56
- return cost_codes_df, out_dropdown
57
 
58
  def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59
  if enforce_cost_code_textbox == "True":
@@ -485,4 +485,10 @@ def calculate_time_taken(number_of_pages:str,
485
  calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
486
 
487
  return calculated_time_taken
 
 
 
 
 
 
488
 
 
53
 
54
  out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
 
56
+ return cost_codes_df, cost_codes_df, out_dropdown
57
 
58
  def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59
  if enforce_cost_code_textbox == "True":
 
485
  calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
486
 
487
  return calculated_time_taken
488
+
489
def reset_base_dataframe(df:pd.DataFrame):
    '''
    Return the supplied dataframe unchanged.

    Parameters:
    - df (pd.DataFrame): The dataframe to pass through.

    Returns:
    - pd.DataFrame: The same object that was passed in (no copy is made).
    '''
    return df

def reset_ocr_base_dataframe(df:pd.DataFrame):
    '''
    Return at most the first two columns of the supplied dataframe.

    Parameters:
    - df (pd.DataFrame): The dataframe to trim. Presumably the line-level OCR results, whose first two columns would be "page" and "text" - confirm against the caller.

    Returns:
    - pd.DataFrame: All rows of df, restricted to its first two columns. If df has fewer than two columns (e.g. an empty placeholder dataframe), whatever columns exist are returned.
    '''
    # A positional slice is used rather than the explicit list [0, 1]: list indexers raise
    # IndexError when a position is out of bounds, whereas slices are simply truncated.
    return df.iloc[:, :2]
494
 
tools/redaction_review.py CHANGED
@@ -114,7 +114,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
114
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
115
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
116
 
117
- recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
118
 
119
  recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
120
 
@@ -151,7 +151,7 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
151
 
152
  review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
153
 
154
- recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
155
 
156
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
157
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -180,10 +180,6 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
180
  out_image_annotations_state = current_image_annotations_state
181
  out_current_page_annotator = current_page_annotator
182
 
183
- print("page_sizes:", page_sizes)
184
-
185
- review_df.to_csv(OUTPUT_FOLDER + "review_df_in_update_annotator.csv")
186
-
187
  if not review_df.empty:
188
 
189
  out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
@@ -195,9 +191,6 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
195
 
196
  return out_current_page_annotator, out_image_annotations_state
197
 
198
-
199
-
200
-
201
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
202
  selected_rows_df: pd.DataFrame,
203
  image_file_paths:List[str],
@@ -241,7 +234,7 @@ def update_annotator_object_and_filter_df(
241
  recogniser_entities_dropdown_value:str="ALL",
242
  page_dropdown_value:str="ALL",
243
  text_dropdown_value:str="ALL",
244
- recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True),
245
  zoom:int=100,
246
  review_df:pd.DataFrame=[],
247
  page_sizes:List[dict]=[],
@@ -584,6 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
584
  output_files.append(orig_pdf_file_path)
585
 
586
  try:
 
587
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
588
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
589
 
@@ -765,12 +759,21 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
765
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
766
 
767
  row_value_code = evt.row_value[0] # This is the value for cost code
768
- row_value_label = evt.row_value[1] # This is the label number value
769
 
770
  #row_value_df = pd.DataFrame(data={"page":[row_value_code], "label":[row_value_label]})
771
 
772
  return row_value_code
773
 
 
 
 
 
 
 
 
 
 
774
  def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
775
  '''
776
  Update the colour of a single redaction box based on the values in a selection row
@@ -889,12 +892,12 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
889
  annots = SubElement(xfdf, 'annots')
890
 
891
  # Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
892
- if page_sizes:
 
893
  page_sizes_df = pd.DataFrame(page_sizes)
894
 
895
  # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
896
- #if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
897
- print("Using pymupdf coordinates for conversion.")
898
 
899
  pages_are_images = False
900
 
 
114
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
115
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
116
 
117
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
118
 
119
  recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
120
 
 
151
 
152
  review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
153
 
154
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
155
 
156
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
157
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
 
180
  out_image_annotations_state = current_image_annotations_state
181
  out_current_page_annotator = current_page_annotator
182
 
 
 
 
 
183
  if not review_df.empty:
184
 
185
  out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
 
191
 
192
  return out_current_page_annotator, out_image_annotations_state
193
 
 
 
 
194
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
195
  selected_rows_df: pd.DataFrame,
196
  image_file_paths:List[str],
 
234
  recogniser_entities_dropdown_value:str="ALL",
235
  page_dropdown_value:str="ALL",
236
  text_dropdown_value:str="ALL",
237
+ recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400),
238
  zoom:int=100,
239
  review_df:pd.DataFrame=[],
240
  page_sizes:List[dict]=[],
 
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
+ print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
 
759
def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Return the cost code taken from the row selected in a dataframe component.

    Parameters:
    - df (pd.DataFrame): The dataframe attached to the component. It is accepted but not read here.
    - evt (gr.SelectData): The selection event; the first entry of evt.row_value is treated as the cost code.

    Returns:
    - The first value of the selected row.
    '''
    # Only the first cell (the cost code) is needed; the remaining cells of the row are ignored.
    return evt.row_value[0]
767
 
768
def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Return the page value of the selected row, plus a one-row dataframe describing the selection.

    Parameters:
    - df (pd.DataFrame): The dataframe attached to the component. It is accepted but not read here.
    - evt (gr.SelectData): The selection event; evt.row_value[0] is treated as the page number and evt.row_value[1] as the text contents.

    Returns:
    - tuple: (page value, pd.DataFrame with a single row and the columns "page" and "text").
    '''
    selected_page = evt.row_value[0]
    selected_text = evt.row_value[1]

    # Package the selection as a single-row dataframe alongside the bare page value.
    selection_df = pd.DataFrame(data={"page": [selected_page], "text": [selected_text]})

    return selected_page, selection_df
776
+
777
  def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
778
  '''
779
  Update the colour of a single redaction box based on the values in a selection row
 
892
  annots = SubElement(xfdf, 'annots')
893
 
894
  # Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
895
+ if page_sizes:
896
+
897
  page_sizes_df = pd.DataFrame(page_sizes)
898
 
899
  # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
900
+ #print("Using pymupdf coordinates for conversion.")
 
901
 
902
  pages_are_images = False
903