seanpedrickcase committed on
Commit
dacc782
·
1 Parent(s): f13e98b

Allowed output files to be saved into user-specific folders. Added deny list capability to xlsx/csv file redaction

Browse files
Dockerfile CHANGED
@@ -63,7 +63,7 @@ RUN mkdir -p /home/user/app/output \
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
  # Download NLTK data packages
66
- RUN python -m nltk.downloader punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
 
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
  # Download NLTK data packages
66
+ RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
app.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
@@ -70,37 +70,37 @@ with app:
70
  pdf_doc_state = gr.State([])
71
  all_image_annotations_state = gr.State([])
72
 
73
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
74
- all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
75
- review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
76
 
77
- session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
78
- s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
 
79
 
80
- first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
81
- second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
82
- do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
83
 
84
- prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
85
- images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
86
 
87
- output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
88
- output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
89
- text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
90
- log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
91
 
92
 
93
  # Logging state
94
  log_file_name = 'log.csv'
95
 
96
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
97
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
98
- access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
99
- access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
100
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
101
- usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
102
-
103
- # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
104
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
105
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
106
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
@@ -122,10 +122,10 @@ with app:
122
 
123
  ## Annotator zoom value
124
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
- zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
126
- zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
127
 
128
- clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
129
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
130
 
131
  ## Settings page variables
@@ -352,8 +352,12 @@ with app:
352
  log_files_output = gr.File(label="Log file output", interactive=False)
353
 
354
  with gr.Accordion("Combine multiple review files", open = False):
355
- multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
356
- merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
 
 
 
 
357
 
358
 
359
  ### UI INTERACTION ###
@@ -364,12 +368,12 @@ with app:
364
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
365
 
366
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
367
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
368
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
369
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
370
 
371
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
372
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
373
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
374
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
375
 
@@ -391,17 +395,17 @@ with app:
391
  annotate_current_page.submit(
392
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
393
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
394
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
395
 
396
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
397
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
398
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
399
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
400
 
401
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
402
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
403
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
404
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
405
 
406
  # Zoom in and out on annotator
407
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -415,13 +419,13 @@ with app:
415
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
416
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
417
 
418
- annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
419
 
420
  # Page controls at bottom
421
  annotate_current_page_bottom.submit(
422
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
423
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
424
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
425
 
426
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
427
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -431,7 +435,7 @@ with app:
431
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
432
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
433
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
434
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
435
 
436
  # Review table controls
437
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
@@ -439,28 +443,28 @@ with app:
439
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
440
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
441
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
442
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
443
 
444
  # Convert review file to xfdf Adobe format
445
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
446
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
447
- then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
448
 
449
  # Convert xfdf Adobe file back to review_file.csv
450
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
451
  then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
452
- then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
453
 
454
  ###
455
  # TABULAR DATA REDACTION
456
- ###
457
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
458
  then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
459
 
460
- tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
461
 
462
  # If the output file count text box changes, keep going with redacting each data file until done
463
- text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
464
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
465
 
466
  ###
@@ -479,6 +483,10 @@ with app:
479
 
480
  # Merge multiple review csv files together
481
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
 
 
 
482
 
483
 
484
  ###
@@ -486,7 +494,7 @@ with app:
486
  ###
487
 
488
  # Get connection details on app load
489
- app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
490
 
491
  # If running on AWS, load in the default allow list file from S3
492
  # if RUN_AWS_FUNCTIONS == "1":
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 
70
  pdf_doc_state = gr.State([])
71
  all_image_annotations_state = gr.State([])
72
 
73
+ all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas")
74
+ all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas")
75
+ review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas")
76
 
77
+ session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
78
+ s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
79
+ output_folder_textbox = gr.Textbox(value = output_folder, label="output_folder_textbox", visible=False)
80
 
81
+ first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
82
+ second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
83
+ do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
84
 
85
+ prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
86
+ images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
87
 
88
+ output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
89
+ output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
90
+ text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
91
+ log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
92
 
93
 
94
  # Logging state
95
  log_file_name = 'log.csv'
96
 
97
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False)
98
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False)
99
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False)
100
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False)
101
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False)
102
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False)
103
+
 
104
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
105
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
106
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
 
122
 
123
  ## Annotator zoom value
124
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
+ zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
126
+ zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
127
 
128
+ clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
129
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
130
 
131
  ## Settings page variables
 
352
  log_files_output = gr.File(label="Log file output", interactive=False)
353
 
354
  with gr.Accordion("Combine multiple review files", open = False):
355
+ multiple_review_files_in_out = gr.File(label="Combine multiple review_file.csv files together here.", file_count='multiple', file_types=['.csv'])
356
+ merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
357
+
358
+ with gr.Accordion("View all output files from this session", open = False):
359
+ all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
360
+ all_output_files = gr.File(label="All output files.", file_count='multiple', file_types=['.csv'], interactive=False)
361
 
362
 
363
  ### UI INTERACTION ###
 
368
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
369
 
370
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
371
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
372
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
373
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
374
 
375
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
376
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
377
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
378
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
379
 
 
395
  annotate_current_page.submit(
396
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
397
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
398
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
399
 
400
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
401
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
402
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
403
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
404
 
405
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
406
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
407
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
408
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
409
 
410
  # Zoom in and out on annotator
411
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
419
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
420
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
421
 
422
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
423
 
424
  # Page controls at bottom
425
  annotate_current_page_bottom.submit(
426
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
427
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
428
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
429
 
430
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
431
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
435
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
436
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
437
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
438
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
439
 
440
  # Review table controls
441
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
443
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
444
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
445
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
446
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
447
 
448
  # Convert review file to xfdf Adobe format
449
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
450
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
451
+ then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[adobe_review_files_out])
452
 
453
  # Convert xfdf Adobe file back to review_file.csv
454
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
455
  then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
456
+ then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
457
 
458
  ###
459
  # TABULAR DATA REDACTION
460
+ ###
461
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
462
  then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
463
 
464
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
465
 
466
  # If the output file count text box changes, keep going with redacting each data file until done
467
+ text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
468
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
469
 
470
  ###
 
483
 
484
  # Merge multiple review csv files together
485
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
486
+
487
+
488
+ # List every file currently in the session's output folder when the user requests it
489
+ all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
490
 
491
 
492
  ###
 
494
  ###
495
 
496
  # Get connection details on app load
497
+ app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
498
 
499
  # If running on AWS, load in the default allow list file from S3
500
  # if RUN_AWS_FUNCTIONS == "1":
tools/data_anonymise.py CHANGED
@@ -13,7 +13,7 @@ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
  from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
19
  from tools.presidio_analyzer_custom import analyze_dict
@@ -108,9 +108,6 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
108
 
109
  decision_process_output_str = '\n'.join(decision_process_output)
110
 
111
- print("decision_process_output_str:\n\n", decision_process_output_str)
112
-
113
-
114
  return decision_process_output_str
115
 
116
  def anon_consistent_names(df):
@@ -205,7 +202,7 @@ def anon_consistent_names(df):
205
 
206
  return scrubbed_df_consistent_names
207
 
208
- def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
209
 
210
  print("Identifying personal information")
211
  analyse_tic = time.perf_counter()
@@ -220,6 +217,21 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
220
  else:
221
  in_allow_list_flat = []
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  #analyzer = nlp_analyser #AnalyzerEngine()
224
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
225
 
@@ -242,8 +254,6 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
242
  # Usage in the main function:
243
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
244
 
245
- #print("decision_process_output_str:\n\n", decision_process_output_str)
246
-
247
  analyse_toc = time.perf_counter()
248
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
249
  print(analyse_time_out)
@@ -287,8 +297,46 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
287
 
288
  return scrubbed_df, key_string, decision_process_output_str
289
 
290
- def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  def check_lists(list1, list2):
293
  return any(string in list2 for string in list1)
294
 
@@ -327,7 +375,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
327
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
328
 
329
  # Anonymise the selected columns
330
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
331
 
332
  # Rejoin the dataframe together
333
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -374,7 +422,28 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
374
 
375
  return out_file_paths, out_message, key_string, log_files_output_paths
376
 
377
- def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
  tic = time.perf_counter()
380
 
@@ -389,7 +458,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
389
  if isinstance(out_message, str):
390
  out_message = [out_message]
391
 
392
- print("log_files_output_paths:",log_files_output_paths)
393
 
394
  if isinstance(log_files_output_paths, str):
395
  log_files_output_paths = []
@@ -433,7 +502,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
433
  file_type = ""
434
  out_file_part = anon_file
435
 
436
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
437
  else:
438
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
439
  file_type = detect_file_type(anon_file)
@@ -472,14 +541,14 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
472
  print(anon_df.head()) # Print the first few rows
473
 
474
 
475
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
476
 
477
  else:
478
  sheet_name = ""
479
  anon_df = read_file(anon_file)
480
  out_file_part = get_file_name_without_type(anon_file.name)
481
 
482
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
483
 
484
  # Increase latest file completed count unless we are at the last file
485
  if latest_file_completed != len(file_paths):
 
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
  from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
17
 
18
  # Use custom version of analyze_dict to be able to track progress
19
  from tools.presidio_analyzer_custom import analyze_dict
 
108
 
109
  decision_process_output_str = '\n'.join(decision_process_output)
110
 
 
 
 
111
  return decision_process_output_str
112
 
113
  def anon_consistent_names(df):
 
202
 
203
  return scrubbed_df_consistent_names
204
 
205
+ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], progress=Progress(track_tqdm=False)):
206
 
207
  print("Identifying personal information")
208
  analyse_tic = time.perf_counter()
 
217
  else:
218
  in_allow_list_flat = []
219
 
220
+ if isinstance(in_deny_list, pd.DataFrame):
221
+ if not in_deny_list.empty:
222
+ in_deny_list = in_deny_list.iloc[:, 0].tolist()
223
+ else:
224
+ # Handle the case where the DataFrame is empty
225
+ in_deny_list = [] # or some default value
226
+
227
+ # Sort the strings in order from the longest string to the shortest
228
+ in_deny_list = sorted(in_deny_list, key=len, reverse=True)
229
+
230
+ if in_deny_list:
231
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
232
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
233
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
234
+
235
  #analyzer = nlp_analyser #AnalyzerEngine()
236
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
237
 
 
254
  # Usage in the main function:
255
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
256
 
 
 
257
  analyse_toc = time.perf_counter()
258
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
259
  print(analyse_time_out)
 
297
 
298
  return scrubbed_df, key_string, decision_process_output_str
299
 
 
300
 
301
+ def anon_wrapper_func(
302
+ anon_file: str,
303
+ anon_df: pd.DataFrame,
304
+ chosen_cols: List[str],
305
+ out_file_paths: List[str],
306
+ out_file_part: str,
307
+ out_message: str,
308
+ excel_sheet_name: str,
309
+ anon_strat: str,
310
+ language: str,
311
+ chosen_redact_entities: List[str],
312
+ in_allow_list: List[str],
313
+ file_type: str,
314
+ anon_xlsx_export_file_name: str,
315
+ log_files_output_paths: List[str],
316
+ in_deny_list: List[str]=[],
317
+ output_folder: str = output_folder
318
+ ):
319
+ """
320
+ This function wraps the anonymization process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymization strategy, and exports the anonymized data to a file.
321
+
322
+ Input Variables:
323
+ - anon_file: The path to the file containing the data to be anonymized.
324
+ - anon_df: The pandas DataFrame containing the data to be anonymized.
325
+ - chosen_cols: A list of column names to be anonymized.
326
+ - out_file_paths: A list of paths where the anonymized files will be saved.
327
+ - out_file_part: A part of the output file name.
328
+ - out_message: A message to be displayed during the anonymization process.
329
+ - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
330
+ - anon_strat: The anonymization strategy to be applied.
331
+ - language: The language of the data to be anonymized.
332
+ - chosen_redact_entities: A list of entities to be redacted.
333
+ - in_allow_list: A list of allowed values.
334
+ - file_type: The type of file to be exported.
335
+ - anon_xlsx_export_file_name: The name of the anonymized Excel file.
336
+ - log_files_output_paths: A list of paths where the log files will be saved.
337
+ - in_deny_list: List of specific terms to remove from the data.
338
+ - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
339
+ """
340
  def check_lists(list1, list2):
341
  return any(string in list2 for string in list1)
342
 
 
375
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
376
 
377
  # Anonymise the selected columns
378
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
379
 
380
  # Rejoin the dataframe together
381
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
 
422
 
423
  return out_file_paths, out_message, key_string, log_files_output_paths
424
 
425
+ def anonymise_data_files(file_paths: List[str], in_text: str, anon_strat: str, chosen_cols: List[str], language: str, chosen_redact_entities: List[str], in_allow_list: List[str] = None, latest_file_completed: int = 0, out_message: list = [], out_file_paths: list = [], log_files_output_paths: list = [], in_excel_sheets: list = [], first_loop_state: bool = False, output_folder: str = output_folder, in_deny_list:list[str]=[], progress: Progress = Progress(track_tqdm=True)):
426
+ """
427
+ This function anonymises data files based on the provided parameters.
428
+
429
+ Parameters:
430
+ - file_paths (List[str]): A list of file paths to anonymise.
431
+ - in_text (str): The text to anonymise if file_paths is 'open_text'.
432
+ - anon_strat (str): The anonymisation strategy to use.
433
+ - chosen_cols (List[str]): A list of column names to anonymise.
434
+ - language (str): The language of the text to anonymise.
435
+ - chosen_redact_entities (List[str]): A list of entities to redact.
436
+ - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
437
+ - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
438
+ - out_message (list, optional): A list to store output messages. Defaults to an empty list.
439
+ - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
440
+ - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
441
+ - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
442
+ - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
443
+ - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
444
+ - in_deny_list (list[str], optional): A list of specific terms to redact.
445
+ - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
446
+ """
447
 
448
  tic = time.perf_counter()
449
 
 
458
  if isinstance(out_message, str):
459
  out_message = [out_message]
460
 
461
+ #print("log_files_output_paths:",log_files_output_paths)
462
 
463
  if isinstance(log_files_output_paths, str):
464
  log_files_output_paths = []
 
502
  file_type = ""
503
  out_file_part = anon_file
504
 
505
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
506
  else:
507
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
508
  file_type = detect_file_type(anon_file)
 
541
  print(anon_df.head()) # Print the first few rows
542
 
543
 
544
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, output_folder=output_folder)
545
 
546
  else:
547
  sheet_name = ""
548
  anon_df = read_file(anon_file)
549
  out_file_part = get_file_name_without_type(anon_file.name)
550
 
551
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
552
 
553
  # Increase latest file completed count unless we are at the last file
554
  if latest_file_completed != len(file_paths):
tools/file_redaction.py CHANGED
@@ -375,7 +375,8 @@ def choose_and_run_redactor(file_paths:List[str],
375
  redact_whole_page_list,
376
  max_fuzzy_spelling_mistakes_num,
377
  match_fuzzy_whole_phrase_bool,
378
- log_files_output_paths=log_files_output_paths)
 
379
 
380
  # Save Textract request metadata (if exists)
381
  if new_request_metadata:
@@ -443,15 +444,15 @@ def choose_and_run_redactor(file_paths:List[str],
443
 
444
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
445
 
446
- logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
447
- all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
448
- log_files_output_paths.append(logs_output_file_name)
449
 
450
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
451
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
452
  out_file_paths.append(all_text_output_file_name)
453
 
454
- # Save the gradio_annotation_boxes to a JSON file
455
  try:
456
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
457
 
@@ -461,15 +462,15 @@ def choose_and_run_redactor(file_paths:List[str],
461
 
462
  #print("Saved review file to csv")
463
 
464
- out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
465
- with open(out_annotation_file_path, 'w') as f:
466
- json.dump(annotations_all_pages, f)
467
- log_files_output_paths.append(out_annotation_file_path)
468
 
469
  #print("Saving annotations to JSON")
470
 
471
  except Exception as e:
472
- print("Could not save annotations to json or csv file:", e)
473
 
474
  # Make a combined message for the file
475
  if isinstance(out_message, list):
@@ -942,7 +943,8 @@ def redact_image_pdf(file_path:str,
942
  match_fuzzy_whole_phrase_bool:bool=True,
943
  page_break_val:int=int(page_break_value),
944
  log_files_output_paths:List=[],
945
- max_time:int=int(max_time_value),
 
946
  progress=Progress(track_tqdm=True)):
947
 
948
  '''
@@ -976,7 +978,8 @@ def redact_image_pdf(file_path:str,
976
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
977
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
978
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
979
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
980
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
981
 
982
  The function returns a redacted PDF document along with processing output objects.
 
375
  redact_whole_page_list,
376
  max_fuzzy_spelling_mistakes_num,
377
  match_fuzzy_whole_phrase_bool,
378
+ log_files_output_paths=log_files_output_paths,
379
+ output_folder=output_folder)
380
 
381
  # Save Textract request metadata (if exists)
382
  if new_request_metadata:
 
444
 
445
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
446
 
447
+ #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
448
+ #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
449
+ #log_files_output_paths.append(logs_output_file_name)
450
 
451
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
452
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
453
  out_file_paths.append(all_text_output_file_name)
454
 
455
+ # Save the gradio_annotation_boxes to a review csv file
456
  try:
457
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
458
 
 
462
 
463
  #print("Saved review file to csv")
464
 
465
+ # out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
466
+ # with open(out_annotation_file_path, 'w') as f:
467
+ # json.dump(annotations_all_pages, f)
468
+ # log_files_output_paths.append(out_annotation_file_path)
469
 
470
  #print("Saving annotations to JSON")
471
 
472
  except Exception as e:
473
+ print("Could not save annotations to csv file:", e)
474
 
475
  # Make a combined message for the file
476
  if isinstance(out_message, list):
 
943
  match_fuzzy_whole_phrase_bool:bool=True,
944
  page_break_val:int=int(page_break_value),
945
  log_files_output_paths:List=[],
946
+ max_time:int=int(max_time_value),
947
+ output_folder:str=output_folder,
948
  progress=Progress(track_tqdm=True)):
949
 
950
  '''
 
978
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
979
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
980
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
981
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
982
+ - output_folder (str, optional): The folder for file outputs.
983
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
984
 
985
  The function returns a redacted PDF document along with processing output objects.
tools/helper_functions.py CHANGED
@@ -34,6 +34,9 @@ aws_pii_detector = "AWS Comprehend"
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
 
 
 
37
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
 
@@ -62,8 +65,6 @@ def reset_state_vars():
62
  def reset_review_vars():
63
  return [], pd.DataFrame(), pd.DataFrame()
64
 
65
-
66
-
67
  def load_in_default_allow_list(allow_list_file_path):
68
  if isinstance(allow_list_file_path, str):
69
  allow_list_file_path = [allow_list_file_path]
@@ -269,8 +270,7 @@ def merge_csv_files(file_list):
269
 
270
 
271
 
272
- async def get_connection_params(request: gr.Request):
273
- base_folder = ""
274
 
275
  #print("request user:", request.username)
276
 
@@ -304,17 +304,14 @@ async def get_connection_params(request: gr.Request):
304
 
305
  if request.username:
306
  out_session_hash = request.username
307
- base_folder = "user-files/"
308
  print("Request username found:", out_session_hash)
309
 
310
  elif 'x-cognito-id' in request.headers:
311
  out_session_hash = request.headers['x-cognito-id']
312
- base_folder = "user-files/"
313
  print("Cognito ID found:", out_session_hash)
314
 
315
  elif 'x-amzn-oidc-identity' in request.headers:
316
  out_session_hash = request.headers['x-amzn-oidc-identity']
317
- base_folder = "user-files/"
318
 
319
  # Fetch email address using Cognito client
320
  cognito_client = boto3.client('cognito-idp')
@@ -331,20 +328,23 @@ async def get_connection_params(request: gr.Request):
331
  print("Error fetching user details:", e)
332
  email = None
333
 
334
-
335
  print("Cognito ID found:", out_session_hash)
336
 
337
  else:
338
  out_session_hash = request.session_hash
339
- base_folder = "temp-files/"
340
- # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
341
 
342
- output_folder = base_folder + out_session_hash + "/"
 
 
 
 
 
 
 
343
  #if bucket_name:
344
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
345
 
346
  return out_session_hash, output_folder, out_session_hash
347
-
348
 
349
  def clean_unicode_text(text):
350
  # Step 1: Normalize unicode characters to decompose any special forms
@@ -365,4 +365,20 @@ def clean_unicode_text(text):
365
  # Comment this line if you want to keep all Unicode characters.
366
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
367
 
368
- return cleaned_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
37
+ session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True')
38
+ print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
39
+
40
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
41
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
42
 
 
65
  def reset_review_vars():
66
  return [], pd.DataFrame(), pd.DataFrame()
67
 
 
 
68
  def load_in_default_allow_list(allow_list_file_path):
69
  if isinstance(allow_list_file_path, str):
70
  allow_list_file_path = [allow_list_file_path]
 
270
 
271
 
272
 
273
+ async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
 
274
 
275
  #print("request user:", request.username)
276
 
 
304
 
305
  if request.username:
306
  out_session_hash = request.username
 
307
  print("Request username found:", out_session_hash)
308
 
309
  elif 'x-cognito-id' in request.headers:
310
  out_session_hash = request.headers['x-cognito-id']
 
311
  print("Cognito ID found:", out_session_hash)
312
 
313
  elif 'x-amzn-oidc-identity' in request.headers:
314
  out_session_hash = request.headers['x-amzn-oidc-identity']
 
315
 
316
  # Fetch email address using Cognito client
317
  cognito_client = boto3.client('cognito-idp')
 
328
  print("Error fetching user details:", e)
329
  email = None
330
 
 
331
  print("Cognito ID found:", out_session_hash)
332
 
333
  else:
334
  out_session_hash = request.session_hash
 
 
335
 
336
+ if session_output_folder == 'True':
337
+ output_folder = output_folder_textbox + out_session_hash + "/"
338
+ else:
339
+ output_folder = output_folder_textbox
340
+
341
+ if not os.path.exists(output_folder):
342
+ os.mkdir(output_folder)
343
+
344
  #if bucket_name:
345
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
346
 
347
  return out_session_hash, output_folder, out_session_hash
 
348
 
349
  def clean_unicode_text(text):
350
  # Step 1: Normalize unicode characters to decompose any special forms
 
365
  # Comment this line if you want to keep all Unicode characters.
366
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
367
 
368
+ return cleaned_text
369
+
370
+ def load_all_output_files(folder_path:str=output_folder) -> List[str]:
371
+ """Get the file paths of all files in the given folder."""
372
+ file_paths = []
373
+
374
+ # List all files in the specified folder
375
+ for filename in os.listdir(folder_path):
376
+ # Construct full file path
377
+ full_path = os.path.join(folder_path, filename)
378
+ # Check if it's a file (not a directory)
379
+ if os.path.isfile(full_path):
380
+ file_paths.append(full_path)
381
+
382
+ return file_paths
383
+
384
+
tools/redaction_review.py CHANGED
@@ -247,7 +247,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
247
 
248
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
249
 
250
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
251
  '''
252
  Apply modified redactions to a pymupdf and export review files
253
  '''
@@ -363,10 +363,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
363
  try:
364
  #print("Saving annotations to JSON")
365
 
366
- out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
367
- with open(out_annotation_file_path, 'w') as f:
368
- json.dump(all_image_annotations, f)
369
- output_log_files.append(out_annotation_file_path)
370
 
371
  #print("Saving annotations to CSV review file")
372
 
@@ -379,7 +379,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
379
  output_files.append(out_review_file_file_path)
380
 
381
  except Exception as e:
382
- print("Could not save annotations to json or csv file:", e)
383
 
384
  return doc, all_image_annotations, output_files, output_log_files
385
 
@@ -535,7 +535,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
535
 
536
  return xml_str
537
 
538
- def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
@@ -586,7 +586,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
586
 
587
  ### Convert xfdf coordinates back to image for app
588
 
589
- def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
590
  '''
591
  Converts coordinates from Adobe PDF space to image space.
592
 
@@ -660,7 +660,7 @@ def parse_xfdf(xfdf_path):
660
 
661
  return redactions
662
 
663
- def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
664
  '''
665
  Convert redaction annotations from XFDF and associated images into a DataFrame.
666
 
 
247
 
248
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
249
 
250
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, output_folder:str = output_folder, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
251
  '''
252
  Apply modified redactions to a pymupdf and export review files
253
  '''
 
363
  try:
364
  #print("Saving annotations to JSON")
365
 
366
+ # out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
367
+ # with open(out_annotation_file_path, 'w') as f:
368
+ # json.dump(all_image_annotations, f)
369
+ # output_log_files.append(out_annotation_file_path)
370
 
371
  #print("Saving annotations to CSV review file")
372
 
 
379
  output_files.append(out_review_file_file_path)
380
 
381
  except Exception as e:
382
+ print("Could not save annotations to csv file:", e)
383
 
384
  return doc, all_image_annotations, output_files, output_log_files
385
 
 
535
 
536
  return xml_str
537
 
538
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
 
586
 
587
  ### Convert xfdf coordinates back to image for app
588
 
589
+ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
590
  '''
591
  Converts coordinates from Adobe PDF space to image space.
592
 
 
660
 
661
  return redactions
662
 
663
+ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=output_folder):
664
  '''
665
  Convert redaction annotations from XFDF and associated images into a DataFrame.
666