Commit
·
dacc782
1
Parent(s):
f13e98b
Allowed output files to be saved into user-specific folders. Added deny-list capability to xlsx/csv file redaction
Browse files- Dockerfile +1 -1
- app.py +51 -43
- tools/data_anonymise.py +83 -14
- tools/file_redaction.py +15 -12
- tools/helper_functions.py +29 -13
- tools/redaction_review.py +9 -9
Dockerfile
CHANGED
@@ -63,7 +63,7 @@ RUN mkdir -p /home/user/app/output \
|
|
63 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
64 |
|
65 |
# Download NLTK data packages
|
66 |
-
RUN python -m nltk.downloader punkt stopwords punkt_tab
|
67 |
|
68 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
69 |
COPY entrypoint.sh /entrypoint.sh
|
|
|
63 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
64 |
|
65 |
# Download NLTK data packages
|
66 |
+
RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
|
67 |
|
68 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
69 |
COPY entrypoint.sh /entrypoint.sh
|
app.py
CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
@@ -70,37 +70,37 @@ with app:
|
|
70 |
pdf_doc_state = gr.State([])
|
71 |
all_image_annotations_state = gr.State([])
|
72 |
|
73 |
-
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas")
|
74 |
-
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas")
|
75 |
-
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas")
|
76 |
|
77 |
-
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
|
78 |
-
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
|
|
|
79 |
|
80 |
-
first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
|
81 |
-
second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
|
82 |
-
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
|
83 |
|
84 |
-
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
|
85 |
-
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
|
86 |
|
87 |
-
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
|
88 |
-
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
|
89 |
-
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
|
90 |
-
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
|
91 |
|
92 |
|
93 |
# Logging state
|
94 |
log_file_name = 'log.csv'
|
95 |
|
96 |
-
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False)
|
97 |
-
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False)
|
98 |
-
access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False)
|
99 |
-
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False)
|
100 |
-
usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False)
|
101 |
-
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False)
|
102 |
-
|
103 |
-
# Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
|
104 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
105 |
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
|
106 |
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
|
@@ -122,10 +122,10 @@ with app:
|
|
122 |
|
123 |
## Annotator zoom value
|
124 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
125 |
-
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
|
126 |
-
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
|
127 |
|
128 |
-
clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
|
129 |
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
130 |
|
131 |
## Settings page variables
|
@@ -352,8 +352,12 @@ with app:
|
|
352 |
log_files_output = gr.File(label="Log file output", interactive=False)
|
353 |
|
354 |
with gr.Accordion("Combine multiple review files", open = False):
|
355 |
-
multiple_review_files_in_out = gr.File(label="
|
356 |
-
merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
|
|
|
|
|
|
|
|
|
357 |
|
358 |
|
359 |
### UI INTERACTION ###
|
@@ -364,12 +368,12 @@ with app:
|
|
364 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
365 |
|
366 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
367 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
|
368 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
|
369 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
370 |
|
371 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
372 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
|
373 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
|
374 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
375 |
|
@@ -391,17 +395,17 @@ with app:
|
|
391 |
annotate_current_page.submit(
|
392 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
393 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
394 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
395 |
|
396 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
397 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
398 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
399 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
400 |
|
401 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
402 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
403 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
404 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
405 |
|
406 |
# Zoom in and out on annotator
|
407 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
@@ -415,13 +419,13 @@ with app:
|
|
415 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
416 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
417 |
|
418 |
-
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
419 |
|
420 |
# Page controls at bottom
|
421 |
annotate_current_page_bottom.submit(
|
422 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
423 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
424 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
425 |
|
426 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
427 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
@@ -431,7 +435,7 @@ with app:
|
|
431 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
432 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
433 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
434 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
435 |
|
436 |
# Review table controls
|
437 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
@@ -439,28 +443,28 @@ with app:
|
|
439 |
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
440 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
441 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
442 |
-
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
443 |
|
444 |
# Convert review file to xfdf Adobe format
|
445 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
446 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
447 |
-
then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
|
448 |
|
449 |
# Convert xfdf Adobe file back to review_file.csv
|
450 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
451 |
then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
452 |
-
then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
|
453 |
|
454 |
###
|
455 |
# TABULAR DATA REDACTION
|
456 |
-
###
|
457 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
458 |
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
459 |
|
460 |
-
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
461 |
|
462 |
# If the output file count text box changes, keep going with redacting each data file until done
|
463 |
-
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
464 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
465 |
|
466 |
###
|
@@ -479,6 +483,10 @@ with app:
|
|
479 |
|
480 |
# Merge multiple review csv files together
|
481 |
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
|
|
|
|
|
|
|
|
|
482 |
|
483 |
|
484 |
###
|
@@ -486,7 +494,7 @@ with app:
|
|
486 |
###
|
487 |
|
488 |
# Get connection details on app load
|
489 |
-
app.load(get_connection_params, inputs=
|
490 |
|
491 |
# If running on AWS, load in the default allow list file from S3
|
492 |
# if RUN_AWS_FUNCTIONS == "1":
|
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
|
|
70 |
pdf_doc_state = gr.State([])
|
71 |
all_image_annotations_state = gr.State([])
|
72 |
|
73 |
+
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas")
|
74 |
+
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas")
|
75 |
+
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas")
|
76 |
|
77 |
+
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
|
78 |
+
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
|
79 |
+
output_folder_textbox = gr.Textbox(value = output_folder, label="output_folder_textbox", visible=False)
|
80 |
|
81 |
+
first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
|
82 |
+
second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
|
83 |
+
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
|
84 |
|
85 |
+
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
|
86 |
+
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
|
87 |
|
88 |
+
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
|
89 |
+
output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
|
90 |
+
text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
|
91 |
+
log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
|
92 |
|
93 |
|
94 |
# Logging state
|
95 |
log_file_name = 'log.csv'
|
96 |
|
97 |
+
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False)
|
98 |
+
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False)
|
99 |
+
access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False)
|
100 |
+
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False)
|
101 |
+
usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False)
|
102 |
+
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False)
|
103 |
+
|
|
|
104 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
105 |
textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
|
106 |
comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
|
|
|
122 |
|
123 |
## Annotator zoom value
|
124 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
|
125 |
+
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
|
126 |
+
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
|
127 |
|
128 |
+
clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
|
129 |
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
130 |
|
131 |
## Settings page variables
|
|
|
352 |
log_files_output = gr.File(label="Log file output", interactive=False)
|
353 |
|
354 |
with gr.Accordion("Combine multiple review files", open = False):
|
355 |
+
multiple_review_files_in_out = gr.File(label="Combine multiple review_file.csv files together here.", file_count='multiple', file_types=['.csv'])
|
356 |
+
merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
|
357 |
+
|
358 |
+
with gr.Accordion("View all output files from this session", open = False):
|
359 |
+
all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
|
360 |
+
all_output_files = gr.File(label="All output files.", file_count='multiple', file_types=['.csv'], interactive=False)
|
361 |
|
362 |
|
363 |
### UI INTERACTION ###
|
|
|
368 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
369 |
|
370 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
371 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
372 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
|
373 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
374 |
|
375 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
376 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
377 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
|
378 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
379 |
|
|
|
395 |
annotate_current_page.submit(
|
396 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
397 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
398 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
399 |
|
400 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
401 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
402 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
403 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
404 |
|
405 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
406 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
407 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
408 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
409 |
|
410 |
# Zoom in and out on annotator
|
411 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
|
419 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
420 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
421 |
|
422 |
+
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
423 |
|
424 |
# Page controls at bottom
|
425 |
annotate_current_page_bottom.submit(
|
426 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
427 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
428 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
429 |
|
430 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
431 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
|
435 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
436 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
437 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
438 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
439 |
|
440 |
# Review table controls
|
441 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
|
|
443 |
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
444 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
445 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
446 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
447 |
|
448 |
# Convert review file to xfdf Adobe format
|
449 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
450 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
451 |
+
then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[adobe_review_files_out])
|
452 |
|
453 |
# Convert xfdf Adobe file back to review_file.csv
|
454 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
455 |
then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
456 |
+
then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
457 |
|
458 |
###
|
459 |
# TABULAR DATA REDACTION
|
460 |
+
###
|
461 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
462 |
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
463 |
|
464 |
+
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
465 |
|
466 |
# If the output file count text box changes, keep going with redacting each data file until done
|
467 |
+
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
468 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
469 |
|
470 |
###
|
|
|
483 |
|
484 |
# Merge multiple review csv files together
|
485 |
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
|
486 |
+
|
487 |
+
|
488 |
+
#
|
489 |
+
all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
|
490 |
|
491 |
|
492 |
###
|
|
|
494 |
###
|
495 |
|
496 |
# Get connection details on app load
|
497 |
+
app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
|
498 |
|
499 |
# If running on AWS, load in the default allow list file from S3
|
500 |
# if RUN_AWS_FUNCTIONS == "1":
|
tools/data_anonymise.py
CHANGED
@@ -13,7 +13,7 @@ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
|
16 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
19 |
from tools.presidio_analyzer_custom import analyze_dict
|
@@ -108,9 +108,6 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
|
|
108 |
|
109 |
decision_process_output_str = '\n'.join(decision_process_output)
|
110 |
|
111 |
-
print("decision_process_output_str:\n\n", decision_process_output_str)
|
112 |
-
|
113 |
-
|
114 |
return decision_process_output_str
|
115 |
|
116 |
def anon_consistent_names(df):
|
@@ -205,7 +202,7 @@ def anon_consistent_names(df):
|
|
205 |
|
206 |
return scrubbed_df_consistent_names
|
207 |
|
208 |
-
def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
|
209 |
|
210 |
print("Identifying personal information")
|
211 |
analyse_tic = time.perf_counter()
|
@@ -220,6 +217,21 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
220 |
else:
|
221 |
in_allow_list_flat = []
|
222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
#analyzer = nlp_analyser #AnalyzerEngine()
|
224 |
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
225 |
|
@@ -242,8 +254,6 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
242 |
# Usage in the main function:
|
243 |
decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
|
244 |
|
245 |
-
#print("decision_process_output_str:\n\n", decision_process_output_str)
|
246 |
-
|
247 |
analyse_toc = time.perf_counter()
|
248 |
analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
|
249 |
print(analyse_time_out)
|
@@ -287,8 +297,46 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
287 |
|
288 |
return scrubbed_df, key_string, decision_process_output_str
|
289 |
|
290 |
-
def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
|
291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
def check_lists(list1, list2):
|
293 |
return any(string in list2 for string in list1)
|
294 |
|
@@ -327,7 +375,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
327 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
328 |
|
329 |
# Anonymise the selected columns
|
330 |
-
anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
|
331 |
|
332 |
# Rejoin the dataframe together
|
333 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
@@ -374,7 +422,28 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
374 |
|
375 |
return out_file_paths, out_message, key_string, log_files_output_paths
|
376 |
|
377 |
-
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
|
379 |
tic = time.perf_counter()
|
380 |
|
@@ -389,7 +458,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
392 |
-
print("log_files_output_paths:",log_files_output_paths)
|
393 |
|
394 |
if isinstance(log_files_output_paths, str):
|
395 |
log_files_output_paths = []
|
@@ -433,7 +502,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
433 |
file_type = ""
|
434 |
out_file_part = anon_file
|
435 |
|
436 |
-
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
437 |
else:
|
438 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
439 |
file_type = detect_file_type(anon_file)
|
@@ -472,14 +541,14 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
472 |
print(anon_df.head()) # Print the first few rows
|
473 |
|
474 |
|
475 |
-
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
|
476 |
|
477 |
else:
|
478 |
sheet_name = ""
|
479 |
anon_df = read_file(anon_file)
|
480 |
out_file_part = get_file_name_without_type(anon_file.name)
|
481 |
|
482 |
-
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
483 |
|
484 |
# Increase latest file completed count unless we are at the last file
|
485 |
if latest_file_completed != len(file_paths):
|
|
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
|
16 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
19 |
from tools.presidio_analyzer_custom import analyze_dict
|
|
|
108 |
|
109 |
decision_process_output_str = '\n'.join(decision_process_output)
|
110 |
|
|
|
|
|
|
|
111 |
return decision_process_output_str
|
112 |
|
113 |
def anon_consistent_names(df):
|
|
|
202 |
|
203 |
return scrubbed_df_consistent_names
|
204 |
|
205 |
+
def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], progress=Progress(track_tqdm=False)):
|
206 |
|
207 |
print("Identifying personal information")
|
208 |
analyse_tic = time.perf_counter()
|
|
|
217 |
else:
|
218 |
in_allow_list_flat = []
|
219 |
|
220 |
+
if isinstance(in_deny_list, pd.DataFrame):
|
221 |
+
if not in_deny_list.empty:
|
222 |
+
in_deny_list = in_deny_list.iloc[:, 0].tolist()
|
223 |
+
else:
|
224 |
+
# Handle the case where the DataFrame is empty
|
225 |
+
in_deny_list = [] # or some default value
|
226 |
+
|
227 |
+
# Sort the strings in order from the longest string to the shortest
|
228 |
+
in_deny_list = sorted(in_deny_list, key=len, reverse=True)
|
229 |
+
|
230 |
+
if in_deny_list:
|
231 |
+
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
232 |
+
new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
|
233 |
+
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
234 |
+
|
235 |
#analyzer = nlp_analyser #AnalyzerEngine()
|
236 |
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
237 |
|
|
|
254 |
# Usage in the main function:
|
255 |
decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
|
256 |
|
|
|
|
|
257 |
analyse_toc = time.perf_counter()
|
258 |
analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
|
259 |
print(analyse_time_out)
|
|
|
297 |
|
298 |
return scrubbed_df, key_string, decision_process_output_str
|
299 |
|
|
|
300 |
|
301 |
+
def anon_wrapper_func(
|
302 |
+
anon_file: str,
|
303 |
+
anon_df: pd.DataFrame,
|
304 |
+
chosen_cols: List[str],
|
305 |
+
out_file_paths: List[str],
|
306 |
+
out_file_part: str,
|
307 |
+
out_message: str,
|
308 |
+
excel_sheet_name: str,
|
309 |
+
anon_strat: str,
|
310 |
+
language: str,
|
311 |
+
chosen_redact_entities: List[str],
|
312 |
+
in_allow_list: List[str],
|
313 |
+
file_type: str,
|
314 |
+
anon_xlsx_export_file_name: str,
|
315 |
+
log_files_output_paths: List[str],
|
316 |
+
in_deny_list: List[str]=[],
|
317 |
+
output_folder: str = output_folder
|
318 |
+
):
|
319 |
+
"""
|
320 |
+
This function wraps the anonymization process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymization strategy, and exports the anonymized data to a file.
|
321 |
+
|
322 |
+
Input Variables:
|
323 |
+
- anon_file: The path to the file containing the data to be anonymized.
|
324 |
+
- anon_df: The pandas DataFrame containing the data to be anonymized.
|
325 |
+
- chosen_cols: A list of column names to be anonymized.
|
326 |
+
- out_file_paths: A list of paths where the anonymized files will be saved.
|
327 |
+
- out_file_part: A part of the output file name.
|
328 |
+
- out_message: A message to be displayed during the anonymization process.
|
329 |
+
- excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
|
330 |
+
- anon_strat: The anonymization strategy to be applied.
|
331 |
+
- language: The language of the data to be anonymized.
|
332 |
+
- chosen_redact_entities: A list of entities to be redacted.
|
333 |
+
- in_allow_list: A list of allowed values.
|
334 |
+
- file_type: The type of file to be exported.
|
335 |
+
- anon_xlsx_export_file_name: The name of the anonymized Excel file.
|
336 |
+
- log_files_output_paths: A list of paths where the log files will be saved.
|
337 |
+
- in_deny_list: List of specific terms to remove from the data.
|
338 |
+
- output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
|
339 |
+
"""
|
340 |
def check_lists(list1, list2):
|
341 |
return any(string in list2 for string in list1)
|
342 |
|
|
|
375 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
376 |
|
377 |
# Anonymise the selected columns
|
378 |
+
anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
|
379 |
|
380 |
# Rejoin the dataframe together
|
381 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
|
|
422 |
|
423 |
return out_file_paths, out_message, key_string, log_files_output_paths
|
424 |
|
425 |
+
def anonymise_data_files(file_paths: List[str], in_text: str, anon_strat: str, chosen_cols: List[str], language: str, chosen_redact_entities: List[str], in_allow_list: List[str] = None, latest_file_completed: int = 0, out_message: list = [], out_file_paths: list = [], log_files_output_paths: list = [], in_excel_sheets: list = [], first_loop_state: bool = False, output_folder: str = output_folder, in_deny_list:list[str]=[], progress: Progress = Progress(track_tqdm=True)):
|
426 |
+
"""
|
427 |
+
This function anonymises data files based on the provided parameters.
|
428 |
+
|
429 |
+
Parameters:
|
430 |
+
- file_paths (List[str]): A list of file paths to anonymise.
|
431 |
+
- in_text (str): The text to anonymise if file_paths is 'open_text'.
|
432 |
+
- anon_strat (str): The anonymisation strategy to use.
|
433 |
+
- chosen_cols (List[str]): A list of column names to anonymise.
|
434 |
+
- language (str): The language of the text to anonymise.
|
435 |
+
- chosen_redact_entities (List[str]): A list of entities to redact.
|
436 |
+
- in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
|
437 |
+
- latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
|
438 |
+
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
439 |
+
- out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
|
440 |
+
- log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
|
441 |
+
- in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
|
442 |
+
- first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
|
443 |
+
- output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
|
444 |
+
- in_deny_list (list[str], optional): A list of specific terms to redact.
|
445 |
+
- progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
|
446 |
+
"""
|
447 |
|
448 |
tic = time.perf_counter()
|
449 |
|
|
|
458 |
if isinstance(out_message, str):
|
459 |
out_message = [out_message]
|
460 |
|
461 |
+
#print("log_files_output_paths:",log_files_output_paths)
|
462 |
|
463 |
if isinstance(log_files_output_paths, str):
|
464 |
log_files_output_paths = []
|
|
|
502 |
file_type = ""
|
503 |
out_file_part = anon_file
|
504 |
|
505 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
|
506 |
else:
|
507 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
508 |
file_type = detect_file_type(anon_file)
|
|
|
541 |
print(anon_df.head()) # Print the first few rows
|
542 |
|
543 |
|
544 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, output_folder=output_folder)
|
545 |
|
546 |
else:
|
547 |
sheet_name = ""
|
548 |
anon_df = read_file(anon_file)
|
549 |
out_file_part = get_file_name_without_type(anon_file.name)
|
550 |
|
551 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
|
552 |
|
553 |
# Increase latest file completed count unless we are at the last file
|
554 |
if latest_file_completed != len(file_paths):
|
tools/file_redaction.py
CHANGED
@@ -375,7 +375,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
375 |
redact_whole_page_list,
|
376 |
max_fuzzy_spelling_mistakes_num,
|
377 |
match_fuzzy_whole_phrase_bool,
|
378 |
-
log_files_output_paths=log_files_output_paths
|
|
|
379 |
|
380 |
# Save Textract request metadata (if exists)
|
381 |
if new_request_metadata:
|
@@ -443,15 +444,15 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
443 |
|
444 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
445 |
|
446 |
-
logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
447 |
-
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
448 |
-
log_files_output_paths.append(logs_output_file_name)
|
449 |
|
450 |
all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
|
451 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
452 |
out_file_paths.append(all_text_output_file_name)
|
453 |
|
454 |
-
# Save the gradio_annotation_boxes to a
|
455 |
try:
|
456 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
457 |
|
@@ -461,15 +462,15 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
461 |
|
462 |
#print("Saved review file to csv")
|
463 |
|
464 |
-
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
465 |
-
with open(out_annotation_file_path, 'w') as f:
|
466 |
-
|
467 |
-
log_files_output_paths.append(out_annotation_file_path)
|
468 |
|
469 |
#print("Saving annotations to JSON")
|
470 |
|
471 |
except Exception as e:
|
472 |
-
print("Could not save annotations to
|
473 |
|
474 |
# Make a combined message for the file
|
475 |
if isinstance(out_message, list):
|
@@ -942,7 +943,8 @@ def redact_image_pdf(file_path:str,
|
|
942 |
match_fuzzy_whole_phrase_bool:bool=True,
|
943 |
page_break_val:int=int(page_break_value),
|
944 |
log_files_output_paths:List=[],
|
945 |
-
max_time:int=int(max_time_value),
|
|
|
946 |
progress=Progress(track_tqdm=True)):
|
947 |
|
948 |
'''
|
@@ -976,7 +978,8 @@ def redact_image_pdf(file_path:str,
|
|
976 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
977 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
978 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
979 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
|
980 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
981 |
|
982 |
The function returns a redacted PDF document along with processing output objects.
|
|
|
375 |
redact_whole_page_list,
|
376 |
max_fuzzy_spelling_mistakes_num,
|
377 |
match_fuzzy_whole_phrase_bool,
|
378 |
+
log_files_output_paths=log_files_output_paths,
|
379 |
+
output_folder=output_folder)
|
380 |
|
381 |
# Save Textract request metadata (if exists)
|
382 |
if new_request_metadata:
|
|
|
444 |
|
445 |
out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
446 |
|
447 |
+
#logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
|
448 |
+
#all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
449 |
+
#log_files_output_paths.append(logs_output_file_name)
|
450 |
|
451 |
all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
|
452 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
453 |
out_file_paths.append(all_text_output_file_name)
|
454 |
|
455 |
+
# Save the gradio_annotation_boxes to a review csv file
|
456 |
try:
|
457 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
458 |
|
|
|
462 |
|
463 |
#print("Saved review file to csv")
|
464 |
|
465 |
+
# out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
466 |
+
# with open(out_annotation_file_path, 'w') as f:
|
467 |
+
# json.dump(annotations_all_pages, f)
|
468 |
+
# log_files_output_paths.append(out_annotation_file_path)
|
469 |
|
470 |
#print("Saving annotations to JSON")
|
471 |
|
472 |
except Exception as e:
|
473 |
+
print("Could not save annotations to csv file:", e)
|
474 |
|
475 |
# Make a combined message for the file
|
476 |
if isinstance(out_message, list):
|
|
|
943 |
match_fuzzy_whole_phrase_bool:bool=True,
|
944 |
page_break_val:int=int(page_break_value),
|
945 |
log_files_output_paths:List=[],
|
946 |
+
max_time:int=int(max_time_value),
|
947 |
+
output_folder:str=output_folder,
|
948 |
progress=Progress(track_tqdm=True)):
|
949 |
|
950 |
'''
|
|
|
978 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
979 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
980 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
981 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
982 |
+
- output_folder (str, optional): The folder for file outputs.
|
983 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
984 |
|
985 |
The function returns a redacted PDF document along with processing output objects.
|
tools/helper_functions.py
CHANGED
@@ -34,6 +34,9 @@ aws_pii_detector = "AWS Comprehend"
|
|
34 |
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
35 |
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
36 |
|
|
|
|
|
|
|
37 |
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
38 |
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
39 |
|
@@ -62,8 +65,6 @@ def reset_state_vars():
|
|
62 |
def reset_review_vars():
|
63 |
return [], pd.DataFrame(), pd.DataFrame()
|
64 |
|
65 |
-
|
66 |
-
|
67 |
def load_in_default_allow_list(allow_list_file_path):
|
68 |
if isinstance(allow_list_file_path, str):
|
69 |
allow_list_file_path = [allow_list_file_path]
|
@@ -269,8 +270,7 @@ def merge_csv_files(file_list):
|
|
269 |
|
270 |
|
271 |
|
272 |
-
async def get_connection_params(request: gr.Request):
|
273 |
-
base_folder = ""
|
274 |
|
275 |
#print("request user:", request.username)
|
276 |
|
@@ -304,17 +304,14 @@ async def get_connection_params(request: gr.Request):
|
|
304 |
|
305 |
if request.username:
|
306 |
out_session_hash = request.username
|
307 |
-
base_folder = "user-files/"
|
308 |
print("Request username found:", out_session_hash)
|
309 |
|
310 |
elif 'x-cognito-id' in request.headers:
|
311 |
out_session_hash = request.headers['x-cognito-id']
|
312 |
-
base_folder = "user-files/"
|
313 |
print("Cognito ID found:", out_session_hash)
|
314 |
|
315 |
elif 'x-amzn-oidc-identity' in request.headers:
|
316 |
out_session_hash = request.headers['x-amzn-oidc-identity']
|
317 |
-
base_folder = "user-files/"
|
318 |
|
319 |
# Fetch email address using Cognito client
|
320 |
cognito_client = boto3.client('cognito-idp')
|
@@ -331,20 +328,23 @@ async def get_connection_params(request: gr.Request):
|
|
331 |
print("Error fetching user details:", e)
|
332 |
email = None
|
333 |
|
334 |
-
|
335 |
print("Cognito ID found:", out_session_hash)
|
336 |
|
337 |
else:
|
338 |
out_session_hash = request.session_hash
|
339 |
-
base_folder = "temp-files/"
|
340 |
-
# print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
|
341 |
|
342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
#if bucket_name:
|
344 |
# print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
|
345 |
|
346 |
return out_session_hash, output_folder, out_session_hash
|
347 |
-
|
348 |
|
349 |
def clean_unicode_text(text):
|
350 |
# Step 1: Normalize unicode characters to decompose any special forms
|
@@ -365,4 +365,20 @@ def clean_unicode_text(text):
|
|
365 |
# Comment this line if you want to keep all Unicode characters.
|
366 |
cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
|
367 |
|
368 |
-
return cleaned_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
35 |
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
36 |
|
37 |
+
session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True')
|
38 |
+
print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
|
39 |
+
|
40 |
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
41 |
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
42 |
|
|
|
65 |
def reset_review_vars():
|
66 |
return [], pd.DataFrame(), pd.DataFrame()
|
67 |
|
|
|
|
|
68 |
def load_in_default_allow_list(allow_list_file_path):
|
69 |
if isinstance(allow_list_file_path, str):
|
70 |
allow_list_file_path = [allow_list_file_path]
|
|
|
270 |
|
271 |
|
272 |
|
273 |
+
async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
|
|
|
274 |
|
275 |
#print("request user:", request.username)
|
276 |
|
|
|
304 |
|
305 |
if request.username:
|
306 |
out_session_hash = request.username
|
|
|
307 |
print("Request username found:", out_session_hash)
|
308 |
|
309 |
elif 'x-cognito-id' in request.headers:
|
310 |
out_session_hash = request.headers['x-cognito-id']
|
|
|
311 |
print("Cognito ID found:", out_session_hash)
|
312 |
|
313 |
elif 'x-amzn-oidc-identity' in request.headers:
|
314 |
out_session_hash = request.headers['x-amzn-oidc-identity']
|
|
|
315 |
|
316 |
# Fetch email address using Cognito client
|
317 |
cognito_client = boto3.client('cognito-idp')
|
|
|
328 |
print("Error fetching user details:", e)
|
329 |
email = None
|
330 |
|
|
|
331 |
print("Cognito ID found:", out_session_hash)
|
332 |
|
333 |
else:
|
334 |
out_session_hash = request.session_hash
|
|
|
|
|
335 |
|
336 |
+
if session_output_folder == 'True':
|
337 |
+
output_folder = output_folder_textbox + out_session_hash + "/"
|
338 |
+
else:
|
339 |
+
output_folder = output_folder_textbox
|
340 |
+
|
341 |
+
if not os.path.exists(output_folder):
|
342 |
+
os.mkdir(output_folder)
|
343 |
+
|
344 |
#if bucket_name:
|
345 |
# print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
|
346 |
|
347 |
return out_session_hash, output_folder, out_session_hash
|
|
|
348 |
|
349 |
def clean_unicode_text(text):
|
350 |
# Step 1: Normalize unicode characters to decompose any special forms
|
|
|
365 |
# Comment this line if you want to keep all Unicode characters.
|
366 |
cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
|
367 |
|
368 |
+
return cleaned_text
|
369 |
+
|
370 |
+
def load_all_output_files(folder_path:str=output_folder) -> List[str]:
|
371 |
+
"""Get the file paths of all files in the given folder."""
|
372 |
+
file_paths = []
|
373 |
+
|
374 |
+
# List all files in the specified folder
|
375 |
+
for filename in os.listdir(folder_path):
|
376 |
+
# Construct full file path
|
377 |
+
full_path = os.path.join(folder_path, filename)
|
378 |
+
# Check if it's a file (not a directory)
|
379 |
+
if os.path.isfile(full_path):
|
380 |
+
file_paths.append(full_path)
|
381 |
+
|
382 |
+
return file_paths
|
383 |
+
|
384 |
+
|
tools/redaction_review.py
CHANGED
@@ -247,7 +247,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
|
|
247 |
|
248 |
return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
|
249 |
|
250 |
-
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
|
251 |
'''
|
252 |
Apply modified redactions to a pymupdf and export review files
|
253 |
'''
|
@@ -363,10 +363,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
363 |
try:
|
364 |
#print("Saving annotations to JSON")
|
365 |
|
366 |
-
out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
|
367 |
-
with open(out_annotation_file_path, 'w') as f:
|
368 |
-
|
369 |
-
output_log_files.append(out_annotation_file_path)
|
370 |
|
371 |
#print("Saving annotations to CSV review file")
|
372 |
|
@@ -379,7 +379,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
379 |
output_files.append(out_review_file_file_path)
|
380 |
|
381 |
except Exception as e:
|
382 |
-
print("Could not save annotations to
|
383 |
|
384 |
return doc, all_image_annotations, output_files, output_log_files
|
385 |
|
@@ -535,7 +535,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
|
|
535 |
|
536 |
return xml_str
|
537 |
|
538 |
-
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
|
539 |
'''
|
540 |
Load in files to convert a review file into an Adobe comment file format
|
541 |
'''
|
@@ -586,7 +586,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
|
|
586 |
|
587 |
### Convert xfdf coordinates back to image for app
|
588 |
|
589 |
-
def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
|
590 |
'''
|
591 |
Converts coordinates from Adobe PDF space to image space.
|
592 |
|
@@ -660,7 +660,7 @@ def parse_xfdf(xfdf_path):
|
|
660 |
|
661 |
return redactions
|
662 |
|
663 |
-
def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
|
664 |
'''
|
665 |
Convert redaction annotations from XFDF and associated images into a DataFrame.
|
666 |
|
|
|
247 |
|
248 |
return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
|
249 |
|
250 |
+
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, output_folder:str = output_folder, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
|
251 |
'''
|
252 |
Apply modified redactions to a pymupdf and export review files
|
253 |
'''
|
|
|
363 |
try:
|
364 |
#print("Saving annotations to JSON")
|
365 |
|
366 |
+
# out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
|
367 |
+
# with open(out_annotation_file_path, 'w') as f:
|
368 |
+
# json.dump(all_image_annotations, f)
|
369 |
+
# output_log_files.append(out_annotation_file_path)
|
370 |
|
371 |
#print("Saving annotations to CSV review file")
|
372 |
|
|
|
379 |
output_files.append(out_review_file_file_path)
|
380 |
|
381 |
except Exception as e:
|
382 |
+
print("Could not save annotations to csv file:", e)
|
383 |
|
384 |
return doc, all_image_annotations, output_files, output_log_files
|
385 |
|
|
|
535 |
|
536 |
return xml_str
|
537 |
|
538 |
+
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder):
|
539 |
'''
|
540 |
Load in files to convert a review file into an Adobe comment file format
|
541 |
'''
|
|
|
586 |
|
587 |
### Convert xfdf coordinates back to image for app
|
588 |
|
589 |
+
def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
|
590 |
'''
|
591 |
Converts coordinates from Adobe PDF space to image space.
|
592 |
|
|
|
660 |
|
661 |
return redactions
|
662 |
|
663 |
+
def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=output_folder):
|
664 |
'''
|
665 |
Convert redaction annotations from XFDF and associated images into a DataFrame.
|
666 |
|