seanpedrickcase committed on
Commit
dacc782
·
1 Parent(s): f13e98b

Allowed output files to be saved into user-specific folders. Added deny list capability to xlsx/csv file redaction

Browse files
Dockerfile CHANGED
@@ -63,7 +63,7 @@ RUN mkdir -p /home/user/app/output \
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
  # Download NLTK data packages
66
- RUN python -m nltk.downloader punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
 
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
  # Download NLTK data packages
66
+ RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
app.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
@@ -70,37 +70,37 @@ with app:
70
  pdf_doc_state = gr.State([])
71
  all_image_annotations_state = gr.State([])
72
 
73
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
74
- all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
75
- review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
76
 
77
- session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
78
- s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
 
79
 
80
- first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
81
- second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
82
- do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
83
 
84
- prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
85
- images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
86
 
87
- output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
88
- output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
89
- text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
90
- log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
91
 
92
 
93
  # Logging state
94
  log_file_name = 'log.csv'
95
 
96
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
97
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
98
- access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
99
- access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
100
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
101
- usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
102
-
103
- # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
104
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
105
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
106
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
@@ -122,10 +122,10 @@ with app:
122
 
123
  ## Annotator zoom value
124
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
- zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
126
- zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
127
 
128
- clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
129
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
130
 
131
  ## Settings page variables
@@ -352,8 +352,12 @@ with app:
352
  log_files_output = gr.File(label="Log file output", interactive=False)
353
 
354
  with gr.Accordion("Combine multiple review files", open = False):
355
- multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
356
- merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
 
 
 
 
357
 
358
 
359
  ### UI INTERACTION ###
@@ -364,12 +368,12 @@ with app:
364
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
365
 
366
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
367
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
368
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
369
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
370
 
371
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
372
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
373
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
374
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
375
 
@@ -391,17 +395,17 @@ with app:
391
  annotate_current_page.submit(
392
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
393
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
394
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
395
 
396
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
397
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
398
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
399
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
400
 
401
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
402
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
403
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
404
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
405
 
406
  # Zoom in and out on annotator
407
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -415,13 +419,13 @@ with app:
415
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
416
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
417
 
418
- annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
419
 
420
  # Page controls at bottom
421
  annotate_current_page_bottom.submit(
422
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
423
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
424
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
425
 
426
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
427
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -431,7 +435,7 @@ with app:
431
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
432
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
433
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
434
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
435
 
436
  # Review table controls
437
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
@@ -439,28 +443,28 @@ with app:
439
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
440
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
441
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
442
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
443
 
444
  # Convert review file to xfdf Adobe format
445
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
446
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
447
- then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
448
 
449
  # Convert xfdf Adobe file back to review_file.csv
450
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
451
  then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
452
- then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
453
 
454
  ###
455
  # TABULAR DATA REDACTION
456
- ###
457
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
458
  then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
459
 
460
- tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
461
 
462
  # If the output file count text box changes, keep going with redacting each data file until done
463
- text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
464
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
465
 
466
  ###
@@ -479,6 +483,10 @@ with app:
479
 
480
  # Merge multiple review csv files together
481
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
 
 
 
482
 
483
 
484
  ###
@@ -486,7 +494,7 @@ with app:
486
  ###
487
 
488
  # Get connection details on app load
489
- app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
490
 
491
  # If running on AWS, load in the default allow list file from S3
492
  # if RUN_AWS_FUNCTIONS == "1":
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 
70
  pdf_doc_state = gr.State([])
71
  all_image_annotations_state = gr.State([])
72
 
73
+ all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas")
74
+ all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas")
75
+ review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas")
76
 
77
+ session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
78
+ s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
79
+ output_folder_textbox = gr.Textbox(value = output_folder, label="output_folder_textbox", visible=False)
80
 
81
+ first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
82
+ second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
83
+ do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
84
 
85
+ prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
86
+ images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
87
 
88
+ output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
89
+ output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
90
+ text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
91
+ log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
92
 
93
 
94
  # Logging state
95
  log_file_name = 'log.csv'
96
 
97
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False)
98
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False)
99
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False)
100
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False)
101
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False)
102
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False)
103
+
 
104
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
105
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
106
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
 
122
 
123
  ## Annotator zoom value
124
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
+ zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
126
+ zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
127
 
128
+ clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
129
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
130
 
131
  ## Settings page variables
 
352
  log_files_output = gr.File(label="Log file output", interactive=False)
353
 
354
  with gr.Accordion("Combine multiple review files", open = False):
355
+ multiple_review_files_in_out = gr.File(label="Combine multiple review_file.csv files together here.", file_count='multiple', file_types=['.csv'])
356
+ merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
357
+
358
+ with gr.Accordion("View all output files from this session", open = False):
359
+ all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
360
+ all_output_files = gr.File(label="All output files.", file_count='multiple', file_types=['.csv'], interactive=False)
361
 
362
 
363
  ### UI INTERACTION ###
 
368
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
369
 
370
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
371
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
372
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
373
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
374
 
375
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
376
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
377
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
378
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
379
 
 
395
  annotate_current_page.submit(
396
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
397
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
398
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
399
 
400
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
401
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
402
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
403
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
404
 
405
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
406
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
407
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
408
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
409
 
410
  # Zoom in and out on annotator
411
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
419
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
420
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
421
 
422
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
423
 
424
  # Page controls at bottom
425
  annotate_current_page_bottom.submit(
426
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
427
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
428
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
429
 
430
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
431
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
435
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
436
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
437
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
438
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
439
 
440
  # Review table controls
441
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
443
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
444
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
445
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
446
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
447
 
448
  # Convert review file to xfdf Adobe format
449
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
450
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
451
+ then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[adobe_review_files_out])
452
 
453
  # Convert xfdf Adobe file back to review_file.csv
454
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
455
  then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
456
+ then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
457
 
458
  ###
459
  # TABULAR DATA REDACTION
460
+ ###
461
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
462
  then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
463
 
464
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
465
 
466
  # If the output file count text box changes, keep going with redacting each data file until done
467
+ text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
468
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
469
 
470
  ###
 
483
 
484
  # Merge multiple review csv files together
485
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
486
+
487
+
488
+ # List every file currently in the session's output folder when the user requests it
489
+ all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
490
 
491
 
492
  ###
 
494
  ###
495
 
496
  # Get connection details on app load
497
+ app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
498
 
499
  # If running on AWS, load in the default allow list file from S3
500
  # if RUN_AWS_FUNCTIONS == "1":
tools/data_anonymise.py CHANGED
@@ -13,7 +13,7 @@ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
  from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
19
  from tools.presidio_analyzer_custom import analyze_dict
@@ -108,9 +108,6 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
108
 
109
  decision_process_output_str = '\n'.join(decision_process_output)
110
 
111
- print("decision_process_output_str:\n\n", decision_process_output_str)
112
-
113
-
114
  return decision_process_output_str
115
 
116
  def anon_consistent_names(df):
@@ -205,7 +202,7 @@ def anon_consistent_names(df):
205
 
206
  return scrubbed_df_consistent_names
207
 
208
- def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
209
 
210
  print("Identifying personal information")
211
  analyse_tic = time.perf_counter()
@@ -220,6 +217,21 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
220
  else:
221
  in_allow_list_flat = []
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  #analyzer = nlp_analyser #AnalyzerEngine()
224
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
225
 
@@ -242,8 +254,6 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
242
  # Usage in the main function:
243
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
244
 
245
- #print("decision_process_output_str:\n\n", decision_process_output_str)
246
-
247
  analyse_toc = time.perf_counter()
248
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
249
  print(analyse_time_out)
@@ -287,8 +297,46 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
287
 
288
  return scrubbed_df, key_string, decision_process_output_str
289
 
290
- def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  def check_lists(list1, list2):
293
  return any(string in list2 for string in list1)
294
 
@@ -327,7 +375,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
327
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
328
 
329
  # Anonymise the selected columns
330
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
331
 
332
  # Rejoin the dataframe together
333
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -374,7 +422,28 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
374
 
375
  return out_file_paths, out_message, key_string, log_files_output_paths
376
 
377
- def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
  tic = time.perf_counter()
380
 
@@ -389,7 +458,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
389
  if isinstance(out_message, str):
390
  out_message = [out_message]
391
 
392
- print("log_files_output_paths:",log_files_output_paths)
393
 
394
  if isinstance(log_files_output_paths, str):
395
  log_files_output_paths = []
@@ -433,7 +502,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
433
  file_type = ""
434
  out_file_part = anon_file
435
 
436
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
437
  else:
438
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
439
  file_type = detect_file_type(anon_file)
@@ -472,14 +541,14 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
472
  print(anon_df.head()) # Print the first few rows
473
 
474
 
475
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
476
 
477
  else:
478
  sheet_name = ""
479
  anon_df = read_file(anon_file)
480
  out_file_part = get_file_name_without_type(anon_file.name)
481
 
482
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
483
 
484
  # Increase latest file completed count unless we are at the last file
485
  if latest_file_completed != len(file_paths):
 
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
  from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
17
 
18
  # Use custom version of analyze_dict to be able to track progress
19
  from tools.presidio_analyzer_custom import analyze_dict
 
108
 
109
  decision_process_output_str = '\n'.join(decision_process_output)
110
 
 
 
 
111
  return decision_process_output_str
112
 
113
  def anon_consistent_names(df):
 
202
 
203
  return scrubbed_df_consistent_names
204
 
205
+ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], progress=Progress(track_tqdm=False)):
206
 
207
  print("Identifying personal information")
208
  analyse_tic = time.perf_counter()
 
217
  else:
218
  in_allow_list_flat = []
219
 
220
+ if isinstance(in_deny_list, pd.DataFrame):
221
+ if not in_deny_list.empty:
222
+ in_deny_list = in_deny_list.iloc[:, 0].tolist()
223
+ else:
224
+ # Handle the case where the DataFrame is empty
225
+ in_deny_list = [] # or some default value
226
+
227
+ # Sort the strings in order from the longest string to the shortest
228
+ in_deny_list = sorted(in_deny_list, key=len, reverse=True)
229
+
230
+ if in_deny_list:
231
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
232
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
233
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
234
+
235
  #analyzer = nlp_analyser #AnalyzerEngine()
236
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
237
 
 
254
  # Usage in the main function:
255
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
256
 
 
 
257
  analyse_toc = time.perf_counter()
258
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
259
  print(analyse_time_out)
 
297
 
298
  return scrubbed_df, key_string, decision_process_output_str
299
 
 
300
 
301
+ def anon_wrapper_func(
302
+ anon_file: str,
303
+ anon_df: pd.DataFrame,
304
+ chosen_cols: List[str],
305
+ out_file_paths: List[str],
306
+ out_file_part: str,
307
+ out_message: str,
308
+ excel_sheet_name: str,
309
+ anon_strat: str,
310
+ language: str,
311
+ chosen_redact_entities: List[str],
312
+ in_allow_list: List[str],
313
+ file_type: str,
314
+ anon_xlsx_export_file_name: str,
315
+ log_files_output_paths: List[str],
316
+ in_deny_list: List[str]=[],
317
+ output_folder: str = output_folder
318
+ ):
319
+ """
320
+ This function wraps the anonymization process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymization strategy, and exports the anonymized data to a file.
321
+
322
+ Input Variables:
323
+ - anon_file: The path to the file containing the data to be anonymized.
324
+ - anon_df: The pandas DataFrame containing the data to be anonymized.
325
+ - chosen_cols: A list of column names to be anonymized.
326
+ - out_file_paths: A list of paths where the anonymized files will be saved.
327
+ - out_file_part: A part of the output file name.
328
+ - out_message: A message to be displayed during the anonymization process.
329
+ - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
330
+ - anon_strat: The anonymization strategy to be applied.
331
+ - language: The language of the data to be anonymized.
332
+ - chosen_redact_entities: A list of entities to be redacted.
333
+ - in_allow_list: A list of allowed values.
334
+ - file_type: The type of file to be exported.
335
+ - anon_xlsx_export_file_name: The name of the anonymized Excel file.
336
+ - log_files_output_paths: A list of paths where the log files will be saved.
337
+ - in_deny_list: List of specific terms to remove from the data.
338
+ - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
339
+ """
340
  def check_lists(list1, list2):
341
  return any(string in list2 for string in list1)
342
 
 
375
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
376
 
377
  # Anonymise the selected columns
378
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
379
 
380
  # Rejoin the dataframe together
381
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
 
422
 
423
  return out_file_paths, out_message, key_string, log_files_output_paths
424
 
425
+ def anonymise_data_files(file_paths: List[str], in_text: str, anon_strat: str, chosen_cols: List[str], language: str, chosen_redact_entities: List[str], in_allow_list: List[str] = None, latest_file_completed: int = 0, out_message: list = [], out_file_paths: list = [], log_files_output_paths: list = [], in_excel_sheets: list = [], first_loop_state: bool = False, output_folder: str = output_folder, in_deny_list:list[str]=[], progress: Progress = Progress(track_tqdm=True)):
426
+ """
427
+ This function anonymises data files based on the provided parameters.
428
+
429
+ Parameters:
430
+ - file_paths (List[str]): A list of file paths to anonymise.
431
+ - in_text (str): The text to anonymise if file_paths is 'open_text'.
432
+ - anon_strat (str): The anonymisation strategy to use.
433
+ - chosen_cols (List[str]): A list of column names to anonymise.
434
+ - language (str): The language of the text to anonymise.
435
+ - chosen_redact_entities (List[str]): A list of entities to redact.
436
+ - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
437
+ - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
438
+ - out_message (list, optional): A list to store output messages. Defaults to an empty list.
439
+ - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
440
+ - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
441
+ - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
442
+ - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
443
+ - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
444
+ - in_deny_list (list[str], optional): A list of specific terms to redact.
445
+ - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
446
+ """
447
 
448
  tic = time.perf_counter()
449
 
 
458
  if isinstance(out_message, str):
459
  out_message = [out_message]
460
 
461
+ #print("log_files_output_paths:",log_files_output_paths)
462
 
463
  if isinstance(log_files_output_paths, str):
464
  log_files_output_paths = []
 
502
  file_type = ""
503
  out_file_part = anon_file
504
 
505
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
506
  else:
507
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
508
  file_type = detect_file_type(anon_file)
 
541
  print(anon_df.head()) # Print the first few rows
542
 
543
 
544
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, output_folder=output_folder)
545
 
546
  else:
547
  sheet_name = ""
548
  anon_df = read_file(anon_file)
549
  out_file_part = get_file_name_without_type(anon_file.name)
550
 
551
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
552
 
553
  # Increase latest file completed count unless we are at the last file
554
  if latest_file_completed != len(file_paths):
tools/file_redaction.py CHANGED
@@ -375,7 +375,8 @@ def choose_and_run_redactor(file_paths:List[str],
375
  redact_whole_page_list,
376
  max_fuzzy_spelling_mistakes_num,
377
  match_fuzzy_whole_phrase_bool,
378
- log_files_output_paths=log_files_output_paths)
 
379
 
380
  # Save Textract request metadata (if exists)
381
  if new_request_metadata:
@@ -443,15 +444,15 @@ def choose_and_run_redactor(file_paths:List[str],
443
 
444
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
445
 
446
- logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
447
- all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
448
- log_files_output_paths.append(logs_output_file_name)
449
 
450
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
451
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
452
  out_file_paths.append(all_text_output_file_name)
453
 
454
- # Save the gradio_annotation_boxes to a JSON file
455
  try:
456
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
457
 
@@ -461,15 +462,15 @@ def choose_and_run_redactor(file_paths:List[str],
461
 
462
  #print("Saved review file to csv")
463
 
464
- out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
465
- with open(out_annotation_file_path, 'w') as f:
466
- json.dump(annotations_all_pages, f)
467
- log_files_output_paths.append(out_annotation_file_path)
468
 
469
  #print("Saving annotations to JSON")
470
 
471
  except Exception as e:
472
- print("Could not save annotations to json or csv file:", e)
473
 
474
  # Make a combined message for the file
475
  if isinstance(out_message, list):
@@ -942,7 +943,8 @@ def redact_image_pdf(file_path:str,
942
  match_fuzzy_whole_phrase_bool:bool=True,
943
  page_break_val:int=int(page_break_value),
944
  log_files_output_paths:List=[],
945
- max_time:int=int(max_time_value),
 
946
  progress=Progress(track_tqdm=True)):
947
 
948
  '''
@@ -976,7 +978,8 @@ def redact_image_pdf(file_path:str,
976
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
977
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
978
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
979
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
980
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
981
 
982
  The function returns a redacted PDF document along with processing output objects.
 
375
  redact_whole_page_list,
376
  max_fuzzy_spelling_mistakes_num,
377
  match_fuzzy_whole_phrase_bool,
378
+ log_files_output_paths=log_files_output_paths,
379
+ output_folder=output_folder)
380
 
381
  # Save Textract request metadata (if exists)
382
  if new_request_metadata:
 
444
 
445
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
446
 
447
+ #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
448
+ #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
449
+ #log_files_output_paths.append(logs_output_file_name)
450
 
451
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
452
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
453
  out_file_paths.append(all_text_output_file_name)
454
 
455
+ # Save the gradio_annotation_boxes to a review csv file
456
  try:
457
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
458
 
 
462
 
463
  #print("Saved review file to csv")
464
 
465
+ # out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
466
+ # with open(out_annotation_file_path, 'w') as f:
467
+ # json.dump(annotations_all_pages, f)
468
+ # log_files_output_paths.append(out_annotation_file_path)
469
 
470
  #print("Saving annotations to JSON")
471
 
472
  except Exception as e:
473
+ print("Could not save annotations to csv file:", e)
474
 
475
  # Make a combined message for the file
476
  if isinstance(out_message, list):
 
943
  match_fuzzy_whole_phrase_bool:bool=True,
944
  page_break_val:int=int(page_break_value),
945
  log_files_output_paths:List=[],
946
+ max_time:int=int(max_time_value),
947
+ output_folder:str=output_folder,
948
  progress=Progress(track_tqdm=True)):
949
 
950
  '''
 
978
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
979
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
980
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
981
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
982
+ - output_folder (str, optional): The folder for file outputs.
983
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
984
 
985
  The function returns a redacted PDF document along with processing output objects.
tools/helper_functions.py CHANGED
@@ -34,6 +34,9 @@ aws_pii_detector = "AWS Comprehend"
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
 
 
 
37
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
 
@@ -62,8 +65,6 @@ def reset_state_vars():
62
  def reset_review_vars():
63
  return [], pd.DataFrame(), pd.DataFrame()
64
 
65
-
66
-
67
  def load_in_default_allow_list(allow_list_file_path):
68
  if isinstance(allow_list_file_path, str):
69
  allow_list_file_path = [allow_list_file_path]
@@ -269,8 +270,7 @@ def merge_csv_files(file_list):
269
 
270
 
271
 
272
- async def get_connection_params(request: gr.Request):
273
- base_folder = ""
274
 
275
  #print("request user:", request.username)
276
 
@@ -304,17 +304,14 @@ async def get_connection_params(request: gr.Request):
304
 
305
  if request.username:
306
  out_session_hash = request.username
307
- base_folder = "user-files/"
308
  print("Request username found:", out_session_hash)
309
 
310
  elif 'x-cognito-id' in request.headers:
311
  out_session_hash = request.headers['x-cognito-id']
312
- base_folder = "user-files/"
313
  print("Cognito ID found:", out_session_hash)
314
 
315
  elif 'x-amzn-oidc-identity' in request.headers:
316
  out_session_hash = request.headers['x-amzn-oidc-identity']
317
- base_folder = "user-files/"
318
 
319
  # Fetch email address using Cognito client
320
  cognito_client = boto3.client('cognito-idp')
@@ -331,20 +328,23 @@ async def get_connection_params(request: gr.Request):
331
  print("Error fetching user details:", e)
332
  email = None
333
 
334
-
335
  print("Cognito ID found:", out_session_hash)
336
 
337
  else:
338
  out_session_hash = request.session_hash
339
- base_folder = "temp-files/"
340
- # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
341
 
342
- output_folder = base_folder + out_session_hash + "/"
 
 
 
 
 
 
 
343
  #if bucket_name:
344
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
345
 
346
  return out_session_hash, output_folder, out_session_hash
347
-
348
 
349
  def clean_unicode_text(text):
350
  # Step 1: Normalize unicode characters to decompose any special forms
@@ -365,4 +365,20 @@ def clean_unicode_text(text):
365
  # Comment this line if you want to keep all Unicode characters.
366
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
367
 
368
- return cleaned_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
37
+ session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True')
38
+ print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
39
+
40
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
41
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
42
 
 
65
  def reset_review_vars():
66
  return [], pd.DataFrame(), pd.DataFrame()
67
 
 
 
68
  def load_in_default_allow_list(allow_list_file_path):
69
  if isinstance(allow_list_file_path, str):
70
  allow_list_file_path = [allow_list_file_path]
 
270
 
271
 
272
 
273
+ async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
 
274
 
275
  #print("request user:", request.username)
276
 
 
304
 
305
  if request.username:
306
  out_session_hash = request.username
 
307
  print("Request username found:", out_session_hash)
308
 
309
  elif 'x-cognito-id' in request.headers:
310
  out_session_hash = request.headers['x-cognito-id']
 
311
  print("Cognito ID found:", out_session_hash)
312
 
313
  elif 'x-amzn-oidc-identity' in request.headers:
314
  out_session_hash = request.headers['x-amzn-oidc-identity']
 
315
 
316
  # Fetch email address using Cognito client
317
  cognito_client = boto3.client('cognito-idp')
 
328
  print("Error fetching user details:", e)
329
  email = None
330
 
 
331
  print("Cognito ID found:", out_session_hash)
332
 
333
  else:
334
  out_session_hash = request.session_hash
 
 
335
 
336
+ if session_output_folder == 'True':
337
+ output_folder = output_folder_textbox + out_session_hash + "/"
338
+ else:
339
+ output_folder = output_folder_textbox
340
+
341
+ if not os.path.exists(output_folder):
342
+ os.mkdir(output_folder)
343
+
344
  #if bucket_name:
345
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
346
 
347
  return out_session_hash, output_folder, out_session_hash
 
348
 
349
  def clean_unicode_text(text):
350
  # Step 1: Normalize unicode characters to decompose any special forms
 
365
  # Comment this line if you want to keep all Unicode characters.
366
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
367
 
368
+ return cleaned_text
369
+
370
+ def load_all_output_files(folder_path:str=output_folder) -> List[str]:
371
+ """Get the file paths of all files in the given folder."""
372
+ file_paths = []
373
+
374
+ # List all files in the specified folder
375
+ for filename in os.listdir(folder_path):
376
+ # Construct full file path
377
+ full_path = os.path.join(folder_path, filename)
378
+ # Check if it's a file (not a directory)
379
+ if os.path.isfile(full_path):
380
+ file_paths.append(full_path)
381
+
382
+ return file_paths
383
+
384
+
tools/redaction_review.py CHANGED
@@ -247,7 +247,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
247
 
248
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
249
 
250
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
251
  '''
252
  Apply modified redactions to a pymupdf and export review files
253
  '''
@@ -363,10 +363,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
363
  try:
364
  #print("Saving annotations to JSON")
365
 
366
- out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
367
- with open(out_annotation_file_path, 'w') as f:
368
- json.dump(all_image_annotations, f)
369
- output_log_files.append(out_annotation_file_path)
370
 
371
  #print("Saving annotations to CSV review file")
372
 
@@ -379,7 +379,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
379
  output_files.append(out_review_file_file_path)
380
 
381
  except Exception as e:
382
- print("Could not save annotations to json or csv file:", e)
383
 
384
  return doc, all_image_annotations, output_files, output_log_files
385
 
@@ -535,7 +535,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
535
 
536
  return xml_str
537
 
538
- def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
@@ -586,7 +586,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
586
 
587
  ### Convert xfdf coordinates back to image for app
588
 
589
- def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
590
  '''
591
  Converts coordinates from Adobe PDF space to image space.
592
 
@@ -660,7 +660,7 @@ def parse_xfdf(xfdf_path):
660
 
661
  return redactions
662
 
663
- def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
664
  '''
665
  Convert redaction annotations from XFDF and associated images into a DataFrame.
666
 
 
247
 
248
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
249
 
250
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, output_folder:str = output_folder, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
251
  '''
252
  Apply modified redactions to a pymupdf and export review files
253
  '''
 
363
  try:
364
  #print("Saving annotations to JSON")
365
 
366
+ # out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
367
+ # with open(out_annotation_file_path, 'w') as f:
368
+ # json.dump(all_image_annotations, f)
369
+ # output_log_files.append(out_annotation_file_path)
370
 
371
  #print("Saving annotations to CSV review file")
372
 
 
379
  output_files.append(out_review_file_file_path)
380
 
381
  except Exception as e:
382
+ print("Could not save annotations to csv file:", e)
383
 
384
  return doc, all_image_annotations, output_files, output_log_files
385
 
 
535
 
536
  return xml_str
537
 
538
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
 
586
 
587
  ### Convert xfdf coordinates back to image for app
588
 
589
+ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
590
  '''
591
  Converts coordinates from Adobe PDF space to image space.
592
 
 
660
 
661
  return redactions
662
 
663
+ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=output_folder):
664
  '''
665
  Convert redaction annotations from XFDF and associated images into a DataFrame.
666