seanpedrickcase committed
Commit ff290e1 · Parent: dacc782

Integrated AWS Comprehend and fuzzy matching functions with tabular data redaction.
DocRedactApp_0.4.0.spec ADDED
@@ -0,0 +1,66 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_all
+
+datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
+binaries = []
+hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_image_annotation')
+tmp_ret = collect_all('gradio_image_annotation')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('safehttpx')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_analyzer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_anonymizer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_image_redactor')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+
+
+a = Analysis(
+    ['app.py'],
+    pathex=[],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=['build_deps'],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    noarchive=False,
+    optimize=0,
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+pyz = PYZ(a.pure)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    [],
+    exclude_binaries=True,
+    name='DocRedactApp_0.4.0',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
+coll = COLLECT(
+    exe,
+    a.binaries,
+    a.datas,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    name='DocRedactApp_0.4.0',
+)
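Note on the spec above: PyInstaller's collect_all() returns a (datas, binaries, hiddenimports) tuple, which is why each tmp_ret is unpacked by index. A minimal sketch of the same collection step with tuple unpacking (the include_package helper is illustrative, not part of the commit):

    from PyInstaller.utils.hooks import collect_all

    def include_package(name, datas, binaries, hiddenimports):
        # collect_all returns (datas, binaries, hiddenimports) for the named package
        pkg_datas, pkg_binaries, pkg_hidden = collect_all(name)
        datas += pkg_datas
        binaries += pkg_binaries
        hiddenimports += pkg_hidden

    for pkg in ['gradio_image_annotation', 'safehttpx', 'presidio_analyzer',
                'presidio_anonymizer', 'presidio_image_redactor']:
        include_package(pkg, datas, binaries, hiddenimports)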
README.md CHANGED
@@ -317,8 +317,8 @@ The Redaction Settings tab now has boxes for entering the AWS access key and sec
 ### Picking up AWS access keys through an .env file
 The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
 
- AWS_ACCESS_KEY=<your-access-key>
- AWS_SECRET_KEY=<your-secret-key>
+ AWS_ACCESS_KEY= your-access-key
+ AWS_SECRET_KEY= your-secret-key
 
 The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
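For context, a minimal sketch of how an app can pick up these keys, assuming python-dotenv is available (the variable names come from the README; the client creation mirrors the boto3 calls added elsewhere in this commit):

    import os
    import boto3
    from dotenv import load_dotenv

    # Default .env location, overridable via the AWS_CONFIG_PATH environment variable
    load_dotenv(os.environ.get("AWS_CONFIG_PATH", "config/aws_config.env"))

    comprehend_client = boto3.client(
        "comprehend",
        aws_access_key_id=os.environ["AWS_ACCESS_KEY"],
        aws_secret_access_key=os.environ["AWS_SECRET_KEY"],
    )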
app.py CHANGED
@@ -282,6 +282,8 @@ with app:
     in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
     in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
+
+    pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
 
     tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
 
@@ -347,7 +349,7 @@
     aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
     with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
-        anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
+        anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with 'REDACTED'")
 
     log_files_output = gr.File(label="Log file output", interactive=False)
 
@@ -461,10 +463,10 @@
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###
@@ -480,15 +482,12 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
-
     # Merge multiple review csv files together
     merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
-
     #
     all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
 
-
     ###
     # APP LOAD AND LOGGING
     ###
@@ -567,7 +566,6 @@ if __name__ == "__main__":
         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
         current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
 
-
     # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
     # with gr.Tab(label="Advanced options"):
     #     with gr.Accordion(label = "AWS data access", open = True):
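The pair of wired events above implements the app's per-file loop: the button click anonymises the first file, and every change to the files-done counter re-invokes anonymise_data_files for the next file until all inputs are processed. A minimal sketch of the pattern with illustrative names (process_one is not from the app):

    import gradio as gr

    def process_one(files, done_count):
        # Handle one file per call; returning an incremented counter re-fires the change event below
        if not files or done_count >= len(files):
            return "All files processed", done_count
        return f"Processed {files[int(done_count)]}", done_count + 1

    with gr.Blocks() as demo:
        files_in = gr.File(file_count="multiple")
        status = gr.Textbox()
        files_done = gr.Number(value=0)

        gr.Button("Go").click(fn=process_one, inputs=[files_in, files_done], outputs=[status, files_done])
        # The counter change keeps the loop going until process_one stops incrementing it
        files_done.change(fn=process_one, inputs=[files_in, files_done], outputs=[status, files_done])

    demo.launch()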
how_to_create_exe_dist.txt CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.3.0 app.py
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -32,12 +32,12 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.3.0.spec
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
 
-10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
+10. Go to dist/APP-NAME/_internal/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
 
 def create_or_modify_pyi(
     component_class: type, class_name: str, events: list[str | EventListener]
tools/custom_image_analyser_engine.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 import time
 import cv2
 import copy
+import botocore
 from copy import deepcopy
 from pdfminer.layout import LTChar
 import PIL
@@ -399,12 +400,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast
 
-def bounding_boxes_overlap(box1, box2):
+def bounding_boxes_overlap(box1:List, box2:List):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
 
-def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
+def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
     for entity in page_analyser_result:
         entity_start = entity.start
         entity_end = entity.end
@@ -442,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
 
     return all_text_line_results
 
-def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not response or "Entities" not in response:
         return all_text_line_results
 
@@ -489,7 +490,7 @@ def map_back_comprehend_entity_results(response, current_batch_mapping, allow_li
 
     return all_text_line_results
 
-def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple], comprehend_client:botocore.client.BaseClient, language:str, allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not current_batch:
         return all_text_line_results
 
@@ -913,7 +914,8 @@
         ocr_results_with_children: Dict[str, Dict],
         chosen_redact_comprehend_entities: List[str],
         pii_identification_method: str = "Local",
-        comprehend_client = "",
+        comprehend_client = "",
+        custom_entities:List[str]=custom_entities,
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
tools/data_anonymise.py CHANGED
@@ -2,6 +2,8 @@ import re
 import secrets
 import base64
 import time
+import boto3
+import botocore
 import pandas as pd
 
 from faker import Faker
@@ -11,9 +13,11 @@ from typing import List, Dict, Any
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
+from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 
 from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
+from tools.custom_image_analyser_engine import do_aws_comprehend_call
 
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
@@ -202,101 +206,198 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], progress=Progress(track_tqdm=False)):
-
-    print("Identifying personal information")
-    analyse_tic = time.perf_counter()
-
-    key_string = ""
-
-    # DataFrame to dict
-    df_dict = df.to_dict(orient="list")
-
-    if in_allow_list:
-        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
-    else:
-        in_allow_list_flat = []
-
-    if isinstance(in_deny_list, pd.DataFrame):
-        if not in_deny_list.empty:
-            in_deny_list = in_deny_list.iloc[:, 0].tolist()
-        else:
-            # Handle the case where the DataFrame is empty
-            in_deny_list = [] # or some default value
-
-    # Sort the strings in order from the longest string to the shortest
-    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
-
-    if in_deny_list:
-        nlp_analyser.registry.remove_recognizer("CUSTOM")
-        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
-        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
-
-    #analyzer = nlp_analyser #AnalyzerEngine()
-    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
-
-    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
-
-    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
-
-    #print("Allow list:", in_allow_list)
-    #print("Input data keys:", df_dict.keys())
-
-    # Use custom analyzer to be able to track progress with Gradio
-    analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
-                                    entities=chosen_redact_entities,
-                                    score_threshold=score_threshold,
-                                    return_decision_process=True,
-                                    allow_list=in_allow_list_flat)
-
-    analyzer_results = list(analyzer_results)
-
-    # Usage in the main function:
-    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
-
-    analyse_toc = time.perf_counter()
-    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
-    print(analyse_time_out)
-
-    # Create faker function (note that it has to receive a value)
-    fake = Faker("en_UK")
-
-    def fake_first_name(x):
-        return fake.first_name()
-
-    # Set up the anonymization configuration WITHOUT DATE_TIME
-    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
-    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
-    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
-    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
-    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
-    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
-    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
-
-    if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
-    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
-    if anon_strat == "redact": chosen_mask_config = redact_config
-    if anon_strat == "hash": chosen_mask_config = hash_config
-    if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
-        chosen_mask_config = people_encrypt_config
-        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
-        key = secrets.token_bytes(16) # 128 bits = 16 bytes
-        key_string = base64.b64encode(key).decode('utf-8')
-    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
-
-    # I think in general people will want to keep date / times
-    keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
-
-    combined_config = {**chosen_mask_config, **keep_date_config}
-    combined_config
-
-    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
-
-    scrubbed_df = pd.DataFrame(anonymizer_results)
-
-    return scrubbed_df, key_string, decision_process_output_str
-
+def anonymise_data_files(file_paths: List[str],
+                         in_text: str,
+                         anon_strat: str,
+                         chosen_cols: List[str],
+                         language: str,
+                         chosen_redact_entities: List[str],
+                         in_allow_list: List[str] = None,
+                         latest_file_completed: int = 0,
+                         out_message: list = [],
+                         out_file_paths: list = [],
+                         log_files_output_paths: list = [],
+                         in_excel_sheets: list = [],
+                         first_loop_state: bool = False,
+                         output_folder: str = output_folder,
+                         in_deny_list:list[str]=[],
+                         max_fuzzy_spelling_mistakes_num:int=0,
+                         pii_identification_method:str="Local",
+                         chosen_redact_comprehend_entities:List[str]=[],
+                         comprehend_query_number:int=0,
+                         aws_access_key_textbox:str='',
+                         aws_secret_key_textbox:str='',
+                         progress: Progress = Progress(track_tqdm=True)):
+    """
+    This function anonymises data files based on the provided parameters.
+
+    Parameters:
+    - file_paths (List[str]): A list of file paths to anonymise.
+    - in_text (str): The text to anonymise if file_paths is 'open_text'.
+    - anon_strat (str): The anonymisation strategy to use.
+    - chosen_cols (List[str]): A list of column names to anonymise.
+    - language (str): The language of the text to anonymise.
+    - chosen_redact_entities (List[str]): A list of entities to redact.
+    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
+    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
+    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
+    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
+    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
+    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
+    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
+    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
+    - in_deny_list (list[str], optional): A list of specific terms to redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
+    """
+
+    tic = time.perf_counter()
+    comprehend_client = ""
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
+    # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    #print("log_files_output_paths:",log_files_output_paths)
+
+    if isinstance(log_files_output_paths, str):
+        log_files_output_paths = []
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    if in_allow_list:
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []
+
+    anon_df = pd.DataFrame()
+
+    # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
+    if pii_identification_method == "AWS Comprehend":
+        print("Trying to connect to AWS Comprehend service")
+        if aws_access_key_textbox and aws_secret_key_textbox:
+            print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
+            comprehend_client = boto3.client('comprehend',
+                                             aws_access_key_id=aws_access_key_textbox,
+                                             aws_secret_access_key=aws_secret_key_textbox)
+        elif RUN_AWS_FUNCTIONS == "1":
+            print("Connecting to Comprehend via existing SSO connection")
+            comprehend_client = boto3.client('comprehend')
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            print("Getting Comprehend credentials from environment variables")
+            comprehend_client = boto3.client('comprehend',
+                                             aws_access_key_id=AWS_ACCESS_KEY,
+                                             aws_secret_access_key=AWS_SECRET_KEY)
+        else:
+            comprehend_client = ""
+            out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+            print(out_message)
+
+    # Check if files and text exist
+    if not file_paths:
+        if in_text:
+            file_paths=['open_text']
+        else:
+            out_message = "Please enter text or a file to redact."
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached") #, returning files:", str(latest_file_completed))
+        # Set to a very high number so as not to mess with subsequent file processing by the user
+        latest_file_completed = 99
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
+
+        if anon_file=='open_text':
+            anon_df = pd.DataFrame(data={'text':[in_text]})
+            chosen_cols=['text']
+            sheet_name = ""
+            file_type = ""
+            out_file_part = anon_file
+
+            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+        else:
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
+
+            out_file_part = get_file_name_without_type(anon_file.name)
+
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
+
+                anon_xlsx = pd.ExcelFile(anon_file)
+
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
+
+                from openpyxl import Workbook
+
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
+
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_name_without_type(anon_file.name)
+
+                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
+
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
+
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
+
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
+
+    out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 
 def anon_wrapper_func(
     anon_file: str,
@@ -313,11 +414,16 @@ def anon_wrapper_func(
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
     in_deny_list: List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=0,
+    pii_identification_method:str="Local",
+    chosen_redact_comprehend_entities:List[str]=[],
+    comprehend_query_number:int=0,
+    comprehend_client:botocore.client.BaseClient="",
     output_folder: str = output_folder
     ):
     """
-    This function wraps the anonymization process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymization strategy, and exports the anonymized data to a file.
+    This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
     Input Variables:
     - anon_file: The path to the file containing the data to be anonymized.
@@ -335,6 +441,11 @@ def anon_wrapper_func(
     - anon_xlsx_export_file_name: The name of the anonymized Excel file.
     - log_files_output_paths: A list of paths where the log files will be saved.
     - in_deny_list: List of specific terms to remove from the data.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
     - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
     """
     def check_lists(list1, list2):
@@ -357,6 +468,9 @@ def anon_wrapper_func(
             common_strings.append(string)
         return common_strings
 
+    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        raise Exception("Connection to AWS Comprehend service not found, please check connection details.")
+
     # Check for chosen col, skip file if not found
     all_cols_original_order = list(anon_df.columns)
 
@@ -369,13 +483,13 @@ def anon_wrapper_func(
     chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
     # Split dataframe to keep only selected columns
-    print("Remaining columns to redact:", chosen_cols_in_anon_df)
+    #print("Remaining columns to redact:", chosen_cols_in_anon_df)
 
     anon_df_part = anon_df[chosen_cols_in_anon_df]
    anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
 
     # Anonymise the selected columns
-    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
+    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
 
     # Rejoin the dataframe together
     anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -384,8 +498,9 @@ def anon_wrapper_func(
     # Export file
 
     # Rename anonymisation strategy for file path naming
-    if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
+    if anon_strat == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
     elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+    elif anon_strat == "redact completely": anon_strat_txt = "redact_remove"
     else: anon_strat_txt = anon_strat
 
     # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
@@ -422,151 +537,196 @@ def anon_wrapper_func(
 
     return out_file_paths, out_message, key_string, log_files_output_paths
 
-def anonymise_data_files(file_paths: List[str], in_text: str, anon_strat: str, chosen_cols: List[str], language: str, chosen_redact_entities: List[str], in_allow_list: List[str] = None, latest_file_completed: int = 0, out_message: list = [], out_file_paths: list = [], log_files_output_paths: list = [], in_excel_sheets: list = [], first_loop_state: bool = False, output_folder: str = output_folder, in_deny_list:list[str]=[], progress: Progress = Progress(track_tqdm=True)):
-    """
-    This function anonymises data files based on the provided parameters.
-
-    Parameters:
-    - file_paths (List[str]): A list of file paths to anonymise.
-    - in_text (str): The text to anonymise if file_paths is 'open_text'.
-    - anon_strat (str): The anonymisation strategy to use.
-    - chosen_cols (List[str]): A list of column names to anonymise.
-    - language (str): The language of the text to anonymise.
-    - chosen_redact_entities (List[str]): A list of entities to redact.
-    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
-    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
-    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
-    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
-    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
-    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
-    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
-    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
-    - in_deny_list (list[str], optional): A list of specific terms to redact.
-    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
-    """
-
-    tic = time.perf_counter()
-
-    # If this is the first time around, set variables to 0/blank
-    if first_loop_state==True:
-        latest_file_completed = 0
-        out_message = []
-        out_file_paths = []
-
-    # Load file
-    # If out message or out_file_paths are blank, change to a list so it can be appended to
-    if isinstance(out_message, str):
-        out_message = [out_message]
-
-    #print("log_files_output_paths:",log_files_output_paths)
-
-    if isinstance(log_files_output_paths, str):
-        log_files_output_paths = []
-
-    if not out_file_paths:
-        out_file_paths = []
-
-    if in_allow_list:
-        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
-    else:
-        in_allow_list_flat = []
-
-    anon_df = pd.DataFrame()
-    #out_file_paths = []
-
-    # Check if files and text exist
-    if not file_paths:
-        if in_text:
-            file_paths=['open_text']
-        else:
-            out_message = "Please enter text or a file to redact."
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed >= len(file_paths):
-        print("Last file reached, returning files:", str(latest_file_completed))
-        # Set to a very high number so as not to mess with subsequent file processing by the user
-        latest_file_completed = 99
-        final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    file_path_loop = [file_paths[int(latest_file_completed)]]
-
-    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
-
-        if anon_file=='open_text':
-            anon_df = pd.DataFrame(data={'text':[in_text]})
-            chosen_cols=['text']
-            sheet_name = ""
-            file_type = ""
-            out_file_part = anon_file
-
-            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
-        else:
-            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
-            file_type = detect_file_type(anon_file)
-            print("File type is:", file_type)
-
-            out_file_part = get_file_name_without_type(anon_file.name)
-
-            if file_type == 'xlsx':
-                print("Running through all xlsx sheets")
-                #anon_xlsx = pd.ExcelFile(anon_file)
-                if not in_excel_sheets:
-                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
-                    continue
-
-                anon_xlsx = pd.ExcelFile(anon_file)
-
-                # Create xlsx file:
-                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
-
-                from openpyxl import Workbook
-
-                wb = Workbook()
-                wb.save(anon_xlsx_export_file_name)
-
-                # Iterate through the sheet names
-                for sheet_name in in_excel_sheets:
-                    # Read each sheet into a DataFrame
-                    if sheet_name not in anon_xlsx.sheet_names:
-                        continue
-
-                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
-
-                    # Process the DataFrame (e.g., print its contents)
-                    print(f"Sheet Name: {sheet_name}")
-                    print(anon_df.head()) # Print the first few rows
-
-                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, output_folder=output_folder)
-
-            else:
-                sheet_name = ""
-                anon_df = read_file(anon_file)
-                out_file_part = get_file_name_without_type(anon_file.name)
-
-                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
-
-        # Increase latest file completed count unless we are at the last file
-        if latest_file_completed != len(file_paths):
-            print("Completed file number:", str(latest_file_completed))
-            latest_file_completed += 1
-
-    toc = time.perf_counter()
-    out_time = f"in {toc - tic:0.1f} seconds."
-    print(out_time)
-
-    if anon_strat == "encrypt":
-        out_message.append(". Your decryption key is " + key_string + ".")
-
-    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
-
-    out_message_out = '\n'.join(out_message)
-    out_message_out = out_message_out + " " + out_time
-
-    out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
-
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
+    '''
+    Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
+    '''
+
+    print("Identifying personal information")
+    analyse_tic = time.perf_counter()
+
+    # Initialize analyzer_results as an empty dictionary to store results by column
+    results_by_column = {}
+    key_string = ""
+
+    # DataFrame to dict
+    df_dict = df.to_dict(orient="list")
+
+    if in_allow_list:
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []
+
+    if isinstance(in_deny_list, pd.DataFrame):
+        if not in_deny_list.empty:
+            in_deny_list = in_deny_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            in_deny_list = [] # or some default value
+
+    # Sort the strings in order from the longest string to the shortest
+    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
+
+    if in_deny_list:
+        nlp_analyser.registry.remove_recognizer("CUSTOM")
+        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+        nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=True)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+    #analyzer = nlp_analyser #AnalyzerEngine()
+    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
+
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
+
+    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+
+    analyzer_results = []
+
+    if pii_identification_method == "Local":
+
+        # Use custom analyzer to be able to track progress with Gradio
+        custom_results = analyze_dict(batch_analyzer,
+                                      df_dict,
+                                      language=language,
+                                      entities=chosen_redact_entities,
+                                      score_threshold=score_threshold,
+                                      return_decision_process=True,
+                                      allow_list=in_allow_list_flat)
+
+        # Initialize results_by_column with custom entity results
+        for result in custom_results:
+            results_by_column[result.key] = result
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    # AWS Comprehend calls
+    elif pii_identification_method == "AWS Comprehend" and comprehend_client:
+
+        # Only run Local anonymisation for entities that are not covered by AWS Comprehend
+        if custom_entities:
+            custom_redact_entities = [
+                entity for entity in chosen_redact_comprehend_entities
+                if entity in custom_entities
+            ]
+            if custom_redact_entities:
+                # Get results from analyze_dict
+                custom_results = analyze_dict(batch_analyzer,
+                                              df_dict,
+                                              language=language,
+                                              entities=custom_redact_entities,
+                                              score_threshold=score_threshold,
+                                              return_decision_process=True,
+                                              allow_list=in_allow_list_flat)
+
+                # Initialize results_by_column with custom entity results
+                for result in custom_results:
+                    results_by_column[result.key] = result
+
+        max_retries = 3
+        retry_delay = 3
+
+        # Process each text column in the dictionary
+        for column_name, texts in progress.tqdm(df_dict.items(), desc="Querying AWS Comprehend service.", unit = "Columns"):
+            # Get or create DictAnalyzerResult for this column
+            if column_name in results_by_column:
+                column_results = results_by_column[column_name]
+            else:
+                column_results = DictAnalyzerResult(
+                    recognizer_results=[[] for _ in texts],
+                    key=column_name,
+                    value=texts
+                )
+
+            # Process each text in the column
+            for text_idx, text in progress.tqdm(enumerate(texts), desc="Querying AWS Comprehend service.", unit = "Row"):
+
+                for attempt in range(max_retries):
+                    try:
+                        response = comprehend_client.detect_pii_entities(
+                            Text=str(text),
+                            LanguageCode=language
+                        )
+
+                        comprehend_query_number += 1
+
+                        # Add all entities from this text to the column's recognizer_results
+                        for entity in response["Entities"]:
+                            if entity.get("Type") not in chosen_redact_comprehend_entities:
+                                continue
+
+                            recognizer_result = RecognizerResult(
+                                entity_type=entity["Type"],
+                                start=entity["BeginOffset"],
+                                end=entity["EndOffset"],
+                                score=entity["Score"]
+                            )
+                            column_results.recognizer_results[text_idx].append(recognizer_result)
+
+                        break # Success, exit retry loop
+
+                    except Exception as e:
+                        if attempt == max_retries - 1:
+                            print(f"AWS Comprehend calls failed for text: {text[:100]}... due to", e)
+                            raise
+                        time.sleep(retry_delay)
+
+            # Store or update the column results
+            results_by_column[column_name] = column_results
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
+        raise Exception("Unable to redact, Comprehend connection details not found.")
+
+    else:
+        print("Unable to redact.")
+
+    # Usage in the main function:
+    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
+
+    analyse_toc = time.perf_counter()
+    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+    print(analyse_time_out)
+
+    # Create faker function (note that it has to receive a value)
+    fake = Faker("en_UK")
+
+    def fake_first_name(x):
+        return fake.first_name()
+
+    # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
+    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
+    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
+    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
+    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
+    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
+
+    if anon_strat == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
+    if anon_strat == "redact completely": chosen_mask_config = redact_config
+    if anon_strat == "hash": chosen_mask_config = hash_config
+    if anon_strat == "mask": chosen_mask_config = mask_config
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
+    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
+
+    # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
+    #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
+
+    combined_config = {**chosen_mask_config} #, **keep_date_config}
+
+    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
+
+    scrubbed_df = pd.DataFrame(anonymizer_results)
+
+    return scrubbed_df, key_string, decision_process_output_str
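For orientation, a minimal usage sketch of the relocated anonymise_script (the column name and sample text are illustrative; 'Local' keeps everything offline, so no Comprehend client is needed):

    import pandas as pd
    from tools.data_anonymise import anonymise_script

    df = pd.DataFrame({"notes": ["Contact John Smith on 07700 900123"]})

    scrubbed_df, key_string, decision_log = anonymise_script(
        df,
        anon_strat="replace with 'REDACTED'",
        language="en",
        chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
        pii_identification_method="Local",
    )
    print(scrubbed_df)  # PII in the 'notes' column replaced with 'REDACTED'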
tools/file_redaction.py CHANGED
@@ -111,7 +111,7 @@ def choose_and_run_redactor(file_paths:List[str],
     - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
     - language (str): The language of the text in the files.
     - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
-    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
     - in_redact_method (str): The method to use for redaction.
     - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
     - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.