seanpedrickcase committed
Commit ff290e1 · Parent: dacc782

Integrated AWS Comprehend and fuzzy matching functions with tabular data redaction.
DocRedactApp_0.4.0.spec ADDED
@@ -0,0 +1,66 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_all
+
+datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
+binaries = []
+hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_image_annotation')
+tmp_ret = collect_all('gradio_image_annotation')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('safehttpx')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_analyzer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_anonymizer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_image_redactor')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+
+
+a = Analysis(
+    ['app.py'],
+    pathex=[],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=['build_deps'],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    noarchive=False,
+    optimize=0,
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+pyz = PYZ(a.pure)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    [],
+    exclude_binaries=True,
+    name='DocRedactApp_0.4.0',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
+coll = COLLECT(
+    exe,
+    a.binaries,
+    a.datas,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    name='DocRedactApp_0.4.0',
+)
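Note on the spec above: PyInstaller's collect_all() returns a (datas, binaries, hiddenimports) tuple, which is why each tmp_ret is unpacked by index. A minimal sketch of the same collection step with tuple unpacking (the include_package helper is illustrative, not part of the commit):

    from PyInstaller.utils.hooks import collect_all

    def include_package(name, datas, binaries, hiddenimports):
        # collect_all returns (datas, binaries, hiddenimports) for the named package
        pkg_datas, pkg_binaries, pkg_hidden = collect_all(name)
        datas += pkg_datas
        binaries += pkg_binaries
        hiddenimports += pkg_hidden

    for pkg in ['gradio_image_annotation', 'safehttpx', 'presidio_analyzer',
                'presidio_anonymizer', 'presidio_image_redactor']:
        include_package(pkg, datas, binaries, hiddenimports)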
README.md CHANGED
@@ -317,8 +317,8 @@ The Redaction Settings tab now has boxes for entering the AWS access key and sec
 ### Picking up AWS access keys through an .env file
 The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
 
- AWS_ACCESS_KEY=<your-access-key>
- AWS_SECRET_KEY=<your-secret-key>
+ AWS_ACCESS_KEY= your-access-key
+ AWS_SECRET_KEY= your-secret-key
 
 The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
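For context, a minimal sketch of how an app can pick up these keys, assuming python-dotenv is available (the variable names come from the README; the client creation mirrors the boto3 calls added elsewhere in this commit):

    import os
    import boto3
    from dotenv import load_dotenv

    # Default .env location, overridable via the AWS_CONFIG_PATH environment variable
    load_dotenv(os.environ.get("AWS_CONFIG_PATH", "config/aws_config.env"))

    comprehend_client = boto3.client(
        "comprehend",
        aws_access_key_id=os.environ["AWS_ACCESS_KEY"],
        aws_secret_access_key=os.environ["AWS_SECRET_KEY"],
    )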
app.py CHANGED
@@ -282,6 +282,8 @@ with app:
     in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
     in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
+
+    pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
 
     tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
 
@@ -347,7 +349,7 @@
     aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
     with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
-        anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
+        anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with 'REDACTED'")
 
     log_files_output = gr.File(label="Log file output", interactive=False)
 
@@ -461,10 +463,10 @@
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###
@@ -480,15 +482,12 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
-
     # Merge multiple review csv files together
     merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
-
     #
     all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
 
-
     ###
     # APP LOAD AND LOGGING
     ###
@@ -567,7 +566,6 @@ if __name__ == "__main__":
         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
         current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
 
-
     # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
     # with gr.Tab(label="Advanced options"):
     #     with gr.Accordion(label = "AWS data access", open = True):
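The pair of wired events above implements the app's per-file loop: the button click anonymises the first file, and every change to the files-done counter re-invokes anonymise_data_files for the next file until all inputs are processed. A minimal sketch of the pattern with illustrative names (process_one is not from the app):

    import gradio as gr

    def process_one(files, done_count):
        # Handle one file per call; returning an incremented counter re-fires the change event below
        if not files or done_count >= len(files):
            return "All files processed", done_count
        return f"Processed {files[int(done_count)]}", done_count + 1

    with gr.Blocks() as demo:
        files_in = gr.File(file_count="multiple")
        status = gr.Textbox()
        files_done = gr.Number(value=0)

        gr.Button("Go").click(fn=process_one, inputs=[files_in, files_done], outputs=[status, files_done])
        # The counter change keeps the loop going until process_one stops incrementing it
        files_done.change(fn=process_one, inputs=[files_in, files_done], outputs=[status, files_done])

    demo.launch()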
how_to_create_exe_dist.txt CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.3.0 app.py
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -32,12 +32,12 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.3.0.spec
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
 
-10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
+10. Go to dist/APP-NAME/_internal/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
 
 def create_or_modify_pyi(
     component_class: type, class_name: str, events: list[str | EventListener]
tools/custom_image_analyser_engine.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 import time
 import cv2
 import copy
+import botocore
 from copy import deepcopy
 from pdfminer.layout import LTChar
 import PIL
@@ -399,12 +400,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast
 
-def bounding_boxes_overlap(box1, box2):
+def bounding_boxes_overlap(box1:List, box2:List):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
 
-def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
+def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
     for entity in page_analyser_result:
         entity_start = entity.start
         entity_end = entity.end
@@ -442,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
 
     return all_text_line_results
 
-def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not response or "Entities" not in response:
         return all_text_line_results
 
@@ -489,7 +490,7 @@ def map_back_comprehend_entity_results(response, current_batch_mapping, allow_li
 
     return all_text_line_results
 
-def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple], comprehend_client:botocore.client.BaseClient, language:str, allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not current_batch:
         return all_text_line_results
 
@@ -913,7 +914,8 @@
         ocr_results_with_children: Dict[str, Dict],
         chosen_redact_comprehend_entities: List[str],
         pii_identification_method: str = "Local",
-        comprehend_client = "",
+        comprehend_client = "",
+        custom_entities:List[str]=custom_entities,
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
tools/data_anonymise.py CHANGED
@@ -2,6 +2,8 @@ import re
 import secrets
 import base64
 import time
+import boto3
+import botocore
 import pandas as pd
 
 from faker import Faker
@@ -11,9 +13,11 @@ from typing import List, Dict, Any
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
+from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 
 from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
+from tools.custom_image_analyser_engine import do_aws_comprehend_call
 
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
@@ -202,101 +206,198 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], progress=Progress(track_tqdm=False)):
-
-    print("Identifying personal information")
-    analyse_tic = time.perf_counter()
-
-    key_string = ""
-
-    # DataFrame to dict
-    df_dict = df.to_dict(orient="list")
-
-    if in_allow_list:
-        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
-    else:
-        in_allow_list_flat = []
-
-    if isinstance(in_deny_list, pd.DataFrame):
-        if not in_deny_list.empty:
-            in_deny_list = in_deny_list.iloc[:, 0].tolist()
-        else:
-            # Handle the case where the DataFrame is empty
-            in_deny_list = [] # or some default value
-
-    # Sort the strings in order from the longest string to the shortest
-    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
-
-    if in_deny_list:
-        nlp_analyser.registry.remove_recognizer("CUSTOM")
-        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
-        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
-
-    #analyzer = nlp_analyser #AnalyzerEngine()
-    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
-
-    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
-
-    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
-
-    #print("Allow list:", in_allow_list)
-    #print("Input data keys:", df_dict.keys())
-
-    # Use custom analyzer to be able to track progress with Gradio
-    analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
-                                    entities=chosen_redact_entities,
-                                    score_threshold=score_threshold,
-                                    return_decision_process=True,
-                                    allow_list=in_allow_list_flat)
-
-    analyzer_results = list(analyzer_results)
-
-    # Usage in the main function:
-    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
-
-    analyse_toc = time.perf_counter()
-    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
-    print(analyse_time_out)
-
-    # Create faker function (note that it has to receive a value)
-    fake = Faker("en_UK")
-
-    def fake_first_name(x):
-        return fake.first_name()
-
-    # Set up the anonymization configuration WITHOUT DATE_TIME
-    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
-    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
-    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
-    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
-    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
-    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
-    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
-
-    if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
-    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
-    if anon_strat == "redact": chosen_mask_config = redact_config
-    if anon_strat == "hash": chosen_mask_config = hash_config
-    if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
-        chosen_mask_config = people_encrypt_config
-        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
-        key = secrets.token_bytes(16) # 128 bits = 16 bytes
-        key_string = base64.b64encode(key).decode('utf-8')
-    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
-
-    # I think in general people will want to keep date / times
-    keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
-
-    combined_config = {**chosen_mask_config, **keep_date_config}
-    combined_config
-
-    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
-
-    scrubbed_df = pd.DataFrame(anonymizer_results)
-
-    return scrubbed_df, key_string, decision_process_output_str
-
+def anonymise_data_files(file_paths: List[str],
+                         in_text: str,
+                         anon_strat: str,
+                         chosen_cols: List[str],
+                         language: str,
+                         chosen_redact_entities: List[str],
+                         in_allow_list: List[str] = None,
+                         latest_file_completed: int = 0,
+                         out_message: list = [],
+                         out_file_paths: list = [],
+                         log_files_output_paths: list = [],
+                         in_excel_sheets: list = [],
+                         first_loop_state: bool = False,
+                         output_folder: str = output_folder,
+                         in_deny_list:list[str]=[],
+                         max_fuzzy_spelling_mistakes_num:int=0,
+                         pii_identification_method:str="Local",
+                         chosen_redact_comprehend_entities:List[str]=[],
+                         comprehend_query_number:int=0,
+                         aws_access_key_textbox:str='',
+                         aws_secret_key_textbox:str='',
+                         progress: Progress = Progress(track_tqdm=True)):
+    """
+    This function anonymises data files based on the provided parameters.
+
+    Parameters:
+    - file_paths (List[str]): A list of file paths to anonymise.
+    - in_text (str): The text to anonymise if file_paths is 'open_text'.
+    - anon_strat (str): The anonymisation strategy to use.
+    - chosen_cols (List[str]): A list of column names to anonymise.
+    - language (str): The language of the text to anonymise.
+    - chosen_redact_entities (List[str]): A list of entities to redact.
+    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
+    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
+    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
+    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
+    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
+    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
+    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
+    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
+    - in_deny_list (list[str], optional): A list of specific terms to redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
+    """
+
+    tic = time.perf_counter()
+    comprehend_client = ""
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
+    # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    #print("log_files_output_paths:",log_files_output_paths)
+
+    if isinstance(log_files_output_paths, str):
+        log_files_output_paths = []
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    if in_allow_list:
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []
+
+    anon_df = pd.DataFrame()
+
+    # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
+    if pii_identification_method == "AWS Comprehend":
+        print("Trying to connect to AWS Comprehend service")
+        if aws_access_key_textbox and aws_secret_key_textbox:
+            print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
+            comprehend_client = boto3.client('comprehend',
+                                             aws_access_key_id=aws_access_key_textbox,
+                                             aws_secret_access_key=aws_secret_key_textbox)
+        elif RUN_AWS_FUNCTIONS == "1":
+            print("Connecting to Comprehend via existing SSO connection")
+            comprehend_client = boto3.client('comprehend')
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            print("Getting Comprehend credentials from environment variables")
+            comprehend_client = boto3.client('comprehend',
+                                             aws_access_key_id=AWS_ACCESS_KEY,
+                                             aws_secret_access_key=AWS_SECRET_KEY)
+        else:
+            comprehend_client = ""
+            out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+            print(out_message)
+
+    # Check if files and text exist
+    if not file_paths:
+        if in_text:
+            file_paths=['open_text']
+        else:
+            out_message = "Please enter text or a file to redact."
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached") #, returning files:", str(latest_file_completed))
+        # Set to a very high number so as not to mess with subsequent file processing by the user
+        latest_file_completed = 99
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
+
+        if anon_file=='open_text':
+            anon_df = pd.DataFrame(data={'text':[in_text]})
+            chosen_cols=['text']
+            sheet_name = ""
+            file_type = ""
+            out_file_part = anon_file
+
+            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+        else:
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
+
+            out_file_part = get_file_name_without_type(anon_file.name)
+
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
+
+                anon_xlsx = pd.ExcelFile(anon_file)
+
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
+
+                from openpyxl import Workbook
+
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
+
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_name_without_type(anon_file.name)
+
+                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
+
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
+
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
+
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
+
+    out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 
 def anon_wrapper_func(
     anon_file: str,
@@ -313,11 +414,16 @@ def anon_wrapper_func(
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
     in_deny_list: List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=0,
+    pii_identification_method:str="Local",
+    chosen_redact_comprehend_entities:List[str]=[],
+    comprehend_query_number:int=0,
+    comprehend_client:botocore.client.BaseClient="",
     output_folder: str = output_folder
     ):
     """
-    This function wraps the anonymization process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymization strategy, and exports the anonymized data to a file.
+    This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
     Input Variables:
     - anon_file: The path to the file containing the data to be anonymized.
@@ -335,6 +441,11 @@ def anon_wrapper_func(
     - anon_xlsx_export_file_name: The name of the anonymized Excel file.
     - log_files_output_paths: A list of paths where the log files will be saved.
     - in_deny_list: List of specific terms to remove from the data.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
     - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
     """
     def check_lists(list1, list2):
@@ -357,6 +468,9 @@ def anon_wrapper_func(
             common_strings.append(string)
         return common_strings
 
+    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        raise Exception("Connection to AWS Comprehend service not found, please check connection details.")
+
     # Check for chosen col, skip file if not found
     all_cols_original_order = list(anon_df.columns)
 
@@ -369,13 +483,13 @@ def anon_wrapper_func(
     chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
     # Split dataframe to keep only selected columns
-    print("Remaining columns to redact:", chosen_cols_in_anon_df)
+    #print("Remaining columns to redact:", chosen_cols_in_anon_df)
 
     anon_df_part = anon_df[chosen_cols_in_anon_df]
    anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
 
     # Anonymise the selected columns
-    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
+    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
 
     # Rejoin the dataframe together
     anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -384,8 +498,9 @@ def anon_wrapper_func(
     # Export file
 
     # Rename anonymisation strategy for file path naming
-    if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
+    if anon_strat == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
     elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+    elif anon_strat == "redact completely": anon_strat_txt = "redact_remove"
     else: anon_strat_txt = anon_strat
 
     # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
@@ -422,151 +537,196 @@ def anon_wrapper_func(
 
     return out_file_paths, out_message, key_string, log_files_output_paths
 
-def anonymise_data_files(file_paths: List[str], in_text: str, anon_strat: str, chosen_cols: List[str], language: str, chosen_redact_entities: List[str], in_allow_list: List[str] = None, latest_file_completed: int = 0, out_message: list = [], out_file_paths: list = [], log_files_output_paths: list = [], in_excel_sheets: list = [], first_loop_state: bool = False, output_folder: str = output_folder, in_deny_list:list[str]=[], progress: Progress = Progress(track_tqdm=True)):
-    """
-    This function anonymises data files based on the provided parameters.
-
-    Parameters:
-    - file_paths (List[str]): A list of file paths to anonymise.
-    - in_text (str): The text to anonymise if file_paths is 'open_text'.
-    - anon_strat (str): The anonymisation strategy to use.
-    - chosen_cols (List[str]): A list of column names to anonymise.
-    - language (str): The language of the text to anonymise.
-    - chosen_redact_entities (List[str]): A list of entities to redact.
-    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
-    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
-    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
-    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
-    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
-    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
-    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
-    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
-    - in_deny_list (list[str], optional): A list of specific terms to redact.
-    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
-    """
-
-    tic = time.perf_counter()
-
-    # If this is the first time around, set variables to 0/blank
-    if first_loop_state==True:
-        latest_file_completed = 0
-        out_message = []
-        out_file_paths = []
-
-    # Load file
-    # If out message or out_file_paths are blank, change to a list so it can be appended to
-    if isinstance(out_message, str):
-        out_message = [out_message]
-
-    #print("log_files_output_paths:",log_files_output_paths)
-
-    if isinstance(log_files_output_paths, str):
-        log_files_output_paths = []
-
-    if not out_file_paths:
-        out_file_paths = []
-
-    if in_allow_list:
-        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
-    else:
-        in_allow_list_flat = []
-
-    anon_df = pd.DataFrame()
-    #out_file_paths = []
-
-    # Check if files and text exist
-    if not file_paths:
-        if in_text:
-            file_paths=['open_text']
-        else:
-            out_message = "Please enter text or a file to redact."
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed >= len(file_paths):
-        print("Last file reached, returning files:", str(latest_file_completed))
-        # Set to a very high number so as not to mess with subsequent file processing by the user
-        latest_file_completed = 99
-        final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    file_path_loop = [file_paths[int(latest_file_completed)]]
-
-    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
-
-        if anon_file=='open_text':
-            anon_df = pd.DataFrame(data={'text':[in_text]})
-            chosen_cols=['text']
-            sheet_name = ""
-            file_type = ""
-            out_file_part = anon_file
-
-            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
-        else:
-            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
-            file_type = detect_file_type(anon_file)
-            print("File type is:", file_type)
-
-            out_file_part = get_file_name_without_type(anon_file.name)
-
-            if file_type == 'xlsx':
-                print("Running through all xlsx sheets")
-                #anon_xlsx = pd.ExcelFile(anon_file)
-                if not in_excel_sheets:
-                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
-                    continue
-
-                anon_xlsx = pd.ExcelFile(anon_file)
-
-                # Create xlsx file:
-                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
-
-                from openpyxl import Workbook
-
-                wb = Workbook()
-                wb.save(anon_xlsx_export_file_name)
-
-                # Iterate through the sheet names
-                for sheet_name in in_excel_sheets:
-                    # Read each sheet into a DataFrame
-                    if sheet_name not in anon_xlsx.sheet_names:
-                        continue
-
-                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
-
-                    # Process the DataFrame (e.g., print its contents)
-                    print(f"Sheet Name: {sheet_name}")
-                    print(anon_df.head()) # Print the first few rows
-
-                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, output_folder=output_folder)
-
-            else:
-                sheet_name = ""
-                anon_df = read_file(anon_file)
-                out_file_part = get_file_name_without_type(anon_file.name)
-
-                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, output_folder=output_folder)
-
-        # Increase latest file completed count unless we are at the last file
-        if latest_file_completed != len(file_paths):
-            print("Completed file number:", str(latest_file_completed))
-            latest_file_completed += 1
-
-    toc = time.perf_counter()
-    out_time = f"in {toc - tic:0.1f} seconds."
-    print(out_time)
-
-    if anon_strat == "encrypt":
-        out_message.append(". Your decryption key is " + key_string + ".")
-
-    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
-
-    out_message_out = '\n'.join(out_message)
-    out_message_out = out_message_out + " " + out_time
-
-    out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
-
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
+    '''
+    Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
+    '''
+
+    print("Identifying personal information")
+    analyse_tic = time.perf_counter()
+
+    # Initialize analyzer_results as an empty dictionary to store results by column
+    results_by_column = {}
+    key_string = ""
+
+    # DataFrame to dict
+    df_dict = df.to_dict(orient="list")
+
+    if in_allow_list:
+        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
+    else:
+        in_allow_list_flat = []
+
+    if isinstance(in_deny_list, pd.DataFrame):
+        if not in_deny_list.empty:
+            in_deny_list = in_deny_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            in_deny_list = [] # or some default value
+
+    # Sort the strings in order from the longest string to the shortest
+    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
+
+    if in_deny_list:
+        nlp_analyser.registry.remove_recognizer("CUSTOM")
+        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+        nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=True)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+    #analyzer = nlp_analyser #AnalyzerEngine()
+    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
+
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
+
+    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+
+    analyzer_results = []
+
+    if pii_identification_method == "Local":
+
+        # Use custom analyzer to be able to track progress with Gradio
+        custom_results = analyze_dict(batch_analyzer,
+                                      df_dict,
+                                      language=language,
+                                      entities=chosen_redact_entities,
+                                      score_threshold=score_threshold,
+                                      return_decision_process=True,
+                                      allow_list=in_allow_list_flat)
+
+        # Initialize results_by_column with custom entity results
+        for result in custom_results:
+            results_by_column[result.key] = result
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    # AWS Comprehend calls
+    elif pii_identification_method == "AWS Comprehend" and comprehend_client:
+
+        # Only run Local anonymisation for entities that are not covered by AWS Comprehend
+        if custom_entities:
+            custom_redact_entities = [
+                entity for entity in chosen_redact_comprehend_entities
+                if entity in custom_entities
+            ]
+            if custom_redact_entities:
+                # Get results from analyze_dict
+                custom_results = analyze_dict(batch_analyzer,
+                                              df_dict,
+                                              language=language,
+                                              entities=custom_redact_entities,
+                                              score_threshold=score_threshold,
+                                              return_decision_process=True,
+                                              allow_list=in_allow_list_flat)
+
+                # Initialize results_by_column with custom entity results
+                for result in custom_results:
+                    results_by_column[result.key] = result
+
+        max_retries = 3
+        retry_delay = 3
+
+        # Process each text column in the dictionary
+        for column_name, texts in progress.tqdm(df_dict.items(), desc="Querying AWS Comprehend service.", unit = "Columns"):
+            # Get or create DictAnalyzerResult for this column
+            if column_name in results_by_column:
+                column_results = results_by_column[column_name]
+            else:
+                column_results = DictAnalyzerResult(
+                    recognizer_results=[[] for _ in texts],
+                    key=column_name,
+                    value=texts
+                )
+
+            # Process each text in the column
+            for text_idx, text in progress.tqdm(enumerate(texts), desc="Querying AWS Comprehend service.", unit = "Row"):
+
+                for attempt in range(max_retries):
+                    try:
+                        response = comprehend_client.detect_pii_entities(
+                            Text=str(text),
+                            LanguageCode=language
+                        )
+
+                        comprehend_query_number += 1
+
+                        # Add all entities from this text to the column's recognizer_results
+                        for entity in response["Entities"]:
+                            if entity.get("Type") not in chosen_redact_comprehend_entities:
+                                continue
+
+                            recognizer_result = RecognizerResult(
+                                entity_type=entity["Type"],
+                                start=entity["BeginOffset"],
+                                end=entity["EndOffset"],
+                                score=entity["Score"]
+                            )
+                            column_results.recognizer_results[text_idx].append(recognizer_result)
+
+                        break # Success, exit retry loop
+
+                    except Exception as e:
+                        if attempt == max_retries - 1:
+                            print(f"AWS Comprehend calls failed for text: {text[:100]}... due to", e)
+                            raise
+                        time.sleep(retry_delay)
+
+            # Store or update the column results
+            results_by_column[column_name] = column_results
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
+        raise Exception("Unable to redact, Comprehend connection details not found.")
+
+    else:
+        print("Unable to redact.")
+
+    # Usage in the main function:
+    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
+
+    analyse_toc = time.perf_counter()
+    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+    print(analyse_time_out)
+
+    # Create faker function (note that it has to receive a value)
+    fake = Faker("en_UK")
+
+    def fake_first_name(x):
+        return fake.first_name()
+
+    # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
+    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
+    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
+    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
+    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
+    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
+
+    if anon_strat == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
+    if anon_strat == "redact completely": chosen_mask_config = redact_config
+    if anon_strat == "hash": chosen_mask_config = hash_config
+    if anon_strat == "mask": chosen_mask_config = mask_config
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
+    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
+
+    # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
+    #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
+
+    combined_config = {**chosen_mask_config} #, **keep_date_config}
+
+    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
+
+    scrubbed_df = pd.DataFrame(anonymizer_results)
+
+    return scrubbed_df, key_string, decision_process_output_str
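For orientation, a minimal usage sketch of the relocated anonymise_script (the column name and sample text are illustrative; 'Local' keeps everything offline, so no Comprehend client is needed):

    import pandas as pd
    from tools.data_anonymise import anonymise_script

    df = pd.DataFrame({"notes": ["Contact John Smith on 07700 900123"]})

    scrubbed_df, key_string, decision_log = anonymise_script(
        df,
        anon_strat="replace with 'REDACTED'",
        language="en",
        chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
        pii_identification_method="Local",
    )
    print(scrubbed_df)  # PII in the 'notes' column replaced with 'REDACTED'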
tools/file_redaction.py CHANGED
@@ -111,7 +111,7 @@ def choose_and_run_redactor(file_paths:List[str],
     - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
     - language (str): The language of the text in the files.
     - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
-    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
     - in_redact_method (str): The method to use for redaction.
     - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
     - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.