Commit ff290e1
Parent(s): dacc782
Integrated AWS Comprehend and fuzzy matching functions with tabular data redaction.
Files changed:
- DocRedactApp_0.4.0.spec +66 -0
- README.md +2 -2
- app.py +5 -7
- how_to_create_exe_dist.txt +3 -3
- tools/custom_image_analyser_engine.py +7 -5
- tools/data_anonymise.py +356 -196
- tools/file_redaction.py +1 -1
DocRedactApp_0.4.0.spec
ADDED
@@ -0,0 +1,66 @@
+# -*- mode: python ; coding: utf-8 -*-
+from PyInstaller.utils.hooks import collect_data_files
+from PyInstaller.utils.hooks import collect_all
+
+datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
+binaries = []
+hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
+datas += collect_data_files('gradio_client')
+datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_image_annotation')
+tmp_ret = collect_all('gradio_image_annotation')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('safehttpx')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_analyzer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_anonymizer')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('presidio_image_redactor')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+
+
+a = Analysis(
+    ['app.py'],
+    pathex=[],
+    binaries=binaries,
+    datas=datas,
+    hiddenimports=hiddenimports,
+    hookspath=['build_deps'],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    noarchive=False,
+    optimize=0,
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+pyz = PYZ(a.pure)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    [],
+    exclude_binaries=True,
+    name='DocRedactApp_0.4.0',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
+coll = COLLECT(
+    exe,
+    a.binaries,
+    a.datas,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    name='DocRedactApp_0.4.0',
+)
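Because the spec bundles tesseract/ and poppler/ as data folders, the frozen app has to resolve them under PyInstaller's extraction directory at runtime. A minimal sketch of that lookup, assuming the standard sys._MEIPASS convention (resource_path is an illustrative helper, not necessarily one the repo defines):

import os
import sys

def resource_path(relative: str) -> str:
    # When frozen, PyInstaller unpacks data files under sys._MEIPASS;
    # fall back to the source tree when running unfrozen.
    base = getattr(sys, "_MEIPASS", os.path.abspath("."))
    return os.path.join(base, relative)

tesseract_dir = resource_path("tesseract")              # mirrors the first datas entry
poppler_dir = resource_path("poppler/poppler-24.02.0")  # mirrors the second datas entry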
README.md
CHANGED
@@ -317,8 +317,8 @@ The Redaction Settings tab now has boxes for entering the AWS access key and sec
 ### Picking up AWS access keys through an .env file
 The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
 
-AWS_ACCESS_KEY
-AWS_SECRET_KEY
+AWS_ACCESS_KEY= your-access-key
+AWS_SECRET_KEY= your-secret-key
 
 The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
 
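For illustration, this is roughly how a two-line .env file like the one above can be picked up, assuming the python-dotenv package (the app's actual loader may differ):

import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

# Default location per the README; AWS_CONFIG_PATH overrides it
env_path = os.environ.get("AWS_CONFIG_PATH", "config/aws_config.env")
load_dotenv(env_path)

aws_access_key = os.environ.get("AWS_ACCESS_KEY", "")
aws_secret_key = os.environ.get("AWS_SECRET_KEY", "")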
app.py
CHANGED
@@ -282,6 +282,8 @@ with app:
 in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
 in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
+
+pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
 
 tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
 
@@ -347,7 +349,7 @@
 aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
 with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
-    anon_strat = gr.Radio(choices=["replace with
+    anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with 'REDACTED'")
 
 log_files_output = gr.File(label="Log file output", interactive=False)
 
@@ -461,10 +463,10 @@
 in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
 then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
+tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
 # If the output file count text box changes, keep going with redacting each data file until done
-text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
+text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
 then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
 ###
@@ -480,15 +482,12 @@
 in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
 in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
-
 # Merge multiple review csv files together
 merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
-
 #
 all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
 
-
 ###
 # APP LOAD AND LOGGING
 ###
@@ -567,7 +566,6 @@ if __name__ == "__main__":
 log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
 current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
 
-
 # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
 # with gr.Tab(label="Advanced options"):
 #     with gr.Accordion(label = "AWS data access", open = True):
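Both redaction triggers above rely on Gradio's event chaining, where .click(...) returns a dependency whose .then(...) runs once the first function finishes. A minimal, self-contained sketch of the same pattern (toy handler functions, not the app's real ones):

import gradio as gr

def redact(text):
    # Stand-in for anonymise_data_files
    return text.replace("Alice", "REDACTED")

def reveal_status(result):
    # Stand-in for reveal_feedback_buttons
    return f"Done: {result}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    status = gr.Textbox(label="Status")
    btn = gr.Button("Redact")
    # Run redact first, then update the status box once it completes
    btn.click(fn=redact, inputs=inp, outputs=out).\
        then(fn=reveal_status, inputs=out, outputs=status)

if __name__ == "__main__":
    demo.launch()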
how_to_create_exe_dist.txt
CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -32,12 +32,12 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
 
-10. go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
+10. go to dist/APP-NAME/internal/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
 
 def create_or_modify_pyi(
     component_class: type, class_name: str, events: list[str | EventListener]
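Step 10 cuts off before showing the full replacement body of create_or_modify_pyi. The PyInstaller issue linked in step 9 suggests guarding the function so it becomes a no-op when component source files are unavailable inside a frozen build; a hypothetical sketch of such a guard (an assumption, not this commit's actual patch; Path and inspect are already imported in gradio's component_meta module):

# Hypothetical guard (assumption, not the repo's exact patch): skip .pyi
# regeneration when the component's source file can't be read, as happens
# inside a frozen PyInstaller build.
def create_or_modify_pyi(
    component_class: type, class_name: str, events: list[str | EventListener]
):
    try:
        source_file = Path(inspect.getfile(component_class))
        source_code = source_file.read_text(encoding="utf-8")
    except (OSError, TypeError):
        return  # no source available in the frozen app; nothing to modify
    ...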
tools/custom_image_analyser_engine.py
CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 import time
 import cv2
 import copy
+import botocore
 from copy import deepcopy
 from pdfminer.layout import LTChar
 import PIL
@@ -399,12 +400,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
     adjusted_contrast = contrast
     return adjusted_image, contrast, adjusted_contrast
 
-def bounding_boxes_overlap(box1, box2):
+def bounding_boxes_overlap(box1:List, box2:List):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
 
-def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
+def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
     for entity in page_analyser_result:
         entity_start = entity.start
         entity_end = entity.end
@@ -442,7 +443,7 @@
 
     return all_text_line_results
 
-def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not response or "Entities" not in response:
         return all_text_line_results
 
@@ -489,7 +490,7 @@
 
     return all_text_line_results
 
-def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple], comprehend_client:botocore.client.BaseClient, language:str, allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
     if not current_batch:
         return all_text_line_results
 
@@ -913,7 +914,8 @@
 ocr_results_with_children: Dict[str, Dict],
 chosen_redact_comprehend_entities: List[str],
 pii_identification_method: str = "Local",
-comprehend_client = "",
+comprehend_client = "",
+custom_entities:List[str]=custom_entities,
 **text_analyzer_kwargs
 ) -> List[CustomImageRecognizerResult]:
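For reference, the overlap predicate that gains type hints above treats boxes as (x0, y0, x1, y1) with x1 > x0 and y1 > y0. A quick, self-contained illustration (the function body is copied from the diff; the sample boxes are made up):

def bounding_boxes_overlap(box1, box2):
    """Check if two bounding boxes overlap."""
    return (box1[0] < box2[2] and box2[0] < box1[2] and
            box1[1] < box2[3] and box2[1] < box1[3])

assert bounding_boxes_overlap([0, 0, 10, 10], [5, 5, 15, 15])       # overlapping boxes
assert not bounding_boxes_overlap([0, 0, 10, 10], [20, 0, 30, 10])  # disjoint boxes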
tools/data_anonymise.py
CHANGED
@@ -2,6 +2,8 @@ import re
 import secrets
 import base64
 import time
+import boto3
+import botocore
 import pandas as pd
 
 from faker import Faker
@@ -11,9 +13,11 @@ from typing import List, Dict, Any
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
+from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 
 from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
+from tools.custom_image_analyser_engine import do_aws_comprehend_call
 
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
@@ -202,101 +206,198 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def
-#
+def anonymise_data_files(file_paths: List[str],
+                         in_text: str,
+                         anon_strat: str,
+                         chosen_cols: List[str],
+                         language: str,
+                         chosen_redact_entities: List[str],
+                         in_allow_list: List[str] = None,
+                         latest_file_completed: int = 0,
+                         out_message: list = [],
+                         out_file_paths: list = [],
+                         log_files_output_paths: list = [],
+                         in_excel_sheets: list = [],
+                         first_loop_state: bool = False,
+                         output_folder: str = output_folder,
+                         in_deny_list:list[str]=[],
+                         max_fuzzy_spelling_mistakes_num:int=0,
+                         pii_identification_method:str="Local",
+                         chosen_redact_comprehend_entities:List[str]=[],
+                         comprehend_query_number:int=0,
+                         aws_access_key_textbox:str='',
+                         aws_secret_key_textbox:str='',
+                         progress: Progress = Progress(track_tqdm=True)):
+    """
+    This function anonymises data files based on the provided parameters.
+
+    Parameters:
+    - file_paths (List[str]): A list of file paths to anonymise.
+    - in_text (str): The text to anonymise if file_paths is 'open_text'.
+    - anon_strat (str): The anonymisation strategy to use.
+    - chosen_cols (List[str]): A list of column names to anonymise.
+    - language (str): The language of the text to anonymise.
+    - chosen_redact_entities (List[str]): A list of entities to redact.
+    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
+    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
+    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
+    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
+    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
+    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
+    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
+    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
+    - in_deny_list (list[str], optional): A list of specific terms to redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
+    """
+
+    tic = time.perf_counter()
+    comprehend_client = ""
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
+    # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    #print("log_files_output_paths:",log_files_output_paths)
+
+    if isinstance(log_files_output_paths, str):
+        log_files_output_paths = []
+
+    if not out_file_paths:
+        out_file_paths = []
+
     if in_allow_list:
         in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
     else:
         in_allow_list_flat = []
-
-    analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
-                                    entities=chosen_redact_entities,
-                                    score_threshold=score_threshold,
-                                    return_decision_process=True,
-                                    allow_list=in_allow_list_flat)
-
-    analyzer_results = list(analyzer_results)
-
-    fake = Faker("en_UK")
-
-    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
-    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
-    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
-    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
-    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
-    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
-    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
-
-    if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
-        chosen_mask_config = people_encrypt_config
-        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
-        key = secrets.token_bytes(16) # 128 bits = 16 bytes
-        key_string = base64.b64encode(key).decode('utf-8')
-    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
-
-    combined_config
-
-    return
+
+    anon_df = pd.DataFrame()
+
+    # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
+    if pii_identification_method == "AWS Comprehend":
+        print("Trying to connect to AWS Comprehend service")
+        if aws_access_key_textbox and aws_secret_key_textbox:
+            print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
+            print("aws_access_key_textbox:", aws_access_key_textbox)
+            print("aws_secret_access_key:", aws_secret_key_textbox)
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=aws_access_key_textbox,
+                aws_secret_access_key=aws_secret_key_textbox)
+        elif RUN_AWS_FUNCTIONS == "1":
+            print("Connecting to Comprehend via existing SSO connection")
+            comprehend_client = boto3.client('comprehend')
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            print("Getting Comprehend credentials from environment variables")
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY)
+        else:
+            comprehend_client = ""
+            out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
+            print(out_message)
+
+    # Check if files and text exist
+    if not file_paths:
+        if in_text:
+            file_paths=['open_text']
+        else:
+            out_message = "Please enter text or a file to redact."
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached") #, returning files:", str(latest_file_completed))
+        # Set to a very high number so as not to mess with subsequent file processing by the user
+        latest_file_completed = 99
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
+
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
+
+        if anon_file=='open_text':
+            anon_df = pd.DataFrame(data={'text':[in_text]})
+            chosen_cols=['text']
+            sheet_name = ""
+            file_type = ""
+            out_file_part = anon_file
+
+            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+        else:
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
+
+            out_file_part = get_file_name_without_type(anon_file.name)
+
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
+
+                anon_xlsx = pd.ExcelFile(anon_file)
+
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
+
+                from openpyxl import Workbook
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
+
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_name_without_type(anon_file.name)
+
+                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
+
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
+
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
+
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
+
+    out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
@@ -313,11 +414,16 @@ def anon_wrapper_func(
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
-    in_deny_list: List[str]=[],
+    in_deny_list: List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=0,
+    pii_identification_method:str="Local",
+    chosen_redact_comprehend_entities:List[str]=[],
+    comprehend_query_number:int=0,
+    comprehend_client:botocore.client.BaseClient="",
     output_folder: str = output_folder
 ):
     """
-    This function wraps the
+    This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
     Input Variables:
     - anon_file: The path to the file containing the data to be anonymized.
@@ -335,6 +441,11 @@
     - anon_xlsx_export_file_name: The name of the anonymized Excel file.
    - log_files_output_paths: A list of paths where the log files will be saved.
     - in_deny_list: List of specific terms to remove from the data.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
+    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
     - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
     """
     def check_lists(list1, list2):
@@ -357,6 +468,9 @@
             common_strings.append(string)
         return common_strings
 
+    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        raise("Connection to AWS Comprehend service not found, please check connection details.")
+
    # Check for chosen col, skip file if not found
     all_cols_original_order = list(anon_df.columns)
 
@@ -369,13 +483,13 @@
     chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
     # Split dataframe to keep only selected columns
-    print("Remaining columns to redact:", chosen_cols_in_anon_df)
+    #print("Remaining columns to redact:", chosen_cols_in_anon_df)
 
     anon_df_part = anon_df[chosen_cols_in_anon_df]
     anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
 
     # Anonymise the selected columns
-    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list)
+    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
 
     # Rejoin the dataframe together
     anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -384,8 +498,9 @@
     # Export file
 
     # Rename anonymisation strategy for file path naming
-    if anon_strat == "replace with
+    if anon_strat == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
     elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+    elif anon_strat == "redact completely": anon_strat_txt = "redact_remove"
     else: anon_strat_txt = anon_strat
 
     # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
@@ -422,151 +537,196 @@
 
     return out_file_paths, out_message, key_string, log_files_output_paths
 
-def
-    - in_text (str): The text to anonymise if file_paths is 'open_text'.
-    - anon_strat (str): The anonymisation strategy to use.
-    - chosen_cols (List[str]): A list of column names to anonymise.
-    - language (str): The language of the text to anonymise.
-    - chosen_redact_entities (List[str]): A list of entities to redact.
-    - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
-    - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
-    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
-    - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
-    - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
-    - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
-    - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
-    - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
-    - in_deny_list (list[str], optional): A list of specific terms to redact.
-    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
-    """
-
-    tic = time.perf_counter()
-
-    # If this is the first time around, set variables to 0/blank
-    if first_loop_state==True:
-        latest_file_completed = 0
-        out_message = []
-        out_file_paths = []
-
-    # Load file
-    # If out message or out_file_paths are blank, change to a list so it can be appended to
-    if isinstance(out_message, str):
-        out_message = [out_message]
-
-    #print("log_files_output_paths:",log_files_output_paths)
-
-    anon_df = pd.DataFrame()
-    #out_file_paths = []
-
-    # Check if files and text exist
-    if not file_paths:
-        if in_text:
-            file_paths=['open_text']
-        else:
-            out_message = "Please enter text or a file to redact."
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed >= len(file_paths):
-        print("Last file reached, returning files:", str(latest_file_completed))
-        # Set to a very high number so as not to mess with subsequent file processing by the user
-        latest_file_completed = 99
-        final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
-
-    file_path_loop = [file_paths[int(latest_file_completed)]]
-
-    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
-
-        if anon_file=='open_text':
-            anon_df = pd.DataFrame(data={'text':[in_text]})
-            chosen_cols=['text']
-            sheet_name = ""
-            file_type = ""
-            out_file_part = anon_file
-        else:
-            #
-            print("File type is:", file_type)
-
-            if file_type == 'xlsx':
-                print("Running through all xlsx sheets")
-                #anon_xlsx = pd.ExcelFile(anon_file)
-                if not in_excel_sheets:
-                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
-                    continue
-
-                wb.save(anon_xlsx_export_file_name)
-
-                # Read each sheet into a DataFrame
-                if sheet_name not in anon_xlsx.sheet_names:
-                    continue
-
-                print(anon_df.head()) # Print the first few rows
-
-            else:
-                sheet_name = ""
-                anon_df = read_file(anon_file)
-                out_file_part = get_file_name_without_type(anon_file.name)
-
-    print(out_time)
-
-    if anon_strat == "encrypt":
-        out_message.append(". Your decryption key is " + key_string + ".")
-
-    out_message_out = out_message_out + " " + out_time
-
-    return
+def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
+    '''
+    Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
+    '''
+
+    print("Identifying personal information")
+    analyse_tic = time.perf_counter()
+
+    # Initialize analyzer_results as an empty dictionary to store results by column
+    results_by_column = {}
+    key_string = ""
+
+    # DataFrame to dict
+    df_dict = df.to_dict(orient="list")
 
     if in_allow_list:
         in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
     else:
         in_allow_list_flat = []
+
+    if isinstance(in_deny_list, pd.DataFrame):
+        if not in_deny_list.empty:
+            in_deny_list = in_deny_list.iloc[:, 0].tolist()
+        else:
+            # Handle the case where the DataFrame is empty
+            in_deny_list = []  # or some default value
+
+    # Sort the strings in order from the longest string to the shortest
+    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
+
+    if in_deny_list:
+        nlp_analyser.registry.remove_recognizer("CUSTOM")
+        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+
+        nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=in_deny_list, search_whole_phrase=max_fuzzy_spelling_mistakes_num)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
+    #analyzer = nlp_analyser #AnalyzerEngine()
+    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
+
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
+
+    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+
+    analyzer_results = []
+
+    if pii_identification_method == "Local":
+
+        # Use custom analyzer to be able to track progress with Gradio
+        custom_results = analyze_dict(batch_analyzer,
+                                      df_dict,
+                                      language=language,
+                                      entities=chosen_redact_entities,
+                                      score_threshold=score_threshold,
+                                      return_decision_process=True,
+                                      allow_list=in_allow_list_flat)
+
+        # Initialize results_by_column with custom entity results
+        for result in custom_results:
+            results_by_column[result.key] = result
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    # AWS Comprehend calls
+    elif pii_identification_method == "AWS Comprehend" and comprehend_client:
+
+        # Only run Local anonymisation for entities that are not covered by AWS Comprehend
+        if custom_entities:
+            custom_redact_entities = [
+                entity for entity in chosen_redact_comprehend_entities
+                if entity in custom_entities
+            ]
+            if custom_redact_entities:
+                # Get results from analyze_dict
+                custom_results = analyze_dict(batch_analyzer,
+                                              df_dict,
+                                              language=language,
+                                              entities=custom_redact_entities,
+                                              score_threshold=score_threshold,
+                                              return_decision_process=True,
+                                              allow_list=in_allow_list_flat)
+
+                # Initialize results_by_column with custom entity results
+                for result in custom_results:
+                    results_by_column[result.key] = result
+
+        max_retries = 3
+        retry_delay = 3
+
+        # Process each text column in the dictionary
+        for column_name, texts in progress.tqdm(df_dict.items(), desc="Querying AWS Comprehend service.", unit = "Columns"):
+            # Get or create DictAnalyzerResult for this column
+            if column_name in results_by_column:
+                column_results = results_by_column[column_name]
+            else:
+                column_results = DictAnalyzerResult(
+                    recognizer_results=[[] for _ in texts],
+                    key=column_name,
+                    value=texts
+                )
+
+            # Process each text in the column
+            for text_idx, text in progress.tqdm(enumerate(texts), desc="Querying AWS Comprehend service.", unit = "Row"):
+
+                for attempt in range(max_retries):
+                    try:
+                        response = comprehend_client.detect_pii_entities(
+                            Text=str(text),
+                            LanguageCode=language
+                        )
+
+                        comprehend_query_number += 1
+
+                        # Add all entities from this text to the column's recognizer_results
+                        for entity in response["Entities"]:
+                            if entity.get("Type") not in chosen_redact_comprehend_entities:
+                                continue
+
+                            recognizer_result = RecognizerResult(
+                                entity_type=entity["Type"],
+                                start=entity["BeginOffset"],
+                                end=entity["EndOffset"],
+                                score=entity["Score"]
+                            )
+                            column_results.recognizer_results[text_idx].append(recognizer_result)
+
+                        break  # Success, exit retry loop
+
+                    except Exception as e:
+                        if attempt == max_retries - 1:
+                            print(f"AWS Comprehend calls failed for text: {text[:100]}... due to", e)
+                            raise
+                        time.sleep(retry_delay)
+
+            # Store or update the column results
+            results_by_column[column_name] = column_results
+
+        # Convert the dictionary of results back to a list
+        analyzer_results = list(results_by_column.values())
+
+    elif (pii_identification_method == "AWS Comprehend") & (not comprehend_client):
+        raise("Unable to redact, Comprehend connection details not found.")
+
+    else:
+        print("Unable to redact.")
+
+    # Usage in the main function:
+    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
+
+    analyse_toc = time.perf_counter()
+    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+    print(analyse_time_out)
+
+    # Create faker function (note that it has to receive a value)
+    #fake = Faker("en_UK")
+
+    #def fake_first_name(x):
+    #    return fake.first_name()
+
+    # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
+    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
+    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
+    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
+    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
+    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
+
+    if anon_strat == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
+    if anon_strat == "redact completely": chosen_mask_config = redact_config
+    if anon_strat == "hash": chosen_mask_config = hash_config
+    if anon_strat == "mask": chosen_mask_config = mask_config
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
+    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
+
+    # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
+    #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
+
+    combined_config = {**chosen_mask_config} #, **keep_date_config}
+
+    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
+
+    scrubbed_df = pd.DataFrame(anonymizer_results)
+
+    return scrubbed_df, key_string, decision_process_output_str
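Taken together, the new code path builds a Comprehend client with a three-way credential fallback and then calls detect_pii_entities per cell, mapping each returned entity onto a Presidio RecognizerResult. A condensed, self-contained sketch of that flow (the helper name and sample text are illustrative, not from the repo; detect_pii_entities is the real boto3 Comprehend API used in the hunks above):

import boto3

def make_comprehend_client(access_key: str = "", secret_key: str = ""):
    # Illustrative helper: explicit keys win (e.g. typed into the app's
    # textboxes); otherwise fall back to the ambient boto3 credential chain
    # (SSO session, profile, instance role, or environment variables).
    if access_key and secret_key:
        return boto3.client("comprehend",
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key)
    return boto3.client("comprehend")

comprehend = make_comprehend_client()
resp = comprehend.detect_pii_entities(Text="Call me on 07700 900123", LanguageCode="en")
for ent in resp["Entities"]:
    # BeginOffset/EndOffset map directly onto RecognizerResult(start, end, ...)
    print(ent["Type"], ent["BeginOffset"], ent["EndOffset"], round(ent["Score"], 3))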
tools/file_redaction.py
CHANGED
@@ -111,7 +111,7 @@ def choose_and_run_redactor(file_paths:List[str],
 - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
 - language (str): The language of the text in the files.
 - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
-- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
+- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
 - in_redact_method (str): The method to use for redaction.
 - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
 - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.