seanpedrickcase committed
Commit: 01c88c0
Parent(s): e08f9b8

Added logging, anonymising all Excel sheets, simple redaction tags, some Dockerfile optimisation
Files changed:
- .dockerignore +1 -0
- .gitignore +1 -0
- Dockerfile +26 -19
- app.py +40 -20
- requirements.txt +6 -6
- tools/data_anonymise.py +161 -67
- tools/file_conversion.py +23 -13
- tools/file_redaction.py +37 -11
- tools/helper_functions.py +36 -7
.dockerignore
CHANGED
@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
.gitignore
CHANGED
@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
Dockerfile
CHANGED
@@ -1,12 +1,8 @@
-
+# Stage 1: Build dependencies and download models
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
 
 # Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
-    && apt-get install -y \
-    tesseract-ocr -y \
-    poppler-utils -y \
-    libgl1-mesa-glx -y \
-    libglib2.0-0 -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
@@ -14,19 +10,34 @@ WORKDIR /src
 
 COPY requirements.txt .
 
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --target=/install -r requirements.txt
+
+RUN rm requirements.txt
 
-
+# Stage 2: Final runtime image
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+
+# Install system dependencies. Need to specify -y for poppler to get it to install
+RUN apt-get update \
+    && apt-get install -y \
+    tesseract-ocr \
+    poppler-utils \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 
-# Change ownership of /home/user directory
-#RUN chown -R user:user /home/user
-
 # Make output folder
-RUN mkdir -p /home/user/app/output
-
+RUN mkdir -p /home/user/app/output \
+    && mkdir -p /home/user/app/tld \
+    && mkdir -p /home/user/app/logs \
+    && chown -R user:user /home/user/app
+
+# Copy installed packages from builder stage
+COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
 # Switch to the "user" user
 USER user
@@ -34,18 +45,15 @@ USER user
 # Set environmental variables
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH \
-    PYTHONPATH
+    PYTHONPATH=/home/user/app \
     PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
     TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
-    #GRADIO_TEMP_DIR=$HOME/tmp \
-    #GRADIO_ROOT_PATH=/address-match \
-    # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
-    KEEP_ALIVE=60 \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory
@@ -53,6 +61,5 @@ WORKDIR $HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
-#COPY . $HOME/app
 
 CMD ["python", "app.py"]
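The Dockerfile optimisation is a two-stage build: the builder stage installs dependencies with pip's --target flag into /install, and the runtime stage copies that directory over site-packages, so pip caches and build tooling never reach the final image. As a rough sanity check (a sketch, not part of the commit; the package names are just examples taken from requirements.txt), you could confirm inside the final image that the copied packages resolve:

    # check_imports.py - hypothetical helper, not part of this commit
    import importlib.util

    # Each package should resolve to a path under site-packages in the runtime image
    for pkg in ("gradio", "pandas", "presidio_analyzer", "openpyxl"):
        spec = importlib.util.find_spec(pkg)
        print(pkg, "->", spec.origin if spec else "NOT FOUND")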
app.py
CHANGED
@@ -6,7 +6,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
-from tools.data_anonymise import do_anonymise
+from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
@@ -28,6 +28,7 @@ with app:
     prepared_pdf_state = gr.State([])
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
+    text_output_file_list_state = gr.State([])
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
@@ -51,7 +52,8 @@
 
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
-            output_file = gr.File(label="Output
+            output_file = gr.File(label="Output files")
+            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
@@ -64,16 +66,19 @@
         )
         with gr.Accordion("Paste open text", open = False):
             in_text = gr.Textbox(label="Enter open text", lines=10)
-        with gr.Accordion("Upload xlsx
-
+        with gr.Accordion("Upload xlsx or csv files", open = True):
+            in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+        in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
-        in_colnames = gr.Dropdown(choices=["Choose
+        in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
 
-
+        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
 
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
-            text_output_file = gr.File(label="Output
+            text_output_file = gr.File(label="Output files")
+            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
 
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
@@ -83,13 +88,16 @@
         with gr.Accordion("Settings for documents", open = True):
             in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
-            anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
+            anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
         with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
             in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
             with gr.Row():
                 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
                 in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+
+        # Invisible text box to hold the session hash/username just for logging purposes
+        session_hash_textbox = gr.Textbox(value="", visible=False)
 
     # AWS options - not yet implemented
     # with gr.Tab(label="Advanced options"):
@@ -104,26 +112,38 @@
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
 
+    callback = gr.CSVLogger()
+
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
                     outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
-                    outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")
-    #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
-    #outputs=[output_summary, output_file])
-
-    #
-
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
+
+    # If the output file count text box changes, keep going with redacting each document until done
+    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
+                    outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
+
+    # Tabular data redaction
+    in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
 
-
-
-    match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
+
+    # If the output file count text box changes, keep going with redacting each data file until done
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
 
-    app.load(
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
+    # This needs to be called at some point prior to the first call to callback.flag()
+    callback.setup([session_hash_textbox], "logs")
 
+    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
+    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
     # Launch the Gradio app
-    COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '
+    COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
     print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
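The wiring above introduces a self-chaining pattern: text_documents_done and text_tabular_files_done hold a count of completed files, each handler processes one file per call and increments the count, and the Number component's .change event re-invokes the handler until the count stops changing (the handler returns the count unchanged once every file is done, so no further change event fires). A minimal sketch of the idea, with hypothetical names rather than the app's actual handlers:

    import gradio as gr

    # Sketch of the counter-driven loop: one file per call; incrementing the
    # counter re-fires .change until the count equals the number of files.
    def process_next(files, done_count):
        files = files or []
        done_count = int(done_count)
        if done_count >= len(files):
            return "All files processed.", done_count  # unchanged count stops the loop
        return f"Processed {files[done_count]}", done_count + 1

    with gr.Blocks() as demo:
        file_list = gr.State(["a.pdf", "b.pdf"])  # stand-in for uploaded files
        done = gr.Number(value=0, interactive=False)
        summary = gr.Textbox()
        btn = gr.Button("Run")

        btn.click(process_next, [file_list, done], [summary, done])
        done.change(process_next, [file_list, done], [summary, done])

    if __name__ == "__main__":
        demo.launch()

The logging side uses Gradio's built-in gr.CSVLogger: per the commit's own comment, callback.setup() must run before the first callback.flag() call, which is why it is wired up alongside app.load, with flag() fired whenever the hidden session-hash textbox changes.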
requirements.txt
CHANGED
@@ -6,10 +6,10 @@ presidio_anonymizer==2.2.354
 presidio-image-redactor==0.0.52
 pikepdf==8.15.1
 pandas==2.2.2
-spacy
+spacy==3.7.5
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
-gradio
-boto3==1.34.
-
-openpyxl
-
+gradio>=4.26.0
+boto3==1.34.158
+pyarrow==14.0.2
+openpyxl==3.1.2
+Faker==22.2.0
tools/data_anonymise.py
CHANGED
@@ -11,9 +11,9 @@ from typing import List
 
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
+from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
-from tools.helper_functions import output_folder, get_file_path_end, read_file
+from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 
 # Use custom version of analyze_dict to be able to track progress
@@ -116,17 +116,20 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str],
+def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
+
+    key_string = ""
+
     # DataFrame to dict
     df_dict = df.to_dict(orient="list")
 
-    if
-
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
 
-    anonymizer = AnonymizerEngine()
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
 
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
 
@@ -134,19 +137,19 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     #     entities=chosen_redact_entities,
     #     score_threshold=score_threshold,
     #     return_decision_process=False,
-    #
+    #     in_allow_list=in_allow_list_flat)
 
     print("Identifying personal information")
     analyse_tic = time.perf_counter()
 
-    print("Allow list:",
+    print("Allow list:", in_allow_list)
 
     # Use custom analyzer to be able to track progress with Gradio
     analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                     entities=chosen_redact_entities,
                                     score_threshold=score_threshold,
                                     return_decision_process=False,
-                                    allow_list=
+                                    allow_list=in_allow_list_flat)
     analyzer_results = list(analyzer_results)
     #analyzer_results
 
@@ -154,9 +157,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
 
-
-    key = secrets.token_bytes(16) # 128 bits = 16 bytes
-    key_string = base64.b64encode(key).decode('utf-8')
+
 
     # Create faker function (note that it has to receive a value)
 
@@ -166,6 +167,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
         return fake.first_name()
 
     # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
     replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
     redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
     hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
@@ -173,12 +175,16 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
     fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
 
-
-    if anon_strat == "replace": chosen_mask_config = replace_config
+    if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
     if anon_strat == "redact": chosen_mask_config = redact_config
     if anon_strat == "hash": chosen_mask_config = hash_config
     if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
     elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
 
     # I think in general people will want to keep date / times
@@ -190,17 +196,10 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
-
-    # Create reporting message
-    out_message = "Successfully anonymised"
-
-    if anon_strat == "encrypt":
-        out_message = out_message + ". Your decryption key is " + key_string + "."
 
-    return scrubbed_df,
+    return scrubbed_df, key_string
 
-def
-
+def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
         return any(string in list2 for string in list1)
 
@@ -221,69 +220,164 @@ def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], la
             common_strings.append(string)
         return common_strings
 
+    # Check for chosen col, skip file if not found
+    all_cols_original_order = list(anon_df.columns)
+
+    any_cols_found = check_lists(chosen_cols, all_cols_original_order)
+
+    if any_cols_found == False:
+        out_message = "No chosen columns found in dataframe: " + out_file_part
+        print(out_message)
+    else:
+        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
+
+        # Split dataframe to keep only selected columns
+        print("Remaining columns to redact:", chosen_cols_in_anon_df)
+
+        anon_df_part = anon_df[chosen_cols_in_anon_df]
+        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
+
+        # Anonymise the selected columns
+        anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
+
+        # Rejoin the dataframe together
+        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
+        anon_df_out = anon_df_out[all_cols_original_order]
+
+        # Export file
+
+        # Rename anonymisation strategy for file path naming
+        if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
+        elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+        else: anon_strat_txt = anon_strat
+
+        # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
+        if file_type == 'xlsx':
+
+            anon_export_file_name = anon_xlsx_export_file_name
+
+            # Create a Pandas Excel writer using XlsxWriter as the engine.
+            with pd.ExcelWriter(anon_xlsx_export_file_name, engine='openpyxl', mode='a') as writer:
+                # Write each DataFrame to a different worksheet.
+                anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
+
+        else:
+            anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
+            anon_df_out.to_csv(anon_export_file_name, index = None)
+
+        out_file_paths.append(anon_export_file_name)
+
+        # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
+        out_file_paths = list(set(out_file_paths))
+
+        # Print result text to output text box if just anonymising open text
+        if anon_file=='open_text':
+            out_message = [anon_df_out['text'][0]]
+
+    return out_file_paths, out_message, key_string
+
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
+
+    tic = time.perf_counter()
+
     # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     anon_df = pd.DataFrame()
-
+    #out_file_paths = []
 
     # Check if files and text exist
-    if not
+    if not file_paths:
         if in_text:
-
+            file_paths=['open_text']
         else:
             out_message = "Please enter text or a file to redact."
-            return out_message,
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
 
-
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
 
-    if
+        if anon_file=='open_text':
             anon_df = pd.DataFrame(data={'text':[in_text]})
             chosen_cols=['text']
-            out_file_part =
+            out_file_part = anon_file
         else:
-
-
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
 
-
+            out_file_part = get_file_path_end(anon_file.name)
+
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
 
-
-    all_cols_original_order = list(anon_df.columns)
+                anon_xlsx = pd.ExcelFile(anon_file)
 
-
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
 
-
-
-
-
-    else:
-        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
+                from openpyxl import Workbook
+
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
 
-        # Split dataframe to keep only selected columns
-        print("Remaining columns to redact:", chosen_cols_in_anon_df)
-
-        anon_df_part = anon_df[chosen_cols_in_anon_df]
-        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
-
-        # Anonymise the selected columns
-        anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
-
-        # Rejoin the dataframe together
-        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
-        anon_df_out = anon_df_out[all_cols_original_order]
-
-        # Export file
-
-        # out_file_part = re.sub(r'\.csv', '', match_file.name)
 
-
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    # Process the DataFrame (e.g., print its contents)
+                    print(f"Sheet Name: {sheet_name}")
+                    print(anon_df.head())  # Print the first few rows
+
+                    out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_path_end(anon_file.name)
+                out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
 
-
-
-        out_message = anon_df_out['text'][0]
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
 
-    return
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
+
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
 
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
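The new "replace with <REDACTED>" strategy is just Presidio's replace operator with a fixed new_value, applied per column through the batch engines. A stripped-down sketch of that flow, substituting Presidio's default AnalyzerEngine for the project's custom nlp_analyser:

    import pandas as pd
    from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
    from presidio_anonymizer import BatchAnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig

    df = pd.DataFrame({"text": ["My name is David and I live in London."]})

    # Analyse every column of the DataFrame as a dict of lists
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
    analyzer_results = list(batch_analyzer.analyze_dict(df.to_dict(orient="list"), language="en"))

    # "replace with <REDACTED>": every detected entity becomes the literal string REDACTED
    operators = {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}
    anonymizer_results = BatchAnonymizerEngine().anonymize_dict(analyzer_results, operators=operators)
    print(pd.DataFrame(anonymizer_results))

The "replace with <ENTITY_NAME>" option keeps Presidio's default replace behaviour, which substitutes the entity label (e.g. <PERSON>) instead of a fixed tag.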
tools/file_conversion.py
CHANGED
@@ -89,15 +89,31 @@ def process_file(file_path):
 
     return img_object
 
-def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
+def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
 
-
-
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    #if isinstance(out_message, str):
+    #    out_message = [out_message]
+
+    if not file_paths:
+        file_paths = []
+
+    out_file_paths = file_paths
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(out_file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        #final_out_message = '\n'.join(out_message)
+        return out_message, out_file_paths
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
+    file_paths_loop = [out_file_paths[int(latest_file_completed)]]
+
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
-    for file in
+    for file in file_paths_loop:
         file_path = file.name
 
         #if file_path:
@@ -112,7 +128,7 @@ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_all
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                 print(out_message)
-                return out_message,
+                return out_message, out_file_paths
 
             out_file_path = process_file(file_path)
             print("Out file path at image conversion step:", out_file_path)
@@ -121,7 +137,7 @@ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_all
             if is_pdf(file_path) == False:
                 out_message = "Please upload a PDF file for text analysis."
                 print(out_message)
-                return out_message,
+                return out_message, out_file_paths
 
             out_file_path = file_path
 
@@ -151,10 +167,4 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
     print("Out file paths:", out_file_paths)
 
-    return out_message, out_file_paths
-
-
-
-
-
+    return out_message, out_file_paths
tools/file_redaction.py
CHANGED
@@ -17,20 +17,36 @@ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_
 import gradio as gr
 
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
-
-
+    # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    print("Latest file completed is:", str(latest_file_completed))
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
 
     if in_allow_list:
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     print("File paths:", file_paths)
 
-    for file in progress.tqdm(
+    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name
 
         if file_path:
@@ -42,7 +58,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
 
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image
@@ -57,6 +73,11 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             out_file_paths.append(out_image_file_path)
             out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
 
+            # Increase latest file completed count unless we are at the last file
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
+
         elif in_redact_method == "Text analysis":
             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
@@ -81,21 +102,26 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             out_file_paths.extend(img_output_file_path)
 
             # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
+            # out_message.append(img_output_summary)
+
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
 
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
 
     toc = time.perf_counter()
-    out_time = f"
+    out_time = f"in {toc - tic:0.1f} seconds."
     print(out_time)
 
     out_message_out = '\n'.join(out_message)
-    out_message_out = out_message_out + "
+    out_message_out = out_message_out + " " + out_time
 
-    return out_message_out, out_file_paths, out_file_paths
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
tools/helper_functions.py
CHANGED
@@ -76,17 +76,46 @@ def ensure_output_folder_exists():
 def put_columns_in_df(in_file):
     new_choices = []
     concat_choices = []
+    all_sheet_names = []
+    number_of_excel_files = 0
 
     for file in in_file:
-
-
+        file_name = file.name
+        file_type = detect_file_type(file_name)
+        print("File type is:", file_type)
 
-
+        if file_type == 'xlsx':
+            number_of_excel_files += 1
+            new_choices = []
+            print("Running through all xlsx sheets")
+            anon_xlsx = pd.ExcelFile(file_name)
+            new_sheet_names = anon_xlsx.sheet_names
+            # Iterate through the sheet names
+            for sheet_name in new_sheet_names:
+                # Read each sheet into a DataFrame
+                df = pd.read_excel(file_name, sheet_name=sheet_name)
+
+                # Process the DataFrame (e.g., print its contents)
+                print(f"Sheet Name: {sheet_name}")
+                print(df.head())  # Print the first few rows
+
+                new_choices.extend(list(df.columns))
+
+            all_sheet_names.extend(new_sheet_names)
+
+        else:
+            df = read_file(file_name)
+            new_choices = list(df.columns)
 
+        concat_choices.extend(new_choices)
+
     # Drop duplicate columns
     concat_choices = list(set(concat_choices))
-
-
+
+    if number_of_excel_files > 0:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)
+    else:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
 
 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):
@@ -104,7 +133,7 @@ def add_folder_to_path(folder_path: str):
         if absolute_path not in current_path.split(os.pathsep):
             full_path_extension = absolute_path + os.pathsep + current_path
             os.environ['PATH'] = full_path_extension
-            print(f"Updated PATH with: ", full_path_extension)
+            #print(f"Updated PATH with: ", full_path_extension)
         else:
             print(f"Directory {folder_path} already exists in PATH.")
     else:
@@ -167,7 +196,7 @@ async def get_connection_params(request: gr.Request):
         #if bucket_name:
         #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
-        return out_session_hash, output_folder
+        return out_session_hash, output_folder, out_session_hash
     else:
         print("No session parameters found.")
         return "",""
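put_columns_in_df now walks every sheet of each uploaded workbook, so the UI can offer sheet names (in_excel_sheets) as well as column names (in_colnames). The core pandas idiom, reduced to a sketch (example.xlsx is a hypothetical file name):

    import pandas as pd

    xlsx = pd.ExcelFile("example.xlsx")  # hypothetical workbook
    print(xlsx.sheet_names)              # sheet names offered in the in_excel_sheets dropdown
    for sheet_name in xlsx.sheet_names:
        # pd.read_excel accepts an ExcelFile object, avoiding re-parsing the workbook
        df = pd.read_excel(xlsx, sheet_name=sheet_name)
        print(sheet_name, list(df.columns))  # columns offered in the in_colnames dropdown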