seanpedrickcase
commited on
Commit
•
8c33828
1
Parent(s):
01c88c0
Decision process now saved as log files. Other log files and feedback added
Browse files- app.py +55 -21
- tools/aws_functions.py +46 -5
- tools/data_anonymise.py +100 -21
- tools/file_conversion.py +50 -6
- tools/file_redaction.py +56 -30
- tools/helper_functions.py +17 -0
app.py
CHANGED
@@ -3,7 +3,8 @@ import os
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
6 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
|
|
|
7 |
from tools.file_redaction import choose_and_run_redactor
|
8 |
from tools.file_conversion import prepare_image_or_text_pdf
|
9 |
from tools.data_anonymise import anonymise_data_files
|
@@ -29,9 +30,15 @@ with app:
|
|
29 |
output_image_files_state = gr.State([])
|
30 |
output_file_list_state = gr.State([])
|
31 |
text_output_file_list_state = gr.State([])
|
|
|
|
|
32 |
|
33 |
session_hash_state = gr.State()
|
34 |
s3_output_folder_state = gr.State()
|
|
|
|
|
|
|
|
|
35 |
|
36 |
gr.Markdown(
|
37 |
"""
|
@@ -39,9 +46,9 @@ with app:
|
|
39 |
|
40 |
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
|
41 |
|
42 |
-
WARNING:
|
43 |
|
44 |
-
|
45 |
""")
|
46 |
|
47 |
with gr.Tab("PDFs/images"):
|
@@ -57,6 +64,15 @@ with app:
|
|
57 |
|
58 |
with gr.Row():
|
59 |
convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
with gr.Tab(label="Open text or Excel/csv files"):
|
62 |
gr.Markdown(
|
@@ -73,13 +89,20 @@ with app:
|
|
73 |
|
74 |
in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
|
75 |
|
76 |
-
tabular_data_redact_btn = gr.Button("
|
77 |
|
78 |
with gr.Row():
|
79 |
text_output_summary = gr.Textbox(label="Output result")
|
80 |
text_output_file = gr.File(label="Output files")
|
81 |
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
with gr.Tab(label="Redaction settings"):
|
84 |
gr.Markdown(
|
85 |
"""
|
@@ -111,44 +134,55 @@ with app:
|
|
111 |
|
112 |
# ### Loading AWS data ###
|
113 |
# load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
114 |
-
|
115 |
-
callback = gr.CSVLogger()
|
116 |
|
117 |
# Document redaction
|
118 |
-
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
|
119 |
-
|
120 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
|
121 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
|
122 |
|
123 |
# If the output file count text box changes, keep going with redacting each document until done
|
124 |
-
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
|
129 |
# Tabular data redaction
|
130 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
|
131 |
|
132 |
-
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
|
133 |
|
134 |
# If the output file count text box changes, keep going with redacting each data file until done
|
135 |
-
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
|
|
|
136 |
|
|
|
|
|
|
|
137 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
138 |
|
139 |
-
#
|
|
|
140 |
callback.setup([session_hash_textbox], "logs")
|
141 |
-
|
142 |
-
#app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
|
143 |
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
# Launch the Gradio app
|
146 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
147 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
148 |
|
149 |
if __name__ == "__main__":
|
150 |
-
|
151 |
if os.environ['COGNITO_AUTH'] == "1":
|
152 |
-
app.queue().launch(show_error=True, auth=authenticate_user)
|
153 |
else:
|
154 |
-
app.queue().launch(show_error=True, inbrowser=True)
|
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
6 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs
|
7 |
+
from tools.aws_functions import upload_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_text_pdf
|
10 |
from tools.data_anonymise import anonymise_data_files
|
|
|
30 |
output_image_files_state = gr.State([])
|
31 |
output_file_list_state = gr.State([])
|
32 |
text_output_file_list_state = gr.State([])
|
33 |
+
first_loop_state = gr.State(True)
|
34 |
+
second_loop_state = gr.State(False)
|
35 |
|
36 |
session_hash_state = gr.State()
|
37 |
s3_output_folder_state = gr.State()
|
38 |
+
feedback_logs_state = gr.State('feedback/log.csv')
|
39 |
+
feedback_s3_logs_loc_state = gr.State('feedback/')
|
40 |
+
usage_logs_state = gr.State('logs/log.csv')
|
41 |
+
usage_s3_logs_loc_state = gr.State('logs/')
|
42 |
|
43 |
gr.Markdown(
|
44 |
"""
|
|
|
46 |
|
47 |
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
|
48 |
|
49 |
+
WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
50 |
|
51 |
+
This app accepts a maximum file size of 10mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
52 |
""")
|
53 |
|
54 |
with gr.Tab("PDFs/images"):
|
|
|
64 |
|
65 |
with gr.Row():
|
66 |
convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
|
67 |
+
|
68 |
+
with gr.Row():
|
69 |
+
pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
|
70 |
+
with gr.Row():
|
71 |
+
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
72 |
+
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
73 |
+
|
74 |
+
with gr.Row():
|
75 |
+
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
76 |
|
77 |
with gr.Tab(label="Open text or Excel/csv files"):
|
78 |
gr.Markdown(
|
|
|
89 |
|
90 |
in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
|
91 |
|
92 |
+
tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
|
93 |
|
94 |
with gr.Row():
|
95 |
text_output_summary = gr.Textbox(label="Output result")
|
96 |
text_output_file = gr.File(label="Output files")
|
97 |
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
|
98 |
|
99 |
+
with gr.Row():
|
100 |
+
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
|
101 |
+
choices=["The results were good", "The results were not good"], visible=False)
|
102 |
+
with gr.Row():
|
103 |
+
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
104 |
+
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
105 |
+
|
106 |
with gr.Tab(label="Redaction settings"):
|
107 |
gr.Markdown(
|
108 |
"""
|
|
|
134 |
|
135 |
# ### Loading AWS data ###
|
136 |
# load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
|
|
|
|
137 |
|
138 |
# Document redaction
|
139 |
+
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
140 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
|
|
|
141 |
outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
|
142 |
|
143 |
# If the output file count text box changes, keep going with redacting each document until done
|
144 |
+
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
145 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
|
146 |
+
outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
|
147 |
+
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
|
148 |
|
149 |
# Tabular data redaction
|
150 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
|
151 |
|
152 |
+
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
|
153 |
|
154 |
# If the output file count text box changes, keep going with redacting each data file until done
|
155 |
+
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
|
156 |
+
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
|
157 |
|
158 |
+
#app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
|
159 |
+
# then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
160 |
+
|
161 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
162 |
|
163 |
+
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
164 |
+
callback = gr.CSVLogger()
|
165 |
callback.setup([session_hash_textbox], "logs")
|
|
|
|
|
166 |
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
|
167 |
|
168 |
+
# User submitted feedback for pdf redactions
|
169 |
+
pdf_callback = gr.CSVLogger()
|
170 |
+
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], "feedback")
|
171 |
+
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
|
172 |
+
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
173 |
+
|
174 |
+
# User submitted feedback for data redactions
|
175 |
+
data_callback = gr.CSVLogger()
|
176 |
+
data_callback.setup([data_feedback_radio, data_further_details_text], "feedback")
|
177 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
|
178 |
+
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
179 |
+
|
180 |
# Launch the Gradio app
|
181 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
182 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
183 |
|
184 |
if __name__ == "__main__":
|
|
|
185 |
if os.environ['COGNITO_AUTH'] == "1":
|
186 |
+
app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='10mb')
|
187 |
else:
|
188 |
+
app.queue().launch(show_error=True, inbrowser=True, max_file_size='10mb')
|
tools/aws_functions.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import Type
|
2 |
import pandas as pd
|
3 |
import boto3
|
4 |
import tempfile
|
@@ -6,12 +6,11 @@ import os
|
|
6 |
from tools.helper_functions import get_or_create_env_var
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
9 |
-
bucket_name=""
|
10 |
|
11 |
# Get AWS credentials if required
|
12 |
-
|
13 |
aws_var = "RUN_AWS_FUNCTIONS"
|
14 |
-
aws_var_default = "
|
15 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
16 |
print(f'The value of {aws_var} is {aws_var_val}')
|
17 |
|
@@ -156,4 +155,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
|
|
156 |
out_message = "No password provided. Please ask the data team for access if you need this."
|
157 |
print(out_message)
|
158 |
|
159 |
-
return files, out_message
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Type, List
|
2 |
import pandas as pd
|
3 |
import boto3
|
4 |
import tempfile
|
|
|
6 |
from tools.helper_functions import get_or_create_env_var
|
7 |
|
8 |
PandasDataFrame = Type[pd.DataFrame]
|
|
|
9 |
|
10 |
# Get AWS credentials if required
|
11 |
+
bucket_name=""
|
12 |
aws_var = "RUN_AWS_FUNCTIONS"
|
13 |
+
aws_var_default = "1"
|
14 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
15 |
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
|
|
155 |
out_message = "No password provided. Please ask the data team for access if you need this."
|
156 |
print(out_message)
|
157 |
|
158 |
+
return files, out_message
|
159 |
+
|
160 |
+
def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
|
161 |
+
"""
|
162 |
+
Uploads a file from local machine to Amazon S3.
|
163 |
+
|
164 |
+
Args:
|
165 |
+
- local_file_path: Local file path(s) of the file(s) to upload.
|
166 |
+
- s3_key: Key (path) to the file in the S3 bucket.
|
167 |
+
- s3_bucket: Name of the S3 bucket.
|
168 |
+
|
169 |
+
Returns:
|
170 |
+
- Message as variable/printed to console
|
171 |
+
"""
|
172 |
+
final_out_message = []
|
173 |
+
|
174 |
+
s3_client = boto3.client('s3')
|
175 |
+
|
176 |
+
if isinstance(local_file_paths, str):
|
177 |
+
local_file_paths = [local_file_paths]
|
178 |
+
|
179 |
+
for file in local_file_paths:
|
180 |
+
try:
|
181 |
+
# Get file name off file path
|
182 |
+
file_name = os.path.basename(file)
|
183 |
+
|
184 |
+
s3_key_full = s3_key + file_name
|
185 |
+
print("S3 key: ", s3_key_full)
|
186 |
+
|
187 |
+
s3_client.upload_file(file, s3_bucket, s3_key_full)
|
188 |
+
out_message = "File " + file_name + " uploaded successfully to S3!"
|
189 |
+
print(out_message)
|
190 |
+
|
191 |
+
except Exception as e:
|
192 |
+
out_message = f"Error uploading file(s) to S3: {e}"
|
193 |
+
print(out_message)
|
194 |
+
|
195 |
+
final_out_message.append(out_message)
|
196 |
+
final_out_message_str = '\n'.join(final_out_message)
|
197 |
+
|
198 |
+
return final_out_message_str
|
199 |
+
|
200 |
+
|
tools/data_anonymise.py
CHANGED
@@ -5,11 +5,10 @@ import time
|
|
5 |
import pandas as pd
|
6 |
|
7 |
from faker import Faker
|
8 |
-
|
9 |
from gradio import Progress
|
10 |
-
from typing import List
|
11 |
|
12 |
-
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
|
13 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
14 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
15 |
|
@@ -24,6 +23,76 @@ fake = Faker("en_UK")
|
|
24 |
def fake_first_name(x):
|
25 |
return fake.first_name()
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def anon_consistent_names(df):
|
28 |
# ## Pick out common names and replace them with the same person value
|
29 |
df_dict = df.to_dict(orient="list")
|
@@ -118,6 +187,9 @@ def anon_consistent_names(df):
|
|
118 |
|
119 |
def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
|
120 |
|
|
|
|
|
|
|
121 |
key_string = ""
|
122 |
|
123 |
# DataFrame to dict
|
@@ -133,34 +205,26 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
133 |
|
134 |
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
|
135 |
|
136 |
-
#
|
137 |
-
#
|
138 |
-
# score_threshold=score_threshold,
|
139 |
-
# return_decision_process=False,
|
140 |
-
# in_allow_list=in_allow_list_flat)
|
141 |
-
|
142 |
-
print("Identifying personal information")
|
143 |
-
analyse_tic = time.perf_counter()
|
144 |
-
|
145 |
-
print("Allow list:", in_allow_list)
|
146 |
|
147 |
# Use custom analyzer to be able to track progress with Gradio
|
148 |
analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
|
149 |
entities=chosen_redact_entities,
|
150 |
score_threshold=score_threshold,
|
151 |
-
return_decision_process=
|
152 |
allow_list=in_allow_list_flat)
|
|
|
153 |
analyzer_results = list(analyzer_results)
|
154 |
-
|
|
|
|
|
155 |
|
156 |
analyse_toc = time.perf_counter()
|
157 |
analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
|
158 |
print(analyse_time_out)
|
159 |
|
160 |
-
|
161 |
-
|
162 |
# Create faker function (note that it has to receive a value)
|
163 |
-
|
164 |
fake = Faker("en_UK")
|
165 |
|
166 |
def fake_first_name(x):
|
@@ -197,7 +261,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
197 |
|
198 |
scrubbed_df = pd.DataFrame(anonymizer_results)
|
199 |
|
200 |
-
return scrubbed_df, key_string
|
201 |
|
202 |
def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
|
203 |
def check_lists(list1, list2):
|
@@ -238,7 +302,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
238 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
239 |
|
240 |
# Anonymise the selected columns
|
241 |
-
anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
|
242 |
|
243 |
# Rejoin the dataframe together
|
244 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
@@ -261,11 +325,20 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
261 |
# Write each DataFrame to a different worksheet.
|
262 |
anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
|
263 |
|
|
|
|
|
|
|
|
|
264 |
else:
|
265 |
anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
|
266 |
anon_df_out.to_csv(anon_export_file_name, index = None)
|
267 |
|
|
|
|
|
|
|
|
|
268 |
out_file_paths.append(anon_export_file_name)
|
|
|
269 |
|
270 |
# As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
|
271 |
out_file_paths = list(set(out_file_paths))
|
@@ -276,10 +349,16 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
276 |
|
277 |
return out_file_paths, out_message, key_string
|
278 |
|
279 |
-
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
|
280 |
|
281 |
tic = time.perf_counter()
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# Load file
|
284 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
285 |
if isinstance(out_message, str):
|
|
|
5 |
import pandas as pd
|
6 |
|
7 |
from faker import Faker
|
|
|
8 |
from gradio import Progress
|
9 |
+
from typing import List, Dict, Any
|
10 |
|
11 |
+
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
|
|
23 |
def fake_first_name(x):
|
24 |
return fake.first_name()
|
25 |
|
26 |
+
# Writing decision making process to file
|
27 |
+
def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
|
28 |
+
"""
|
29 |
+
Generate a detailed output of the decision process for entity recognition.
|
30 |
+
|
31 |
+
This function takes the results from the analyzer and the original data dictionary,
|
32 |
+
and produces a string output detailing the decision process for each recognized entity.
|
33 |
+
It includes information such as entity type, position, confidence score, and the context
|
34 |
+
in which the entity was found.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
|
38 |
+
df_dict (Dict[str, List[Any]]): The original data in dictionary format.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
str: A string containing the detailed decision process output.
|
42 |
+
"""
|
43 |
+
decision_process_output = []
|
44 |
+
keys_to_keep = ['entity_type', 'start', 'end']
|
45 |
+
|
46 |
+
def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
|
47 |
+
output = []
|
48 |
+
|
49 |
+
if hasattr(result, 'value'):
|
50 |
+
text = result.value[data_row]
|
51 |
+
else:
|
52 |
+
text = ""
|
53 |
+
|
54 |
+
if isinstance(recognizer_result, list):
|
55 |
+
for sub_result in recognizer_result:
|
56 |
+
if isinstance(text, str):
|
57 |
+
found_text = text[sub_result.start:sub_result.end]
|
58 |
+
else:
|
59 |
+
found_text = ''
|
60 |
+
analysis_explanation = {key: sub_result.__dict__[key] for key in keys_to_keep}
|
61 |
+
analysis_explanation.update({
|
62 |
+
'data_row': str(data_row),
|
63 |
+
'column': list(df_dict.keys())[dictionary_key],
|
64 |
+
'entity': found_text
|
65 |
+
})
|
66 |
+
output.append(str(analysis_explanation))
|
67 |
+
|
68 |
+
return output
|
69 |
+
|
70 |
+
#print("Analyser results:", analyzer_results)
|
71 |
+
|
72 |
+
# Run through each column to analyse for PII
|
73 |
+
for i, result in enumerate(analyzer_results):
|
74 |
+
print("Looking at result:", str(i))
|
75 |
+
|
76 |
+
# If a single result
|
77 |
+
if isinstance(result, RecognizerResult):
|
78 |
+
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
79 |
+
|
80 |
+
# If a list of results
|
81 |
+
elif isinstance(result, List):
|
82 |
+
for x, recognizer_result in enumerate(result.recognizer_results):
|
83 |
+
decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
|
84 |
+
|
85 |
+
else:
|
86 |
+
try:
|
87 |
+
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
88 |
+
except Exception as e:
|
89 |
+
print(e)
|
90 |
+
|
91 |
+
decision_process_output_str = '\n'.join(decision_process_output)
|
92 |
+
|
93 |
+
|
94 |
+
return decision_process_output_str
|
95 |
+
|
96 |
def anon_consistent_names(df):
|
97 |
# ## Pick out common names and replace them with the same person value
|
98 |
df_dict = df.to_dict(orient="list")
|
|
|
187 |
|
188 |
def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
|
189 |
|
190 |
+
print("Identifying personal information")
|
191 |
+
analyse_tic = time.perf_counter()
|
192 |
+
|
193 |
key_string = ""
|
194 |
|
195 |
# DataFrame to dict
|
|
|
205 |
|
206 |
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
|
207 |
|
208 |
+
#print("Allow list:", in_allow_list)
|
209 |
+
#print("Input data keys:", df_dict.keys())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
# Use custom analyzer to be able to track progress with Gradio
|
212 |
analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
|
213 |
entities=chosen_redact_entities,
|
214 |
score_threshold=score_threshold,
|
215 |
+
return_decision_process=True,
|
216 |
allow_list=in_allow_list_flat)
|
217 |
+
|
218 |
analyzer_results = list(analyzer_results)
|
219 |
+
|
220 |
+
# Usage in the main function:
|
221 |
+
decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
|
222 |
|
223 |
analyse_toc = time.perf_counter()
|
224 |
analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
|
225 |
print(analyse_time_out)
|
226 |
|
|
|
|
|
227 |
# Create faker function (note that it has to receive a value)
|
|
|
228 |
fake = Faker("en_UK")
|
229 |
|
230 |
def fake_first_name(x):
|
|
|
261 |
|
262 |
scrubbed_df = pd.DataFrame(anonymizer_results)
|
263 |
|
264 |
+
return scrubbed_df, key_string, decision_process_output_str
|
265 |
|
266 |
def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
|
267 |
def check_lists(list1, list2):
|
|
|
302 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
303 |
|
304 |
# Anonymise the selected columns
|
305 |
+
anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
|
306 |
|
307 |
# Rejoin the dataframe together
|
308 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
|
|
325 |
# Write each DataFrame to a different worksheet.
|
326 |
anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
|
327 |
|
328 |
+
decision_process_log_output_file = anon_xlsx_export_file_name + "decision_process_output.txt"
|
329 |
+
with open(decision_process_log_output_file, "w") as f:
|
330 |
+
f.write(decision_process_output_str)
|
331 |
+
|
332 |
else:
|
333 |
anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
|
334 |
anon_df_out.to_csv(anon_export_file_name, index = None)
|
335 |
|
336 |
+
decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
|
337 |
+
with open(decision_process_log_output_file, "w") as f:
|
338 |
+
f.write(decision_process_output_str)
|
339 |
+
|
340 |
out_file_paths.append(anon_export_file_name)
|
341 |
+
out_file_paths.append(decision_process_log_output_file)
|
342 |
|
343 |
# As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
|
344 |
out_file_paths = list(set(out_file_paths))
|
|
|
349 |
|
350 |
return out_file_paths, out_message, key_string
|
351 |
|
352 |
+
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
|
353 |
|
354 |
tic = time.perf_counter()
|
355 |
|
356 |
+
# If this is the first time around, set variables to 0/blank
|
357 |
+
if first_loop_state==True:
|
358 |
+
latest_file_completed = 0
|
359 |
+
out_message = []
|
360 |
+
out_file_paths = []
|
361 |
+
|
362 |
# Load file
|
363 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
364 |
if isinstance(out_message, str):
|
tools/file_conversion.py
CHANGED
@@ -3,7 +3,7 @@ from tools.helper_functions import get_file_path_end, output_folder
|
|
3 |
from PIL import Image
|
4 |
import os
|
5 |
from gradio import Progress
|
6 |
-
from typing import List
|
7 |
|
8 |
def is_pdf_or_image(filename):
|
9 |
"""
|
@@ -55,6 +55,7 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
|
|
55 |
|
56 |
# If no images are returned, break the loop
|
57 |
if not image:
|
|
|
58 |
break
|
59 |
|
60 |
images.extend(image)
|
@@ -74,6 +75,7 @@ def process_file(file_path):
|
|
74 |
print(f"{file_path} is an image file.")
|
75 |
# Perform image processing here
|
76 |
img_object = [Image.open(file_path)]
|
|
|
77 |
|
78 |
# Check if the file is a PDF
|
79 |
elif file_extension == '.pdf':
|
@@ -85,37 +87,79 @@ def process_file(file_path):
|
|
85 |
print(f"{file_path} is not an image or PDF file.")
|
86 |
img_object = ['']
|
87 |
|
88 |
-
|
89 |
|
90 |
return img_object
|
91 |
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
95 |
#if isinstance(out_message, str):
|
96 |
# out_message = [out_message]
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
if not file_paths:
|
99 |
file_paths = []
|
100 |
|
101 |
-
out_file_paths = file_paths
|
102 |
|
103 |
latest_file_completed = int(latest_file_completed)
|
104 |
|
105 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
106 |
-
if latest_file_completed == len(
|
107 |
print("Last file reached, returning files:", str(latest_file_completed))
|
108 |
#final_out_message = '\n'.join(out_message)
|
109 |
return out_message, out_file_paths
|
110 |
|
111 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
112 |
|
113 |
-
file_paths_loop = [
|
|
|
114 |
|
115 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
116 |
for file in file_paths_loop:
|
117 |
file_path = file.name
|
118 |
|
|
|
|
|
119 |
#if file_path:
|
120 |
# file_path_without_ext = get_file_path_end(file_path)
|
121 |
if not file_path:
|
|
|
3 |
from PIL import Image
|
4 |
import os
|
5 |
from gradio import Progress
|
6 |
+
from typing import List, Optional
|
7 |
|
8 |
def is_pdf_or_image(filename):
|
9 |
"""
|
|
|
55 |
|
56 |
# If no images are returned, break the loop
|
57 |
if not image:
|
58 |
+
print("Conversion of page", str(page_num), "to file failed.")
|
59 |
break
|
60 |
|
61 |
images.extend(image)
|
|
|
75 |
print(f"{file_path} is an image file.")
|
76 |
# Perform image processing here
|
77 |
img_object = [Image.open(file_path)]
|
78 |
+
# Load images from the file paths
|
79 |
|
80 |
# Check if the file is a PDF
|
81 |
elif file_extension == '.pdf':
|
|
|
87 |
print(f"{file_path} is not an image or PDF file.")
|
88 |
img_object = ['']
|
89 |
|
90 |
+
print('Image object is:', img_object)
|
91 |
|
92 |
return img_object
|
93 |
|
94 |
+
|
95 |
+
|
96 |
+
def prepare_image_or_text_pdf(
|
97 |
+
file_paths: List[str],
|
98 |
+
in_redact_method: str,
|
99 |
+
in_allow_list: Optional[List[List[str]]] = None,
|
100 |
+
latest_file_completed: int = 0,
|
101 |
+
out_message: List[str] = [],
|
102 |
+
first_loop_state: bool = False,
|
103 |
+
progress: Progress = Progress(track_tqdm=True)
|
104 |
+
) -> tuple[List[str], List[str]]:
|
105 |
+
"""
|
106 |
+
Prepare and process image or text PDF files for redaction.
|
107 |
+
|
108 |
+
This function takes a list of file paths, processes each file based on the specified redaction method,
|
109 |
+
and returns the output messages and processed file paths.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
file_paths (List[str]): List of file paths to process.
|
113 |
+
in_redact_method (str): The redaction method to use.
|
114 |
+
in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
|
115 |
+
latest_file_completed (int): Index of the last completed file.
|
116 |
+
out_message (List[str]): List to store output messages.
|
117 |
+
first_loop_state (bool): Flag indicating if this is the first iteration.
|
118 |
+
progress (Progress): Progress tracker for the operation.
|
119 |
+
|
120 |
+
Returns:
|
121 |
+
tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
|
122 |
+
"""
|
123 |
|
124 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
125 |
#if isinstance(out_message, str):
|
126 |
# out_message = [out_message]
|
127 |
|
128 |
+
|
129 |
+
|
130 |
+
# If this is the first time around, set variables to 0/blank
|
131 |
+
if first_loop_state==True:
|
132 |
+
latest_file_completed = 0
|
133 |
+
out_message = []
|
134 |
+
out_file_paths = []
|
135 |
+
else:
|
136 |
+
print("Now attempting file:", str(latest_file_completed + 1))
|
137 |
+
out_file_paths = []
|
138 |
+
|
139 |
if not file_paths:
|
140 |
file_paths = []
|
141 |
|
142 |
+
#out_file_paths = file_paths
|
143 |
|
144 |
latest_file_completed = int(latest_file_completed)
|
145 |
|
146 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
147 |
+
if latest_file_completed == len(file_paths):
|
148 |
print("Last file reached, returning files:", str(latest_file_completed))
|
149 |
#final_out_message = '\n'.join(out_message)
|
150 |
return out_message, out_file_paths
|
151 |
|
152 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
153 |
|
154 |
+
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
155 |
+
print("file_paths_loop:", str(file_paths_loop))
|
156 |
|
157 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
158 |
for file in file_paths_loop:
|
159 |
file_path = file.name
|
160 |
|
161 |
+
print("file_path:", file_path)
|
162 |
+
|
163 |
#if file_path:
|
164 |
# file_path_without_ext = get_file_path_end(file_path)
|
165 |
if not file_path:
|
tools/file_redaction.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from PIL import Image
|
2 |
from typing import List
|
3 |
import pandas as pd
|
4 |
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
|
@@ -14,13 +14,20 @@ from collections import defaultdict # For efficient grouping
|
|
14 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
15 |
from tools.helper_functions import get_file_path_end, output_folder
|
16 |
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
|
|
|
17 |
import gradio as gr
|
18 |
|
19 |
|
20 |
-
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
|
21 |
|
22 |
tic = time.perf_counter()
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
25 |
if isinstance(out_message, str):
|
26 |
out_message = [out_message]
|
@@ -44,14 +51,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
44 |
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
45 |
|
46 |
|
47 |
-
print("File paths:", file_paths)
|
48 |
|
49 |
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
50 |
file_path = file.name
|
51 |
|
52 |
if file_path:
|
53 |
file_path_without_ext = get_file_path_end(file_path)
|
54 |
-
|
|
|
55 |
# If user has not submitted a pdf, assume it's an image
|
56 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
57 |
in_redact_method = "Image analysis"
|
@@ -65,13 +73,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
65 |
# if is_pdf_or_image(file_path) == False:
|
66 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
67 |
|
68 |
-
print("Redacting file as image-based
|
69 |
-
pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
|
70 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
71 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
72 |
|
73 |
out_file_paths.append(out_image_file_path)
|
74 |
-
out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
# Increase latest file completed count unless we are at the last file
|
77 |
if latest_file_completed != len(file_paths):
|
@@ -84,12 +98,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
84 |
|
85 |
# Analyse text-based pdf
|
86 |
print('Redacting file as text-based PDF')
|
87 |
-
pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
88 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
89 |
pdf_text.save(out_text_file_path)
|
90 |
|
91 |
#out_file_paths.append(out_text_file_path)
|
92 |
-
out_message_new = "File " + file_path_without_ext + " successfully redacted
|
93 |
out_message.append(out_message_new)
|
94 |
|
95 |
# Convert message
|
@@ -101,6 +115,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
101 |
img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
|
102 |
out_file_paths.extend(img_output_file_path)
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
# Add confirmation for converting to image if you want
|
105 |
# out_message.append(img_output_summary)
|
106 |
|
@@ -138,7 +158,7 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
|
138 |
merged_box = group[0]
|
139 |
for next_box in group[1:]:
|
140 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
141 |
-
print("Merging a box")
|
142 |
# Calculate new dimensions for the merged box
|
143 |
new_left = min(merged_box.left, next_box.left)
|
144 |
new_top = min(merged_box.top, next_box.top)
|
@@ -154,16 +174,14 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
|
154 |
merged_bboxes.append(merged_box)
|
155 |
return merged_bboxes
|
156 |
|
157 |
-
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
158 |
'''
|
159 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
160 |
'''
|
161 |
-
from PIL import Image, ImageChops, ImageDraw
|
162 |
|
163 |
fill = (0, 0, 0)
|
164 |
|
165 |
if not image_paths:
|
166 |
-
|
167 |
out_message = "PDF does not exist as images. Converting pages to image"
|
168 |
print(out_message)
|
169 |
#progress(0, desc=out_message)
|
@@ -180,12 +198,12 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
180 |
#for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
181 |
for i in range(0, number_of_pages):
|
182 |
|
183 |
-
print("Redacting page
|
184 |
|
185 |
# Get the image to redact using PIL lib (pillow)
|
186 |
-
|
187 |
|
188 |
-
image = ImageChops.duplicate(
|
189 |
|
190 |
# %%
|
191 |
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
@@ -200,16 +218,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
200 |
"allow_list": allow_list,
|
201 |
"language": language,
|
202 |
"entities": chosen_redact_entities,
|
203 |
-
"score_threshold": score_threshold
|
|
|
204 |
})
|
205 |
|
|
|
|
|
|
|
|
|
|
|
206 |
#print("For page: ", str(i), "Bounding boxes: ", bboxes)
|
207 |
|
208 |
draw = ImageDraw.Draw(image)
|
209 |
|
210 |
merged_bboxes = merge_img_bboxes(bboxes)
|
211 |
|
212 |
-
print("For page:
|
213 |
|
214 |
# 3. Draw the merged boxes (unchanged)
|
215 |
for box in merged_bboxes:
|
@@ -221,7 +245,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
221 |
|
222 |
images.append(image)
|
223 |
|
224 |
-
return images
|
225 |
|
226 |
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
227 |
'''
|
@@ -242,7 +266,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
242 |
|
243 |
#for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
|
244 |
for page in pdf.pages:
|
245 |
-
print("Page number is:
|
246 |
|
247 |
annotations_on_page = []
|
248 |
analyzed_bounding_boxes = []
|
@@ -261,8 +285,11 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
261 |
language=language,
|
262 |
entities=chosen_redact_entities,
|
263 |
score_threshold=score_threshold,
|
264 |
-
return_decision_process=
|
265 |
allow_list=allow_list)
|
|
|
|
|
|
|
266 |
|
267 |
characters = [char # This is what we want to include in the list
|
268 |
for line in text_container # Loop through each line in text_container
|
@@ -292,7 +319,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
292 |
current_box = char_box
|
293 |
current_y = char_box[1]
|
294 |
else: # Now we have previous values to compare
|
295 |
-
print("Comparing values")
|
296 |
vertical_diff_bboxes = abs(char_box[1] - current_y)
|
297 |
horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
|
298 |
#print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
|
@@ -303,9 +330,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
303 |
):
|
304 |
old_right_pos = current_box[2]
|
305 |
current_box[2] = char_box[2]
|
306 |
-
|
307 |
-
print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
|
308 |
-
|
309 |
else:
|
310 |
merged_bounding_boxes.append(
|
311 |
{"boundingBox": current_box, "result": result})
|
@@ -324,13 +348,17 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
324 |
combined_analyzer_results.extend(analyzer_results)
|
325 |
|
326 |
if len(analyzer_results) > 0:
|
|
|
|
|
327 |
# Create summary df of annotations to be made
|
328 |
analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
|
329 |
analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
330 |
analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
|
331 |
analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
|
332 |
analyzed_bounding_boxes_df_new['page'] = page_num + 1
|
333 |
-
analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
|
|
|
|
|
334 |
|
335 |
for analyzed_bounding_box in analyzed_bounding_boxes:
|
336 |
bounding_box = analyzed_bounding_box["boundingBox"]
|
@@ -352,11 +380,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
352 |
|
353 |
annotations_all_pages.extend([annotations_on_page])
|
354 |
|
355 |
-
print("For page number:
|
356 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
357 |
|
358 |
page_num += 1
|
359 |
-
|
360 |
-
analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
|
361 |
|
362 |
-
return pdf
|
|
|
1 |
+
from PIL import Image, ImageChops, ImageDraw
|
2 |
from typing import List
|
3 |
import pandas as pd
|
4 |
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
|
|
|
14 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
15 |
from tools.helper_functions import get_file_path_end, output_folder
|
16 |
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
|
17 |
+
from tools.data_anonymise import generate_decision_process_output
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
25 |
+
# If this is the first time around, set variables to 0/blank
|
26 |
+
if first_loop_state==True:
|
27 |
+
latest_file_completed = 0
|
28 |
+
out_message = []
|
29 |
+
out_file_paths = []
|
30 |
+
|
31 |
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
32 |
if isinstance(out_message, str):
|
33 |
out_message = [out_message]
|
|
|
51 |
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
52 |
|
53 |
|
54 |
+
#print("File paths:", file_paths)
|
55 |
|
56 |
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
57 |
file_path = file.name
|
58 |
|
59 |
if file_path:
|
60 |
file_path_without_ext = get_file_path_end(file_path)
|
61 |
+
is_a_pdf = is_pdf(file_path) == True
|
62 |
+
if is_a_pdf == False:
|
63 |
# If user has not submitted a pdf, assume it's an image
|
64 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
65 |
in_redact_method = "Image analysis"
|
|
|
73 |
# if is_pdf_or_image(file_path) == False:
|
74 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
75 |
|
76 |
+
print("Redacting file as image-based file")
|
77 |
+
pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
|
78 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
79 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
80 |
|
81 |
out_file_paths.append(out_image_file_path)
|
82 |
+
out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file")
|
83 |
+
|
84 |
+
output_logs_str = str(output_logs)
|
85 |
+
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
86 |
+
with open(logs_output_file_name, "w") as f:
|
87 |
+
f.write(output_logs_str)
|
88 |
+
out_file_paths.append(logs_output_file_name)
|
89 |
|
90 |
# Increase latest file completed count unless we are at the last file
|
91 |
if latest_file_completed != len(file_paths):
|
|
|
98 |
|
99 |
# Analyse text-based pdf
|
100 |
print('Redacting file as text-based PDF')
|
101 |
+
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
102 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
103 |
pdf_text.save(out_text_file_path)
|
104 |
|
105 |
#out_file_paths.append(out_text_file_path)
|
106 |
+
out_message_new = "File " + file_path_without_ext + " successfully redacted"
|
107 |
out_message.append(out_message_new)
|
108 |
|
109 |
# Convert message
|
|
|
115 |
img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
|
116 |
out_file_paths.extend(img_output_file_path)
|
117 |
|
118 |
+
output_logs_str = str(output_logs)
|
119 |
+
logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
|
120 |
+
with open(logs_output_file_name, "w") as f:
|
121 |
+
f.write(output_logs_str)
|
122 |
+
out_file_paths.append(logs_output_file_name)
|
123 |
+
|
124 |
# Add confirmation for converting to image if you want
|
125 |
# out_message.append(img_output_summary)
|
126 |
|
|
|
158 |
merged_box = group[0]
|
159 |
for next_box in group[1:]:
|
160 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
161 |
+
#print("Merging a box")
|
162 |
# Calculate new dimensions for the merged box
|
163 |
new_left = min(merged_box.left, next_box.left)
|
164 |
new_top = min(merged_box.top, next_box.top)
|
|
|
174 |
merged_bboxes.append(merged_box)
|
175 |
return merged_bboxes
|
176 |
|
177 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
|
178 |
'''
|
179 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
180 |
'''
|
|
|
181 |
|
182 |
fill = (0, 0, 0)
|
183 |
|
184 |
if not image_paths:
|
|
|
185 |
out_message = "PDF does not exist as images. Converting pages to image"
|
186 |
print(out_message)
|
187 |
#progress(0, desc=out_message)
|
|
|
198 |
#for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
199 |
for i in range(0, number_of_pages):
|
200 |
|
201 |
+
print("Redacting page", str(i + 1))
|
202 |
|
203 |
# Get the image to redact using PIL lib (pillow)
|
204 |
+
#print("image_paths:", image_paths)
|
205 |
|
206 |
+
image = ImageChops.duplicate(image_paths[i])
|
207 |
|
208 |
# %%
|
209 |
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
|
|
218 |
"allow_list": allow_list,
|
219 |
"language": language,
|
220 |
"entities": chosen_redact_entities,
|
221 |
+
"score_threshold": score_threshold,
|
222 |
+
"return_decision_process":True,
|
223 |
})
|
224 |
|
225 |
+
# Text placeholder in this processing step, as the analyze method does not return the OCR text
|
226 |
+
if bboxes:
|
227 |
+
decision_process_output_str = str(bboxes)
|
228 |
+
print("Decision process:", decision_process_output_str)
|
229 |
+
|
230 |
#print("For page: ", str(i), "Bounding boxes: ", bboxes)
|
231 |
|
232 |
draw = ImageDraw.Draw(image)
|
233 |
|
234 |
merged_bboxes = merge_img_bboxes(bboxes)
|
235 |
|
236 |
+
#print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
|
237 |
|
238 |
# 3. Draw the merged boxes (unchanged)
|
239 |
for box in merged_bboxes:
|
|
|
245 |
|
246 |
images.append(image)
|
247 |
|
248 |
+
return images, decision_process_output_str
|
249 |
|
250 |
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
251 |
'''
|
|
|
266 |
|
267 |
#for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
|
268 |
for page in pdf.pages:
|
269 |
+
print("Page number is:", page_num + 1)
|
270 |
|
271 |
annotations_on_page = []
|
272 |
analyzed_bounding_boxes = []
|
|
|
285 |
language=language,
|
286 |
entities=chosen_redact_entities,
|
287 |
score_threshold=score_threshold,
|
288 |
+
return_decision_process=True,
|
289 |
allow_list=allow_list)
|
290 |
+
|
291 |
+
|
292 |
+
|
293 |
|
294 |
characters = [char # This is what we want to include in the list
|
295 |
for line in text_container # Loop through each line in text_container
|
|
|
319 |
current_box = char_box
|
320 |
current_y = char_box[1]
|
321 |
else: # Now we have previous values to compare
|
322 |
+
#print("Comparing values")
|
323 |
vertical_diff_bboxes = abs(char_box[1] - current_y)
|
324 |
horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
|
325 |
#print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
|
|
|
330 |
):
|
331 |
old_right_pos = current_box[2]
|
332 |
current_box[2] = char_box[2]
|
|
|
|
|
|
|
333 |
else:
|
334 |
merged_bounding_boxes.append(
|
335 |
{"boundingBox": current_box, "result": result})
|
|
|
348 |
combined_analyzer_results.extend(analyzer_results)
|
349 |
|
350 |
if len(analyzer_results) > 0:
|
351 |
+
#decision_process_output_str = generate_decision_process_output(analyzer_results, {'text':text_to_analyze})
|
352 |
+
#print("Decision process:", decision_process_output_str)
|
353 |
# Create summary df of annotations to be made
|
354 |
analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
|
355 |
analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
356 |
analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
|
357 |
analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
|
358 |
analyzed_bounding_boxes_df_new['page'] = page_num + 1
|
359 |
+
analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
360 |
+
|
361 |
+
print('analyzed_bounding_boxes_df:', analyzed_bounding_boxes_df)
|
362 |
|
363 |
for analyzed_bounding_box in analyzed_bounding_boxes:
|
364 |
bounding_box = analyzed_bounding_box["boundingBox"]
|
|
|
380 |
|
381 |
annotations_all_pages.extend([annotations_on_page])
|
382 |
|
383 |
+
print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
|
384 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
385 |
|
386 |
page_num += 1
|
|
|
|
|
387 |
|
388 |
+
return pdf, analyzed_bounding_boxes_df
|
tools/helper_functions.py
CHANGED
@@ -139,6 +139,23 @@ def add_folder_to_path(folder_path: str):
|
|
139 |
else:
|
140 |
print(f"Folder not found at {folder_path} - not added to PATH")
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
async def get_connection_params(request: gr.Request):
|
143 |
base_folder = ""
|
144 |
|
|
|
139 |
else:
|
140 |
print(f"Folder not found at {folder_path} - not added to PATH")
|
141 |
|
142 |
+
# Upon running a process, the feedback buttons are revealed
def reveal_feedback_buttons():
    """Return component updates that unhide the feedback radio, textbox and button."""
    feedback_choice = gr.Radio(visible=True)
    feedback_comment = gr.Textbox(visible=True)
    feedback_submit = gr.Button(visible=True)
    return feedback_choice, feedback_comment, feedback_submit
|
145 |
+
|
146 |
+
def wipe_logs(feedback_logs_loc, usage_logs_loc):
    """
    Delete the feedback and usage log files, reporting (not raising) failures.

    Each removal is attempted independently, so failing to delete one file
    (e.g. it was never created) does not prevent deleting the other.

    Args:
        feedback_logs_loc (str): Path to the feedback log file.
        usage_logs_loc (str): Path to the usage log file.

    Returns:
        None
    """
    # os.remove can only fail with OS-level errors (missing file, bad
    # permissions, ...), so catch OSError rather than a blanket Exception
    # that would also swallow unrelated programming errors.
    try:
        os.remove(feedback_logs_loc)
    except OSError as e:
        print("Could not remove feedback logs file", e)

    try:
        os.remove(usage_logs_loc)
    except OSError as e:
        print("Could not remove usage logs file", e)
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
async def get_connection_params(request: gr.Request):
|
160 |
base_folder = ""
|
161 |
|