seanpedrickcase committed
Commit 8c33828
Parent: 01c88c0

Decision process now saved as log files. Other log files and feedback added

app.py CHANGED
@@ -3,7 +3,8 @@ import os
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs
+from tools.aws_functions import upload_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
 from tools.data_anonymise import anonymise_data_files
@@ -29,9 +30,15 @@ with app:
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
     text_output_file_list_state = gr.State([])
+    first_loop_state = gr.State(True)
+    second_loop_state = gr.State(False)
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
+    feedback_logs_state = gr.State('feedback/log.csv')
+    feedback_s3_logs_loc_state = gr.State('feedback/')
+    usage_logs_state = gr.State('logs/log.csv')
+    usage_s3_logs_loc_state = gr.State('logs/')
 
     gr.Markdown(
     """
@@ -39,9 +46,9 @@ with app:
 
    Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
 
-   WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
+   WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
-   Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
+   This app accepts a maximum file size of 10mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
    """)
 
     with gr.Tab("PDFs/images"):
@@ -57,6 +64,15 @@ with app:
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
+
+        with gr.Row():
+            pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
+        with gr.Row():
+            pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+            pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
+        with gr.Row():
+            s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
@@ -73,13 +89,20 @@ with app:
 
         in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
 
-        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
+        tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
 
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
             text_output_file = gr.File(label="Output files")
             text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
 
+        with gr.Row():
+            data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
+                choices=["The results were good", "The results were not good"], visible=False)
+        with gr.Row():
+            data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+            data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
         """
@@ -111,44 +134,55 @@ with app:
 
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
-
-    callback = gr.CSVLogger()
 
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
-                    outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
 
     # If the output file count text box changes, keep going with redacting each document until done
-    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
-                    outputs=[output_summary, prepared_pdf_state]).\
-        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
+    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
+        then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
 
     # Tabular data redaction
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
+        then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
 
+    #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
+    #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
-    # This needs to be called at some point prior to the first call to callback.flag()
+    # Log usernames and times of access to file (to know who is using the app when running on AWS)
+    callback = gr.CSVLogger()
     callback.setup([session_hash_textbox], "logs")
-
-    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
+    # User submitted feedback for pdf redactions
+    pdf_callback = gr.CSVLogger()
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], "feedback")
+    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+    # User submitted feedback for data redactions
+    data_callback = gr.CSVLogger()
+    data_callback.setup([data_feedback_radio, data_further_details_text], "feedback")
+    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
-
     if os.environ['COGNITO_AUTH'] == "1":
-        app.queue().launch(show_error=True, auth=authenticate_user)
+        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='10mb')
     else:
-        app.queue().launch(show_error=True, inbrowser=True)
+        app.queue().launch(show_error=True, inbrowser=True, max_file_size='10mb')
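
Side note on the pattern above: the usage log and both feedback forms rely on Gradio's flagging API, where a CSVLogger is set up against a list of components and flag() appends their current values as a CSV row. A minimal, self-contained sketch of that pattern, assuming Gradio's built-in gr.CSVLogger (component names here are hypothetical):

import gradio as gr

with gr.Blocks() as demo:
    feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"])
    feedback_btn = gr.Button("Submit feedback")

    # setup() registers the components and the log folder;
    # flag() then appends one CSV row per call (here, feedback/log.csv)
    callback = gr.CSVLogger()
    callback.setup([feedback_radio], "feedback")
    # preprocess=False passes the raw component values straight to the logger
    feedback_btn.click(lambda *args: callback.flag(list(args)),
                       [feedback_radio], None, preprocess=False)

demo.launch()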
tools/aws_functions.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Type
+from typing import Type, List
 import pandas as pd
 import boto3
 import tempfile
@@ -6,12 +6,11 @@ import os
 from tools.helper_functions import get_or_create_env_var
 
 PandasDataFrame = Type[pd.DataFrame]
-bucket_name=""
 
 # Get AWS credentials if required
-
+bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
-aws_var_default = "0"
+aws_var_default = "1"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')
 
@@ -156,4 +155,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
         out_message = "No password provided. Please ask the data team for access if you need this."
         print(out_message)
 
-    return files, out_message
+    return files, out_message
+
+def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+    """
+    Uploads a file from local machine to Amazon S3.
+
+    Args:
+    - local_file_path: Local file path(s) of the file(s) to upload.
+    - s3_key: Key (path) to the file in the S3 bucket.
+    - s3_bucket: Name of the S3 bucket.
+
+    Returns:
+    - Message as variable/printed to console
+    """
+    final_out_message = []
+
+    s3_client = boto3.client('s3')
+
+    if isinstance(local_file_paths, str):
+        local_file_paths = [local_file_paths]
+
+    for file in local_file_paths:
+        try:
+            # Get file name off file path
+            file_name = os.path.basename(file)
+
+            s3_key_full = s3_key + file_name
+            print("S3 key: ", s3_key_full)
+
+            s3_client.upload_file(file, s3_bucket, s3_key_full)
+            out_message = "File " + file_name + " uploaded successfully to S3!"
+            print(out_message)
+
+        except Exception as e:
+            out_message = f"Error uploading file(s) to S3: {e}"
+            print(out_message)
+
+        final_out_message.append(out_message)
+    final_out_message_str = '\n'.join(final_out_message)
+
+    return final_out_message_str
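
For reference, the new upload_file_to_s3 helper accepts a single path or a list of paths and builds each S3 object key by appending the file's base name to the supplied prefix. A hypothetical call (bucket name and paths are placeholders; boto3 must be able to find AWS credentials in the environment):

# bucket_name defaults to "" above, so pass the bucket explicitly
status = upload_file_to_s3(local_file_paths=["feedback/log.csv"],
                           s3_key="feedback/",
                           s3_bucket="my-example-bucket")
print(status)  # one success/error line per file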
tools/data_anonymise.py CHANGED
@@ -5,11 +5,10 @@ import time
 import pandas as pd
 
 from faker import Faker
-
 from gradio import Progress
-from typing import List
+from typing import List, Dict, Any
 
-from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
@@ -24,6 +23,76 @@ fake = Faker("en_UK")
 def fake_first_name(x):
     return fake.first_name()
 
+# Writing decision making process to file
+def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
+    """
+    Generate a detailed output of the decision process for entity recognition.
+
+    This function takes the results from the analyzer and the original data dictionary,
+    and produces a string output detailing the decision process for each recognized entity.
+    It includes information such as entity type, position, confidence score, and the context
+    in which the entity was found.
+
+    Args:
+        analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
+        df_dict (Dict[str, List[Any]]): The original data in dictionary format.
+
+    Returns:
+        str: A string containing the detailed decision process output.
+    """
+    decision_process_output = []
+    keys_to_keep = ['entity_type', 'start', 'end']
+
+    def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
+        output = []
+
+        if hasattr(result, 'value'):
+            text = result.value[data_row]
+        else:
+            text = ""
+
+        if isinstance(recognizer_result, list):
+            for sub_result in recognizer_result:
+                if isinstance(text, str):
+                    found_text = text[sub_result.start:sub_result.end]
+                else:
+                    found_text = ''
+                analysis_explanation = {key: sub_result.__dict__[key] for key in keys_to_keep}
+                analysis_explanation.update({
+                    'data_row': str(data_row),
+                    'column': list(df_dict.keys())[dictionary_key],
+                    'entity': found_text
+                })
+                output.append(str(analysis_explanation))
+
+        return output
+
+    #print("Analyser results:", analyzer_results)
+
+    # Run through each column to analyse for PII
+    for i, result in enumerate(analyzer_results):
+        print("Looking at result:", str(i))
+
+        # If a single result
+        if isinstance(result, RecognizerResult):
+            decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
+
+        # If a list of results
+        elif isinstance(result, List):
+            for x, recognizer_result in enumerate(result.recognizer_results):
+                decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
+
+        else:
+            try:
+                decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
+            except Exception as e:
+                print(e)
+
+    decision_process_output_str = '\n'.join(decision_process_output)
+
+    return decision_process_output_str
+
 def anon_consistent_names(df):
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")
@@ -118,6 +187,9 @@ def anon_consistent_names(df):
 
 def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
 
+    print("Identifying personal information")
+    analyse_tic = time.perf_counter()
+
     key_string = ""
 
     # DataFrame to dict
@@ -133,34 +205,26 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
 
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
 
-    # analyzer_results = batch_analyzer.analyze_dict(df_dict, language=language,
-    #                                                entities=chosen_redact_entities,
-    #                                                score_threshold=score_threshold,
-    #                                                return_decision_process=False,
-    #                                                in_allow_list=in_allow_list_flat)
-
-    print("Identifying personal information")
-    analyse_tic = time.perf_counter()
-
-    print("Allow list:", in_allow_list)
+    #print("Allow list:", in_allow_list)
+    #print("Input data keys:", df_dict.keys())
 
     # Use custom analyzer to be able to track progress with Gradio
     analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                     entities=chosen_redact_entities,
                                     score_threshold=score_threshold,
-                                    return_decision_process=False,
+                                    return_decision_process=True,
                                     allow_list=in_allow_list_flat)
+
     analyzer_results = list(analyzer_results)
-    #analyzer_results
+
+    # Usage in the main function:
+    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
 
     analyse_toc = time.perf_counter()
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
 
-
-
     # Create faker function (note that it has to receive a value)
-
     fake = Faker("en_UK")
 
     def fake_first_name(x):
@@ -197,7 +261,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
 
-    return scrubbed_df, key_string
+    return scrubbed_df, key_string, decision_process_output_str
 
 def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
@@ -238,7 +302,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
     anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
 
     # Anonymise the selected columns
-    anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
+    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
 
     # Rejoin the dataframe together
     anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -261,11 +325,20 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
         # Write each DataFrame to a different worksheet.
         anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
 
+        decision_process_log_output_file = anon_xlsx_export_file_name + "decision_process_output.txt"
+        with open(decision_process_log_output_file, "w") as f:
+            f.write(decision_process_output_str)
+
     else:
         anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
         anon_df_out.to_csv(anon_export_file_name, index = None)
 
+        decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
+        with open(decision_process_log_output_file, "w") as f:
+            f.write(decision_process_output_str)
+
     out_file_paths.append(anon_export_file_name)
+    out_file_paths.append(decision_process_log_output_file)
 
     # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
     out_file_paths = list(set(out_file_paths))
@@ -276,10 +349,16 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
 
     return out_file_paths, out_message, key_string
 
-def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str):
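
The return_decision_process=True flag switched on above is a standard presidio-analyzer option: each RecognizerResult then carries an analysis_explanation recording which recognizer fired and why, which is the raw material the new generate_decision_process_output log is built from. A minimal sketch against the plain AnalyzerEngine rather than the app's analyze_dict/BatchAnalyzerEngine wrapper:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(text="My name is David and I live in Miami",
                           language="en",
                           return_decision_process=True)
for res in results:
    # entity_type/start/end mirror the keys_to_keep fields written to the log file
    print(res.entity_type, res.start, res.end, res.score)
    print(res.analysis_explanation.textual_explanation)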
tools/file_conversion.py CHANGED
@@ -3,7 +3,7 @@ from tools.helper_functions import get_file_path_end, output_folder
 from PIL import Image
 import os
 from gradio import Progress
-from typing import List
+from typing import List, Optional
 
 def is_pdf_or_image(filename):
     """
@@ -55,6 +55,7 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
 
         # If no images are returned, break the loop
         if not image:
+            print("Conversion of page", str(page_num), "to file failed.")
             break
 
         images.extend(image)
@@ -74,6 +75,7 @@ def process_file(file_path):
         print(f"{file_path} is an image file.")
         # Perform image processing here
         img_object = [Image.open(file_path)]
+        # Load images from the file paths
 
     # Check if the file is a PDF
     elif file_extension == '.pdf':
@@ -85,37 +87,79 @@ def process_file(file_path):
         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
 
-    # print('Image object is:', img_object)
+    print('Image object is:', img_object)
 
     return img_object
 
-def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
+
+
+def prepare_image_or_text_pdf(
+    file_paths: List[str],
+    in_redact_method: str,
+    in_allow_list: Optional[List[List[str]]] = None,
+    latest_file_completed: int = 0,
+    out_message: List[str] = [],
+    first_loop_state: bool = False,
+    progress: Progress = Progress(track_tqdm=True)
+) -> tuple[List[str], List[str]]:
+    """
+    Prepare and process image or text PDF files for redaction.
+
+    This function takes a list of file paths, processes each file based on the specified redaction method,
+    and returns the output messages and processed file paths.
+
+    Args:
+        file_paths (List[str]): List of file paths to process.
+        in_redact_method (str): The redaction method to use.
+        in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
+        latest_file_completed (int): Index of the last completed file.
+        out_message (List[str]): List to store output messages.
+        first_loop_state (bool): Flag indicating if this is the first iteration.
+        progress (Progress): Progress tracker for the operation.
+
+    Returns:
+        tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
+    """
 
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
     #    out_message = [out_message]
 
+
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+    else:
+        print("Now attempting file:", str(latest_file_completed + 1))
+        out_file_paths = []
+
     if not file_paths:
         file_paths = []
 
-    out_file_paths = file_paths
+    #out_file_paths = file_paths
 
     latest_file_completed = int(latest_file_completed)
 
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed == len(out_file_paths):
+    if latest_file_completed == len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
         #final_out_message = '\n'.join(out_message)
         return out_message, out_file_paths
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
-    file_paths_loop = [out_file_paths[int(latest_file_completed)]]
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
+    print("file_paths_loop:", str(file_paths_loop))
 
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
         file_path = file.name
 
+        print("file_path:", file_path)
+
         #if file_path:
         #    file_path_without_ext = get_file_path_end(file_path)
         if not file_path:
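
The first_loop_state flag added to this signature (and to the other processing functions) drives a counter-based loop across uploaded files: the button click runs once with first_loop_state=True to reset state, the function bumps a Number component when a file finishes, and that component's .change event re-runs the function with second_loop_state (False) until the counter equals the file count. A stripped-down sketch of the pattern with hypothetical names (the real app ends the loop by returning the counter unchanged, as here):

import gradio as gr

def process_next(files, files_done, first_loop):
    if first_loop:                      # button click: reset the loop
        files_done = 0
    files_done = int(files_done)
    if not files or files_done == len(files):
        return "All files processed", files_done   # unchanged counter ends the loop
    print("Processing:", files[files_done])        # the app reads file.name on each entry
    return f"Finished file {files_done + 1}", files_done + 1

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    run_btn = gr.Button("Run")
    status = gr.Textbox()
    files_done = gr.Number(value=0)
    first_loop_state = gr.State(True)
    second_loop_state = gr.State(False)

    run_btn.click(process_next, [in_files, files_done, first_loop_state], [status, files_done])
    # Each change to files_done re-triggers processing for the next file
    files_done.change(process_next, [in_files, files_done, second_loop_state], [status, files_done])

demo.launch()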
tools/file_redaction.py CHANGED
@@ -1,4 +1,4 @@
-from PIL import Image
+from PIL import Image, ImageChops, ImageDraw
 from typing import List
 import pandas as pd
 from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
@@ -14,13 +14,20 @@ from collections import defaultdict # For efficient grouping
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
 from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
+from tools.data_anonymise import generate_decision_process_output
 import gradio as gr
 
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
     # If out message is string or out_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str):
         out_message = [out_message]
@@ -44,14 +51,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
 
-    print("File paths:", file_paths)
+    #print("File paths:", file_paths)
 
    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
        file_path = file.name
 
        if file_path:
            file_path_without_ext = get_file_path_end(file_path)
-           if is_pdf(file_path) == False:
+           is_a_pdf = is_pdf(file_path) == True
+           if is_a_pdf == False:
                # If user has not submitted a pdf, assume it's an image
                print("File is not a pdf, assuming that image analysis needs to be used.")
                in_redact_method = "Image analysis"
@@ -65,13 +73,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
            # if is_pdf_or_image(file_path) == False:
            #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
 
-           print("Redacting file as image-based pdf")
-           pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
+           print("Redacting file as image-based file")
+           pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
 
            out_file_paths.append(out_image_file_path)
-           out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
+           out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file")
+
+           output_logs_str = str(output_logs)
+           logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
+           with open(logs_output_file_name, "w") as f:
+               f.write(output_logs_str)
+           out_file_paths.append(logs_output_file_name)
 
            # Increase latest file completed count unless we are at the last file
            if latest_file_completed != len(file_paths):
@@ -84,12 +98,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 
            # Analyse text-based pdf
            print('Redacting file as text-based PDF')
-           pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
+           pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
            out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
            pdf_text.save(out_text_file_path)
 
            #out_file_paths.append(out_text_file_path)
-           out_message_new = "File " + file_path_without_ext + " successfully redacted."
+           out_message_new = "File " + file_path_without_ext + " successfully redacted"
            out_message.append(out_message_new)
 
            # Convert message
@@ -101,6 +115,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
            img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
            out_file_paths.extend(img_output_file_path)
 
+           output_logs_str = str(output_logs)
+           logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
+           with open(logs_output_file_name, "w") as f:
+               f.write(output_logs_str)
+           out_file_paths.append(logs_output_file_name)
+
            # Add confirmation for converting to image if you want
            # out_message.append(img_output_summary)
 
@@ -138,7 +158,7 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
-                print("Merging a box")
+                #print("Merging a box")
                 # Calculate new dimensions for the merged box
                 new_left = min(merged_box.left, next_box.left)
                 new_top = min(merged_box.top, next_box.top)
@@ -154,16 +174,14 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
         merged_bboxes.append(merged_box)
     return merged_bboxes
 
-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
     '''
     Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
-    from PIL import Image, ImageChops, ImageDraw
 
     fill = (0, 0, 0)
 
     if not image_paths:
-
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
         #progress(0, desc=out_message)
@@ -180,12 +198,12 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
     for i in range(0, number_of_pages):
 
-        print("Redacting page ", str(i + 1))
+        print("Redacting page", str(i + 1))
 
         # Get the image to redact using PIL lib (pillow)
-        image = image_paths[i] #Image.open(image_paths[i])
+        #print("image_paths:", image_paths)
 
-        image = ImageChops.duplicate(image)
+        image = ImageChops.duplicate(image_paths[i])
 
         # %%
         image_analyser = ImageAnalyzerEngine(nlp_analyser)
@@ -200,16 +218,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             "allow_list": allow_list,
             "language": language,
             "entities": chosen_redact_entities,
-            "score_threshold": score_threshold
+            "score_threshold": score_threshold,
+            "return_decision_process":True,
         })
 
+        # Text placeholder in this processing step, as the analyze method does not return the OCR text
+        if bboxes:
+            decision_process_output_str = str(bboxes)
+            print("Decision process:", decision_process_output_str)
+
         #print("For page: ", str(i), "Bounding boxes: ", bboxes)
 
         draw = ImageDraw.Draw(image)
 
         merged_bboxes = merge_img_bboxes(bboxes)
 
-        print("For page: ", str(i), "Merged bounding boxes: ", merged_bboxes)
+        #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
 
         # 3. Draw the merged boxes (unchanged)
         for box in merged_bboxes:
@@ -221,7 +245,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 
         images.append(image)
 
-    return images
+    return images, decision_process_output_str
 
 def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
@@ -242,7 +266,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
 
     #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
     for page in pdf.pages:
-        print("Page number is: ", page_num + 1)
+        print("Page number is:", page_num + 1)
 
         annotations_on_page = []
         analyzed_bounding_boxes = []
@@ -261,8 +285,11 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                 language=language,
                 entities=chosen_redact_entities,
                 score_threshold=score_threshold,
-                return_decision_process=False,
+                return_decision_process=True,
                 allow_list=allow_list)
+
+
+
 
             characters = [char # This is what we want to include in the list
                 for line in text_container # Loop through each line in text_container
@@ -292,7 +319,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                     current_box = char_box
                     current_y = char_box[1]
                 else: # Now we have previous values to compare
-                    print("Comparing values")
+                    #print("Comparing values")
                     vertical_diff_bboxes = abs(char_box[1] - current_y)
                     horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
                     #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
@@ -303,9 +330,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                     ):
                         old_right_pos = current_box[2]
                         current_box[2] = char_box[2]
-
-                        print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
-
                     else:
                         merged_bounding_boxes.append(
                             {"boundingBox": current_box, "result": result})
@@ -324,13 +348,17 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             combined_analyzer_results.extend(analyzer_results)
 
             if len(analyzer_results) > 0:
+                #decision_process_output_str = generate_decision_process_output(analyzer_results, {'text':text_to_analyze})
+                #print("Decision process:", decision_process_output_str)
                 # Create summary df of annotations to be made
                 analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
                 analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
                 analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
                 analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
                 analyzed_bounding_boxes_df_new['page'] = page_num + 1
-                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
+                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
+
+                print('analyzed_bounding_boxes_df:', analyzed_bounding_boxes_df)
 
             for analyzed_bounding_box in analyzed_bounding_boxes:
                 bounding_box = analyzed_bounding_box["boundingBox"]
@@ -352,11 +380,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
 
         annotations_all_pages.extend([annotations_on_page])
 
-        print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
+        print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
        page.Annots = pdf.make_indirect(annotations_on_page)
 
        page_num += 1
-
-    analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
 
-    return pdf
+    return pdf, analyzed_bounding_boxes_df
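
For orientation, the image branch above wraps presidio-image-redactor: ImageAnalyzerEngine.analyze() OCRs the page image and returns ImageRecognizerResult objects with pixel coordinates, which are then merged and painted over with PIL. A minimal sketch, assuming presidio-image-redactor with a working Tesseract OCR install (file names are hypothetical; the app passes its own nlp_analyser to the engine):

from PIL import Image, ImageDraw
from presidio_image_redactor import ImageAnalyzerEngine

image = Image.open("page_1.png")
image_analyser = ImageAnalyzerEngine()

# Extra keyword arguments are forwarded to the underlying text analyzer
bboxes = image_analyser.analyze(image, language="en",
                                entities=["PERSON", "PHONE_NUMBER"],
                                score_threshold=0.5)

draw = ImageDraw.Draw(image)
for box in bboxes:
    # Each result carries the pixel coordinates of a detected entity
    draw.rectangle([box.left, box.top, box.left + box.width, box.top + box.height],
                   fill=(0, 0, 0))
image.save("page_1_redacted.png")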
tools/helper_functions.py CHANGED
@@ -139,6 +139,23 @@ def add_folder_to_path(folder_path: str):
     else:
         print(f"Folder not found at {folder_path} - not added to PATH")
 
+# Upon running a process, the feedback buttons are revealed
+def reveal_feedback_buttons():
+    return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True)
+
+def wipe_logs(feedback_logs_loc, usage_logs_loc):
+    try:
+        os.remove(feedback_logs_loc)
+    except Exception as e:
+        print("Could not remove feedback logs file", e)
+    try:
+        os.remove(usage_logs_loc)
+    except Exception as e:
+        print("Could not remove usage logs file", e)
+
+
+
+
 async def get_connection_params(request: gr.Request):
     base_folder = ""
161