seanpedrickcase committed on
Commit
34addbf
1 Parent(s): 230fcc3

Enhanced usage logging. Added a small vertical buffer to the redaction rectangles, as they often missed the tops of the text.

Browse files
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
@@ -24,21 +25,11 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
24
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
25
  language = 'en'
26
 
27
- feedback_data_folder = 'feedback/' + today_rev + '/'
28
- logs_data_folder = 'logs/' + today_rev + '/'
29
 
30
- def create_logs_folder(session_hash_textbox):
31
- print("session_hash_textbox", session_hash_textbox)
32
-
33
- feedback_data_folder = 'feedback/' + session_hash_textbox + "/" + today_rev + '/'
34
- logs_data_folder = 'logs/' + session_hash_textbox + "/" + today_rev + '/'
35
-
36
- feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
37
- feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
38
- usage_logs_state = gr.State(logs_data_folder + 'log.csv')
39
- usage_s3_logs_loc_state = gr.State(logs_data_folder)
40
-
41
- return feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state
42
 
43
  # Create the gradio interface
44
  app = gr.Blocks(theme = gr.themes.Base())
@@ -56,13 +47,13 @@ with app:
56
  session_hash_state = gr.State()
57
  s3_output_folder_state = gr.State()
58
 
59
-
60
-
61
- feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
62
- feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
63
- usage_logs_state = gr.State(logs_data_folder + 'log.csv')
64
- usage_s3_logs_loc_state = gr.State(logs_data_folder)
65
-
66
 
67
  gr.Markdown(
68
  """
@@ -96,6 +87,8 @@ with app:
96
 
97
  with gr.Row():
98
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 
99
 
100
  with gr.Tab(label="Open text or Excel/csv files"):
101
  gr.Markdown(
@@ -148,7 +141,7 @@ with app:
148
  # Invisible text box to hold the session hash/username just for logging purposes
149
  session_hash_textbox = gr.Textbox(value="", visible=False)
150
 
151
- # AWS options - not yet implemented
152
  # with gr.Tab(label="Advanced options"):
153
  # with gr.Accordion(label = "AWS data access", open = True):
154
  # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
@@ -163,13 +156,13 @@ with app:
163
 
164
  # Document redaction
165
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
166
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
167
- outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
168
 
169
  # If the output file count text box changes, keep going with redacting each document until done
170
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
171
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
172
- outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
173
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
174
 
175
  # Tabular data redaction
@@ -181,31 +174,33 @@ with app:
181
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
182
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
183
 
184
- #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
185
- # then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
186
-
187
-
188
- app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
189
- #then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
190
 
191
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
192
  callback = gr.CSVLogger()
193
- callback.setup([session_hash_textbox], logs_data_folder)
194
  session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
195
- then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
196
 
197
  # User submitted feedback for pdf redactions
198
  pdf_callback = gr.CSVLogger()
199
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_data_folder)
200
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
201
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
202
 
203
  # User submitted feedback for data redactions
204
  data_callback = gr.CSVLogger()
205
- data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_data_folder)
206
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
207
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
208
 
 
 
 
 
 
 
209
  # Launch the Gradio app
210
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
211
  print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
1
  import os
2
+ import socket
3
 
4
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
5
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
25
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
26
  language = 'en'
27
 
28
+ host_name = socket.gethostname()
 
29
 
30
+ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
31
+ access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
32
+ usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
 
 
 
 
 
 
 
 
 
33
 
34
  # Create the gradio interface
35
  app = gr.Blocks(theme = gr.themes.Base())
 
47
  session_hash_state = gr.State()
48
  s3_output_folder_state = gr.State()
49
 
50
+ # Logging state
51
+ feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
52
+ feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
53
+ access_logs_state = gr.State(access_logs_folder + 'log.csv')
54
+ access_s3_logs_loc_state = gr.State(access_logs_folder)
55
+ usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
56
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)
57
 
58
  gr.Markdown(
59
  """
 
87
 
88
  with gr.Row():
89
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
90
+ # This keeps track of the time taken to redact files for logging purposes.
91
+ estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
92
 
93
  with gr.Tab(label="Open text or Excel/csv files"):
94
  gr.Markdown(
 
141
  # Invisible text box to hold the session hash/username just for logging purposes
142
  session_hash_textbox = gr.Textbox(value="", visible=False)
143
 
144
+ # AWS options - placeholder for possibility of storing data on s3
145
  # with gr.Tab(label="Advanced options"):
146
  # with gr.Accordion(label = "AWS data access", open = True):
147
  # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
 
156
 
157
  # Document redaction
158
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
159
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number],
160
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number], api_name="redact_doc")
161
 
162
  # If the output file count text box changes, keep going with redacting each document until done
163
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
164
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number],
165
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number]).\
166
  then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
167
 
168
  # Tabular data redaction
 
174
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
175
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
176
 
177
+ # Get connection details on app load
178
+ app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
 
 
 
179
 
180
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
181
  callback = gr.CSVLogger()
182
+ callback.setup([session_hash_textbox], access_logs_folder)
183
  session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
184
+ then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
185
 
186
  # User submitted feedback for pdf redactions
187
  pdf_callback = gr.CSVLogger()
188
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_logs_folder)
189
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
190
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
191
 
192
  # User submitted feedback for data redactions
193
  data_callback = gr.CSVLogger()
194
+ data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_logs_folder)
195
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
196
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
197
 
198
+ # Log the session hash, input files and estimated processing time whenever the time estimate changes
199
+ usage_callback = gr.CSVLogger()
200
+ usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number], usage_logs_folder)
201
+ estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number], None, preprocess=False).\
202
+ then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
203
+
204
  # Launch the Gradio app
205
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
206
  print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
tools/aws_functions.py CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
10
  # Get AWS credentials if required
11
  bucket_name=""
12
  aws_var = "RUN_AWS_FUNCTIONS"
13
- aws_var_default = "0"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
 
10
  # Get AWS credentials if required
11
  bucket_name=""
12
  aws_var = "RUN_AWS_FUNCTIONS"
13
+ aws_var_default = "1"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
tools/file_conversion.py CHANGED
@@ -2,6 +2,7 @@ from pdf2image import convert_from_path, pdfinfo_from_path
2
  from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
3
  from PIL import Image
4
  import os
 
5
  from gradio import Progress
6
  from typing import List, Optional
7
 
@@ -62,6 +63,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
62
  # print("Conversion of page", str(page_num), "to file succeeded.")
63
  # print("image:", image)
64
 
 
 
65
  images.extend(image)
66
 
67
  print("PDF has been converted to images.")
@@ -122,6 +125,8 @@ def prepare_image_or_text_pdf(
122
  tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
123
  """
124
 
 
 
125
  # If out message or out_file_paths are blank, change to a list so it can be appended to
126
  #if isinstance(out_message, str):
127
  # out_message = [out_message]
@@ -156,8 +161,9 @@ def prepare_image_or_text_pdf(
156
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
157
  for file in file_paths_loop:
158
  file_path = file.name
 
159
 
160
- print("file_path:", file_path)
161
 
162
  file_extension = os.path.splitext(file_path)[1].lower()
163
 
@@ -191,8 +197,16 @@ def prepare_image_or_text_pdf(
191
  out_file_path = file_path
192
 
193
  out_file_paths.append(out_file_path)
 
 
 
 
 
 
 
 
194
 
195
- return out_message, out_file_paths
196
 
197
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
198
  file_path_without_ext = get_file_path_end(in_file_path)
 
2
  from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
3
  from PIL import Image
4
  import os
5
+ import time
6
  from gradio import Progress
7
  from typing import List, Optional
8
 
 
63
  # print("Conversion of page", str(page_num), "to file succeeded.")
64
  # print("image:", image)
65
 
66
+ #image[0].save(pdf_path + "_" + str(page_num) + ".png", format="PNG")
67
+
68
  images.extend(image)
69
 
70
  print("PDF has been converted to images.")
 
125
  tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
126
  """
127
 
128
+ tic = time.perf_counter()
129
+
130
  # If out message or out_file_paths are blank, change to a list so it can be appended to
131
  #if isinstance(out_message, str):
132
  # out_message = [out_message]
 
161
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
162
  for file in file_paths_loop:
163
  file_path = file.name
164
+ file_path_without_ext = get_file_path_end(file_path)
165
 
166
+ #print("file:", file_path)
167
 
168
  file_extension = os.path.splitext(file_path)[1].lower()
169
 
 
197
  out_file_path = file_path
198
 
199
  out_file_paths.append(out_file_path)
200
+
201
+ toc = time.perf_counter()
202
+ out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
203
+
204
+ print(out_time)
205
+
206
+ out_message.append(out_time)
207
+ out_message_out = '\n'.join(out_message)
208
 
209
+ return out_message_out, out_file_paths
210
 
211
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
212
  file_path_without_ext = get_file_path_end(in_file_path)
tools/file_redaction.py CHANGED
@@ -9,6 +9,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
 
12
  from collections import defaultdict # For efficient grouping
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
@@ -18,15 +19,14 @@ from tools.data_anonymise import generate_decision_process_output
18
  import gradio as gr
19
 
20
 
21
- def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
22
 
23
  tic = time.perf_counter()
24
 
25
-
26
  # If this is the first time around, set variables to 0/blank
27
  if first_loop_state==True:
28
  latest_file_completed = 0
29
- out_message = []
30
  out_file_paths = []
31
 
32
  # If out message is string or out_file_paths are blank, change to a list so it can be appended to
@@ -44,7 +44,30 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
44
  # Set to a very high number so as not to mess with subsequent file processing by the user
45
  latest_file_completed = 99
46
  final_out_message = '\n'.join(out_message)
47
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  file_paths_loop = [file_paths[int(latest_file_completed)]]
50
 
@@ -65,7 +88,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
65
  else:
66
  out_message = "No file selected"
67
  print(out_message)
68
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
69
 
70
  if in_redact_method == "Image analysis":
71
  # Analyse and redact image-based pdf or image
@@ -78,7 +101,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
78
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
79
 
80
  out_file_paths.append(out_image_file_path)
81
- out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file")
82
 
83
  output_logs_str = str(output_logs)
84
  logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
@@ -101,9 +124,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
101
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
102
  pdf_text.save(out_text_file_path)
103
 
104
- #out_file_paths.append(out_text_file_path)
105
- out_message_new = "File " + file_path_without_ext + " successfully redacted"
106
- out_message.append(out_message_new)
107
 
108
  # Convert message
109
  convert_message="Converting PDF to image-based PDF to embed redactions."
@@ -123,6 +144,10 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
123
  # Add confirmation for converting to image if you want
124
  # out_message.append(img_output_summary)
125
 
 
 
 
 
126
  if latest_file_completed != len(file_paths):
127
  print("Completed file number:", str(latest_file_completed), "more files to do")
128
  latest_file_completed += 1
@@ -130,7 +155,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
130
  else:
131
  out_message = "No redaction method selected"
132
  print(out_message)
133
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
134
 
135
 
136
  toc = time.perf_counter()
@@ -140,9 +165,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
140
  out_message_out = '\n'.join(out_message)
141
  out_message_out = out_message_out + " " + out_time
142
 
143
- out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
144
-
145
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
146
 
147
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
148
  merged_bboxes = []
@@ -317,7 +340,7 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
317
  return [], []
318
 
319
  # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
320
- def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
321
  analyzed_bounding_boxes = []
322
  if len(analyzer_results) > 0 and len(characters) > 0:
323
  merged_bounding_boxes = []
@@ -329,6 +352,8 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
329
  for char in characters[result.start : result.end]:
330
  if isinstance(char, LTChar):
331
  char_box = list(char.bbox)
 
 
332
 
333
  if current_y is None or current_box is None:
334
  current_box = char_box
@@ -342,6 +367,7 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
342
  and horizontal_diff_bboxes <= combine_pixel_dist
343
  ):
344
  current_box[2] = char_box[2] # Extend the current box horizontally
 
345
  else:
346
  merged_bounding_boxes.append(
347
  {"boundingBox": current_box, "result": result})
 
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
12
+ import re
13
  from collections import defaultdict # For efficient grouping
14
 
15
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 
19
  import gradio as gr
20
 
21
 
22
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
23
 
24
  tic = time.perf_counter()
25
 
 
26
  # If this is the first time around, set variables to 0/blank
27
  if first_loop_state==True:
28
  latest_file_completed = 0
29
+ #out_message = []
30
  out_file_paths = []
31
 
32
  # If out message is string or out_file_paths are blank, change to a list so it can be appended to
 
44
  # Set to a very high number so as not to mess with subsequent file processing by the user
45
  latest_file_completed = 99
46
  final_out_message = '\n'.join(out_message)
47
+ #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
48
+
49
+ def sum_numbers_from_string(string):
50
+ """Extracts all numbers from a string and adds them up.
51
+
52
+ Args:
53
+ string: The input string.
54
+
55
+ Returns:
56
+ The sum of all numbers extracted from the string.
57
+ """
58
+
59
+ # Extract all numbers using regular expression
60
+ numbers = re.findall(r'\d+', string)
61
+
62
+ # Convert the numbers to integers and sum them up
63
+ sum_of_numbers = sum(int(num) for num in numbers)
64
+
65
+ return sum_of_numbers
66
+
67
+ estimate_total_processing_time = sum_numbers_from_string(final_out_message)
68
+ print("Estimated total processing time:", str(estimate_total_processing_time))
69
+
70
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time
71
 
72
  file_paths_loop = [file_paths[int(latest_file_completed)]]
73
 
 
88
  else:
89
  out_message = "No file selected"
90
  print(out_message)
91
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
92
 
93
  if in_redact_method == "Image analysis":
94
  # Analyse and redact image-based pdf or image
 
101
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
102
 
103
  out_file_paths.append(out_image_file_path)
104
+ out_message.append("File '" + file_path_without_ext + "' successfully redacted")
105
 
106
  output_logs_str = str(output_logs)
107
  logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
 
124
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
125
  pdf_text.save(out_text_file_path)
126
 
127
+
 
 
128
 
129
  # Convert message
130
  convert_message="Converting PDF to image-based PDF to embed redactions."
 
144
  # Add confirmation for converting to image if you want
145
  # out_message.append(img_output_summary)
146
 
147
+ #out_file_paths.append(out_text_file_path)
148
+ out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
149
+ out_message.append(out_message_new)
150
+
151
  if latest_file_completed != len(file_paths):
152
  print("Completed file number:", str(latest_file_completed), "more files to do")
153
  latest_file_completed += 1
 
155
  else:
156
  out_message = "No redaction method selected"
157
  print(out_message)
158
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
159
 
160
 
161
  toc = time.perf_counter()
 
165
  out_message_out = '\n'.join(out_message)
166
  out_message_out = out_message_out + " " + out_time
167
 
168
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
 
 
169
 
170
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
171
  merged_bboxes = []
 
340
  return [], []
341
 
342
  # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
343
+ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
344
  analyzed_bounding_boxes = []
345
  if len(analyzer_results) > 0 and len(characters) > 0:
346
  merged_bounding_boxes = []
 
352
  for char in characters[result.start : result.end]:
353
  if isinstance(char, LTChar):
354
  char_box = list(char.bbox)
355
+ # Add vertical padding to the top of the box
356
+ char_box[3] += vertical_padding
357
 
358
  if current_y is None or current_box is None:
359
  current_box = char_box
 
367
  and horizontal_diff_bboxes <= combine_pixel_dist
368
  ):
369
  current_box[2] = char_box[2] # Extend the current box horizontally
370
+ current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
371
  else:
372
  merged_bounding_boxes.append(
373
  {"boundingBox": current_box, "result": result})