seanpedrickcase
committed on
Commit
•
34addbf
1
Parent(s):
230fcc3
Enhanced logging of usage. Small buffer added to redaction rectangles as it seems to miss the tops of text often.
Browse files
- app.py +31 -36
- tools/aws_functions.py +1 -1
- tools/file_conversion.py +16 -2
- tools/file_redaction.py +40 -14
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import os
|
|
|
2 |
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
@@ -24,21 +25,11 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
|
|
24 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
25 |
language = 'en'
|
26 |
|
27 |
-
|
28 |
-
logs_data_folder = 'logs/' + today_rev + '/'
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
feedback_data_folder = 'feedback/' + session_hash_textbox + "/" + today_rev + '/'
|
34 |
-
logs_data_folder = 'logs/' + session_hash_textbox + "/" + today_rev + '/'
|
35 |
-
|
36 |
-
feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
|
37 |
-
feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
|
38 |
-
usage_logs_state = gr.State(logs_data_folder + 'log.csv')
|
39 |
-
usage_s3_logs_loc_state = gr.State(logs_data_folder)
|
40 |
-
|
41 |
-
return feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state
|
42 |
|
43 |
# Create the gradio interface
|
44 |
app = gr.Blocks(theme = gr.themes.Base())
|
@@ -56,13 +47,13 @@ with app:
|
|
56 |
session_hash_state = gr.State()
|
57 |
s3_output_folder_state = gr.State()
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
|
67 |
gr.Markdown(
|
68 |
"""
|
@@ -96,6 +87,8 @@ with app:
|
|
96 |
|
97 |
with gr.Row():
|
98 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
|
|
|
|
99 |
|
100 |
with gr.Tab(label="Open text or Excel/csv files"):
|
101 |
gr.Markdown(
|
@@ -148,7 +141,7 @@ with app:
|
|
148 |
# Invisible text box to hold the session hash/username just for logging purposes
|
149 |
session_hash_textbox = gr.Textbox(value="", visible=False)
|
150 |
|
151 |
-
# AWS options -
|
152 |
# with gr.Tab(label="Advanced options"):
|
153 |
# with gr.Accordion(label = "AWS data access", open = True):
|
154 |
# aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
@@ -163,13 +156,13 @@ with app:
|
|
163 |
|
164 |
# Document redaction
|
165 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
166 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
|
167 |
-
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
|
168 |
|
169 |
# If the output file count text box changes, keep going with redacting each document until done
|
170 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
171 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
|
172 |
-
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
|
173 |
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
174 |
|
175 |
# Tabular data redaction
|
@@ -181,31 +174,33 @@ with app:
|
|
181 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
182 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
183 |
|
184 |
-
#
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
|
189 |
-
#then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
|
190 |
|
191 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
192 |
callback = gr.CSVLogger()
|
193 |
-
callback.setup([session_hash_textbox],
|
194 |
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
195 |
-
then(fn = upload_file_to_s3, inputs=[
|
196 |
|
197 |
# User submitted feedback for pdf redactions
|
198 |
pdf_callback = gr.CSVLogger()
|
199 |
-
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file],
|
200 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
|
201 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
202 |
|
203 |
# User submitted feedback for data redactions
|
204 |
data_callback = gr.CSVLogger()
|
205 |
-
data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files],
|
206 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
|
207 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
# Launch the Gradio app
|
210 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
211 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
|
|
1 |
import os
|
2 |
+
import socket
|
3 |
|
4 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
5 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
|
|
25 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
26 |
language = 'en'
|
27 |
|
28 |
+
host_name = socket.gethostname()
|
|
|
29 |
|
30 |
+
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
31 |
+
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
32 |
+
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
# Create the gradio interface
|
35 |
app = gr.Blocks(theme = gr.themes.Base())
|
|
|
47 |
session_hash_state = gr.State()
|
48 |
s3_output_folder_state = gr.State()
|
49 |
|
50 |
+
# Logging state
|
51 |
+
feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
|
52 |
+
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
53 |
+
access_logs_state = gr.State(access_logs_folder + 'log.csv')
|
54 |
+
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
55 |
+
usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
|
56 |
+
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
57 |
|
58 |
gr.Markdown(
|
59 |
"""
|
|
|
87 |
|
88 |
with gr.Row():
|
89 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
90 |
+
# This keeps track of the time taken to redact files for logging purposes.
|
91 |
+
estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
|
92 |
|
93 |
with gr.Tab(label="Open text or Excel/csv files"):
|
94 |
gr.Markdown(
|
|
|
141 |
# Invisible text box to hold the session hash/username just for logging purposes
|
142 |
session_hash_textbox = gr.Textbox(value="", visible=False)
|
143 |
|
144 |
+
# AWS options - placeholder for possibility of storing data on s3
|
145 |
# with gr.Tab(label="Advanced options"):
|
146 |
# with gr.Accordion(label = "AWS data access", open = True):
|
147 |
# aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
|
|
156 |
|
157 |
# Document redaction
|
158 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
159 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number],
|
160 |
+
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number], api_name="redact_doc")
|
161 |
|
162 |
# If the output file count text box changes, keep going with redacting each document until done
|
163 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
164 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number],
|
165 |
+
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number]).\
|
166 |
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
167 |
|
168 |
# Tabular data redaction
|
|
|
174 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
175 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
176 |
|
177 |
+
# Get connection details on app load
|
178 |
+
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
|
|
|
|
|
|
|
|
179 |
|
180 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
181 |
callback = gr.CSVLogger()
|
182 |
+
callback.setup([session_hash_textbox], access_logs_folder)
|
183 |
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
184 |
+
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
185 |
|
186 |
# User submitted feedback for pdf redactions
|
187 |
pdf_callback = gr.CSVLogger()
|
188 |
+
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_logs_folder)
|
189 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
|
190 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
191 |
|
192 |
# User submitted feedback for data redactions
|
193 |
data_callback = gr.CSVLogger()
|
194 |
+
data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_logs_folder)
|
195 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
|
196 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
197 |
|
198 |
+
# Log processing time/token usage when making a query
|
199 |
+
usage_callback = gr.CSVLogger()
|
200 |
+
usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number], usage_logs_folder)
|
201 |
+
estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number], None, preprocess=False).\
|
202 |
+
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
203 |
+
|
204 |
# Launch the Gradio app
|
205 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
206 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
tools/aws_functions.py
CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
10 |
# Get AWS credentials if required
|
11 |
bucket_name=""
|
12 |
aws_var = "RUN_AWS_FUNCTIONS"
|
13 |
-
aws_var_default = "
|
14 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
15 |
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
|
|
10 |
# Get AWS credentials if required
|
11 |
bucket_name=""
|
12 |
aws_var = "RUN_AWS_FUNCTIONS"
|
13 |
+
aws_var_default = "1"
|
14 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
15 |
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
tools/file_conversion.py
CHANGED
@@ -2,6 +2,7 @@ from pdf2image import convert_from_path, pdfinfo_from_path
|
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
|
3 |
from PIL import Image
|
4 |
import os
|
|
|
5 |
from gradio import Progress
|
6 |
from typing import List, Optional
|
7 |
|
@@ -62,6 +63,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
|
|
62 |
# print("Conversion of page", str(page_num), "to file succeeded.")
|
63 |
# print("image:", image)
|
64 |
|
|
|
|
|
65 |
images.extend(image)
|
66 |
|
67 |
print("PDF has been converted to images.")
|
@@ -122,6 +125,8 @@ def prepare_image_or_text_pdf(
|
|
122 |
tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
|
123 |
"""
|
124 |
|
|
|
|
|
125 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
126 |
#if isinstance(out_message, str):
|
127 |
# out_message = [out_message]
|
@@ -156,8 +161,9 @@ def prepare_image_or_text_pdf(
|
|
156 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
157 |
for file in file_paths_loop:
|
158 |
file_path = file.name
|
|
|
159 |
|
160 |
-
print("
|
161 |
|
162 |
file_extension = os.path.splitext(file_path)[1].lower()
|
163 |
|
@@ -191,8 +197,16 @@ def prepare_image_or_text_pdf(
|
|
191 |
out_file_path = file_path
|
192 |
|
193 |
out_file_paths.append(out_file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
-
return
|
196 |
|
197 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
198 |
file_path_without_ext = get_file_path_end(in_file_path)
|
|
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
|
3 |
from PIL import Image
|
4 |
import os
|
5 |
+
import time
|
6 |
from gradio import Progress
|
7 |
from typing import List, Optional
|
8 |
|
|
|
63 |
# print("Conversion of page", str(page_num), "to file succeeded.")
|
64 |
# print("image:", image)
|
65 |
|
66 |
+
#image[0].save(pdf_path + "_" + str(page_num) + ".png", format="PNG")
|
67 |
+
|
68 |
images.extend(image)
|
69 |
|
70 |
print("PDF has been converted to images.")
|
|
|
125 |
tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
|
126 |
"""
|
127 |
|
128 |
+
tic = time.perf_counter()
|
129 |
+
|
130 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
131 |
#if isinstance(out_message, str):
|
132 |
# out_message = [out_message]
|
|
|
161 |
#for file in progress.tqdm(file_paths, desc="Preparing files"):
|
162 |
for file in file_paths_loop:
|
163 |
file_path = file.name
|
164 |
+
file_path_without_ext = get_file_path_end(file_path)
|
165 |
|
166 |
+
#print("file:", file_path)
|
167 |
|
168 |
file_extension = os.path.splitext(file_path)[1].lower()
|
169 |
|
|
|
197 |
out_file_path = file_path
|
198 |
|
199 |
out_file_paths.append(out_file_path)
|
200 |
+
|
201 |
+
toc = time.perf_counter()
|
202 |
+
out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
|
203 |
+
|
204 |
+
print(out_time)
|
205 |
+
|
206 |
+
out_message.append(out_time)
|
207 |
+
out_message_out = '\n'.join(out_message)
|
208 |
|
209 |
+
return out_message_out, out_file_paths
|
210 |
|
211 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
212 |
file_path_without_ext = get_file_path_end(in_file_path)
|
tools/file_redaction.py
CHANGED
@@ -9,6 +9,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
|
|
9 |
from pikepdf import Pdf, Dictionary, Name
|
10 |
from gradio import Progress
|
11 |
import time
|
|
|
12 |
from collections import defaultdict # For efficient grouping
|
13 |
|
14 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
@@ -18,15 +19,14 @@ from tools.data_anonymise import generate_decision_process_output
|
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
-
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
25 |
-
|
26 |
# If this is the first time around, set variables to 0/blank
|
27 |
if first_loop_state==True:
|
28 |
latest_file_completed = 0
|
29 |
-
out_message = []
|
30 |
out_file_paths = []
|
31 |
|
32 |
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
@@ -44,7 +44,30 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
44 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
45 |
latest_file_completed = 99
|
46 |
final_out_message = '\n'.join(out_message)
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
50 |
|
@@ -65,7 +88,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
65 |
else:
|
66 |
out_message = "No file selected"
|
67 |
print(out_message)
|
68 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
69 |
|
70 |
if in_redact_method == "Image analysis":
|
71 |
# Analyse and redact image-based pdf or image
|
@@ -78,7 +101,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
78 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
79 |
|
80 |
out_file_paths.append(out_image_file_path)
|
81 |
-
out_message.append("File '" + file_path_without_ext + "' successfully redacted
|
82 |
|
83 |
output_logs_str = str(output_logs)
|
84 |
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
@@ -101,9 +124,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
101 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
102 |
pdf_text.save(out_text_file_path)
|
103 |
|
104 |
-
|
105 |
-
out_message_new = "File " + file_path_without_ext + " successfully redacted"
|
106 |
-
out_message.append(out_message_new)
|
107 |
|
108 |
# Convert message
|
109 |
convert_message="Converting PDF to image-based PDF to embed redactions."
|
@@ -123,6 +144,10 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
123 |
# Add confirmation for converting to image if you want
|
124 |
# out_message.append(img_output_summary)
|
125 |
|
|
|
|
|
|
|
|
|
126 |
if latest_file_completed != len(file_paths):
|
127 |
print("Completed file number:", str(latest_file_completed), "more files to do")
|
128 |
latest_file_completed += 1
|
@@ -130,7 +155,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
130 |
else:
|
131 |
out_message = "No redaction method selected"
|
132 |
print(out_message)
|
133 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
134 |
|
135 |
|
136 |
toc = time.perf_counter()
|
@@ -140,9 +165,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
140 |
out_message_out = '\n'.join(out_message)
|
141 |
out_message_out = out_message_out + " " + out_time
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
146 |
|
147 |
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
148 |
merged_bboxes = []
|
@@ -317,7 +340,7 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
|
|
317 |
return [], []
|
318 |
|
319 |
# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
|
320 |
-
def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
|
321 |
analyzed_bounding_boxes = []
|
322 |
if len(analyzer_results) > 0 and len(characters) > 0:
|
323 |
merged_bounding_boxes = []
|
@@ -329,6 +352,8 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
|
|
329 |
for char in characters[result.start : result.end]:
|
330 |
if isinstance(char, LTChar):
|
331 |
char_box = list(char.bbox)
|
|
|
|
|
332 |
|
333 |
if current_y is None or current_box is None:
|
334 |
current_box = char_box
|
@@ -342,6 +367,7 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
|
|
342 |
and horizontal_diff_bboxes <= combine_pixel_dist
|
343 |
):
|
344 |
current_box[2] = char_box[2] # Extend the current box horizontally
|
|
|
345 |
else:
|
346 |
merged_bounding_boxes.append(
|
347 |
{"boundingBox": current_box, "result": result})
|
|
|
9 |
from pikepdf import Pdf, Dictionary, Name
|
10 |
from gradio import Progress
|
11 |
import time
|
12 |
+
import re
|
13 |
from collections import defaultdict # For efficient grouping
|
14 |
|
15 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
|
|
19 |
import gradio as gr
|
20 |
|
21 |
|
22 |
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
|
23 |
|
24 |
tic = time.perf_counter()
|
25 |
|
|
|
26 |
# If this is the first time around, set variables to 0/blank
|
27 |
if first_loop_state==True:
|
28 |
latest_file_completed = 0
|
29 |
+
#out_message = []
|
30 |
out_file_paths = []
|
31 |
|
32 |
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
|
|
44 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
45 |
latest_file_completed = 99
|
46 |
final_out_message = '\n'.join(out_message)
|
47 |
+
#final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
48 |
+
|
49 |
+
def sum_numbers_from_string(string):
|
50 |
+
"""Extracts all numbers from a string and adds them up.
|
51 |
+
|
52 |
+
Args:
|
53 |
+
string: The input string.
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
The sum of all numbers extracted from the string.
|
57 |
+
"""
|
58 |
+
|
59 |
+
# Extract all numbers using regular expression
|
60 |
+
numbers = re.findall(r'\d+', string)
|
61 |
+
|
62 |
+
# Convert the numbers to integers and sum them up
|
63 |
+
sum_of_numbers = sum(int(num) for num in numbers)
|
64 |
+
|
65 |
+
return sum_of_numbers
|
66 |
+
|
67 |
+
estimate_total_processing_time = sum_numbers_from_string(final_out_message)
|
68 |
+
print("Estimated total processing time:", str(estimate_total_processing_time))
|
69 |
+
|
70 |
+
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time
|
71 |
|
72 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
73 |
|
|
|
88 |
else:
|
89 |
out_message = "No file selected"
|
90 |
print(out_message)
|
91 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
92 |
|
93 |
if in_redact_method == "Image analysis":
|
94 |
# Analyse and redact image-based pdf or image
|
|
|
101 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
102 |
|
103 |
out_file_paths.append(out_image_file_path)
|
104 |
+
out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
105 |
|
106 |
output_logs_str = str(output_logs)
|
107 |
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
|
|
124 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
125 |
pdf_text.save(out_text_file_path)
|
126 |
|
127 |
+
|
|
|
|
|
128 |
|
129 |
# Convert message
|
130 |
convert_message="Converting PDF to image-based PDF to embed redactions."
|
|
|
144 |
# Add confirmation for converting to image if you want
|
145 |
# out_message.append(img_output_summary)
|
146 |
|
147 |
+
#out_file_paths.append(out_text_file_path)
|
148 |
+
out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
|
149 |
+
out_message.append(out_message_new)
|
150 |
+
|
151 |
if latest_file_completed != len(file_paths):
|
152 |
print("Completed file number:", str(latest_file_completed), "more files to do")
|
153 |
latest_file_completed += 1
|
|
|
155 |
else:
|
156 |
out_message = "No redaction method selected"
|
157 |
print(out_message)
|
158 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
159 |
|
160 |
|
161 |
toc = time.perf_counter()
|
|
|
165 |
out_message_out = '\n'.join(out_message)
|
166 |
out_message_out = out_message_out + " " + out_time
|
167 |
|
168 |
+
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
|
|
|
|
169 |
|
170 |
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
171 |
merged_bboxes = []
|
|
|
340 |
return [], []
|
341 |
|
342 |
# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
|
343 |
+
def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
|
344 |
analyzed_bounding_boxes = []
|
345 |
if len(analyzer_results) > 0 and len(characters) > 0:
|
346 |
merged_bounding_boxes = []
|
|
|
352 |
for char in characters[result.start : result.end]:
|
353 |
if isinstance(char, LTChar):
|
354 |
char_box = list(char.bbox)
|
355 |
+
# Add vertical padding to the top of the box
|
356 |
+
char_box[3] += vertical_padding
|
357 |
|
358 |
if current_y is None or current_box is None:
|
359 |
current_box = char_box
|
|
|
367 |
and horizontal_diff_bboxes <= combine_pixel_dist
|
368 |
):
|
369 |
current_box[2] = char_box[2] # Extend the current box horizontally
|
370 |
+
current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
|
371 |
else:
|
372 |
merged_bounding_boxes.append(
|
373 |
{"boundingBox": current_box, "result": result})
|