Spaces:

seanpedrickcase
/

document_redaction

Sleeping

App Files Files Community

seanpedrickcase committed on Sep 16, 2024

Commit

34addbf

1 Parent(s): 230fcc3

Enhanced logging of usage. Added a small buffer to redaction rectangles, as they often seem to miss the tops of text.

Browse files

Files changed (4) hide show

app.py +31 -36
tools/aws_functions.py +1 -1
tools/file_conversion.py +16 -2
tools/file_redaction.py +40 -14

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
@@ -24,21 +25,11 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
-feedback_data_folder = 'feedback/' + today_rev + '/'
-logs_data_folder = 'logs/' + today_rev + '/'
-def create_logs_folder(session_hash_textbox):
-    print("session_hash_textbox", session_hash_textbox)
-    feedback_data_folder = 'feedback/' + session_hash_textbox + "/" + today_rev + '/'
-    logs_data_folder = 'logs/' + session_hash_textbox + "/" + today_rev + '/'
-    feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
-    feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
-    usage_logs_state = gr.State(logs_data_folder + 'log.csv')
-    usage_s3_logs_loc_state = gr.State(logs_data_folder)
-    return feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -56,13 +47,13 @@ with app:
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
-    feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
-    feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
-    usage_logs_state = gr.State(logs_data_folder + 'log.csv')
-    usage_s3_logs_loc_state = gr.State(logs_data_folder)
     gr.Markdown(
     """
@@ -96,6 +87,8 @@ with app:
         with gr.Row():
             s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
@@ -148,7 +141,7 @@ with app:
         # Invisible text box to hold the session hash/username just for logging purposes
         session_hash_textbox = gr.Textbox(value="", visible=False)
-    # AWS options - not yet implemented
     # with gr.Tab(label="Advanced options"):
     #     with gr.Accordion(label = "AWS data access", open = True):
     #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
@@ -163,13 +156,13 @@ with app:
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
      # Tabular data redaction
@@ -181,31 +174,33 @@ with app:
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
     then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
-    #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
-    #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
-    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])#.\
-    #then(create_logs_folder, inputs=[session_hash_textbox], outputs = [feedback_logs_state, feedback_s3_logs_loc_state, usage_logs_state, usage_s3_logs_loc_state])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
-    callback.setup([session_hash_textbox], logs_data_folder)
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
-    then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
-    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_data_folder)
     pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
-    data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_data_folder)
     data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

 import os
+import socket
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
+host_name = socket.gethostname()
+feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
+    # Logging state
+    feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
+    feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+    access_logs_state = gr.State(access_logs_folder + 'log.csv')
+    access_s3_logs_loc_state = gr.State(access_logs_folder)
+    usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
+    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
     gr.Markdown(
     """
         with gr.Row():
             s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+            # This keeps track of the time taken to redact files for logging purposes.
+            estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False)
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
         # Invisible text box to hold the session hash/username just for logging purposes
         session_hash_textbox = gr.Textbox(value="", visible=False)
+    # AWS options - placeholder for possibility of storing data on s3
     # with gr.Tab(label="Advanced options"):
     #     with gr.Accordion(label = "AWS data access", open = True):
     #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
      # Tabular data redaction
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
     then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+    # Get connection details on app load
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
+    callback.setup([session_hash_textbox], access_logs_folder)
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+    then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_logs_folder)
     pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
+    data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_logs_folder)
     data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
+    # Log the estimated processing time of each redaction run (session hash, input data files, time taken)
+    usage_callback = gr.CSVLogger()
+    usage_callback.setup([session_hash_textbox, in_data_files, estimated_time_taken_number], usage_logs_folder)
+    estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, in_data_files, estimated_time_taken_number], None, preprocess=False).\
+    then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

tools/aws_functions.py CHANGED Viewed

@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials if required
 bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
-aws_var_default = "0"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')

 # Get AWS credentials if required
 bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
+aws_var_default = "1"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')

tools/file_conversion.py CHANGED Viewed

@@ -2,6 +2,7 @@ from pdf2image import convert_from_path, pdfinfo_from_path
 from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
 from PIL import Image
 import os
 from gradio import Progress
 from typing import List, Optional
@@ -62,6 +63,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
         # print("Conversion of page", str(page_num), "to file succeeded.")
         # print("image:", image)
         images.extend(image)
     print("PDF has been converted to images.")
@@ -122,6 +125,8 @@ def prepare_image_or_text_pdf(
         tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
     """
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
     #    out_message = [out_message]
@@ -156,8 +161,9 @@ def prepare_image_or_text_pdf(
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
         file_path = file.name
-        print("file_path:", file_path)
         file_extension = os.path.splitext(file_path)[1].lower()
@@ -191,8 +197,16 @@ def prepare_image_or_text_pdf(
             out_file_path = file_path
         out_file_paths.append(out_file_path)
-    return out_message, out_file_paths
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)

 from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
 from PIL import Image
 import os
+import time
 from gradio import Progress
 from typing import List, Optional
         # print("Conversion of page", str(page_num), "to file succeeded.")
         # print("image:", image)
+        #image[0].save(pdf_path + "_" + str(page_num) + ".png", format="PNG")
         images.extend(image)
     print("PDF has been converted to images.")
         tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
     """
+    tic = time.perf_counter()
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
     #    out_message = [out_message]
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
         file_path = file.name
+        file_path_without_ext = get_file_path_end(file_path)
+        #print("file:", file_path)
         file_extension = os.path.splitext(file_path)[1].lower()
             out_file_path = file_path
         out_file_paths.append(out_file_path)
+        toc = time.perf_counter()
+        out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
+        print(out_time)
+        out_message.append(out_time)
+        out_message_out = '\n'.join(out_message)
+    return out_message_out, out_file_paths
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)

tools/file_redaction.py CHANGED Viewed

@@ -9,6 +9,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 from gradio import Progress
 import time
 from collections import defaultdict  # For efficient grouping
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
@@ -18,15 +19,14 @@ from tools.data_anonymise import generate_decision_process_output
 import gradio as gr
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
         out_file_paths = []
     # If out message is string or out_file_paths are blank, change to a list so it can be appended to
@@ -44,7 +44,30 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         # Set to a very high number so as not to mess with subsequent file processing by the user
         latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     file_paths_loop = [file_paths[int(latest_file_completed)]]
@@ -65,7 +88,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image
@@ -78,7 +101,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
             out_file_paths.append(out_image_file_path)
-            out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file")
             output_logs_str = str(output_logs)
             logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
@@ -101,9 +124,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)
-            #out_file_paths.append(out_text_file_path)
-            out_message_new = "File " + file_path_without_ext + " successfully redacted"
-            out_message.append(out_message_new)
             # Convert message
             convert_message="Converting PDF to image-based PDF to embed redactions."
@@ -123,6 +144,10 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             # Add confirmation for converting to image if you want
             # out_message.append(img_output_summary)
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed), "more files to do")
                 latest_file_completed += 1
@@ -130,7 +155,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     toc = time.perf_counter()
@@ -140,9 +165,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time
-    out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
             merged_bboxes = []
@@ -317,7 +340,7 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
     return [], []
 # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
-def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
     analyzed_bounding_boxes = []
     if len(analyzer_results) > 0 and len(characters) > 0:
         merged_bounding_boxes = []
@@ -329,6 +352,8 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
             for char in characters[result.start : result.end]:
                 if isinstance(char, LTChar):
                     char_box = list(char.bbox)
                     if current_y is None or current_box is None:
                         current_box = char_box
@@ -342,6 +367,7 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
                             and horizontal_diff_bboxes <= combine_pixel_dist
                         ):
                             current_box[2] = char_box[2]  # Extend the current box horizontally
                         else:
                             merged_bounding_boxes.append(
                                 {"boundingBox": current_box, "result": result})

 from pikepdf import Pdf, Dictionary, Name
 from gradio import Progress
 import time
+import re
 from collections import defaultdict  # For efficient grouping
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 import gradio as gr
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
+        #out_message = []
         out_file_paths = []
     # If out message is string or out_file_paths are blank, change to a list so it can be appended to
         # Set to a very high number so as not to mess with subsequent file processing by the user
         latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
+        #final_out_message = final_out_message + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+        def sum_numbers_from_string(string):
+            """Extracts all numbers from a string and adds them up.
+            Args:
+                string: The input string.
+            Returns:
+                The sum of all numbers extracted from the string.
+            """
+            # Extract all numbers using regular expression
+            numbers = re.findall(r'\d+', string)
+            # Convert the numbers to integers and sum them up
+            sum_of_numbers = sum(int(num) for num in numbers)
+            return sum_of_numbers
+        estimate_total_processing_time = sum_numbers_from_string(final_out_message)
+        print("Estimated total processing time:", str(estimate_total_processing_time))
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time
     file_paths_loop = [file_paths[int(latest_file_completed)]]
         else:
             out_message = "No file selected"
             print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image
             pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
             out_file_paths.append(out_image_file_path)
+            out_message.append("File '" + file_path_without_ext + "' successfully redacted")
             output_logs_str = str(output_logs)
             logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)
             # Convert message
             convert_message="Converting PDF to image-based PDF to embed redactions."
             # Add confirmation for converting to image if you want
             # out_message.append(img_output_summary)
+            #out_file_paths.append(out_text_file_path)
+            out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
+            out_message.append(out_message_new)
             if latest_file_completed != len(file_paths):
                 print("Completed file number:", str(latest_file_completed), "more files to do")
                 latest_file_completed += 1
         else:
             out_message = "No redaction method selected"
             print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
     toc = time.perf_counter()
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
             merged_bboxes = []
     return [], []
 # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
+def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
     analyzed_bounding_boxes = []
     if len(analyzer_results) > 0 and len(characters) > 0:
         merged_bounding_boxes = []
             for char in characters[result.start : result.end]:
                 if isinstance(char, LTChar):
                     char_box = list(char.bbox)
+                    # Add vertical padding to the top of the box
+                    char_box[3] += vertical_padding
                     if current_y is None or current_box is None:
                         current_box = char_box
                             and horizontal_diff_bboxes <= combine_pixel_dist
                         ):
                             current_box[2] = char_box[2]  # Extend the current box horizontally
+                            current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
                         else:
                             merged_bounding_boxes.append(
                                 {"boundingBox": current_box, "result": result})