seanpedrickcase committed
Commit 8c33828
Parent: 01c88c0

Decision process now saved as log files. Other log files and feedback added

app.py CHANGED
@@ -3,7 +3,8 @@ import os
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs
+from tools.aws_functions import upload_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
 from tools.data_anonymise import anonymise_data_files
@@ -29,9 +30,15 @@ with app:
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
     text_output_file_list_state = gr.State([])
+    first_loop_state = gr.State(True)
+    second_loop_state = gr.State(False)
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
+    feedback_logs_state = gr.State('feedback/log.csv')
+    feedback_s3_logs_loc_state = gr.State('feedback/')
+    usage_logs_state = gr.State('logs/log.csv')
+    usage_s3_logs_loc_state = gr.State('logs/')
 
     gr.Markdown(
     """
@@ -39,9 +46,9 @@ with app:
 
    Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
 
-   WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
+   WARNING: In testing the app seems to only find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
-   Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
+   This app accepts a maximum file size of 10mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
    """)
 
     with gr.Tab("PDFs/images"):
@@ -57,6 +64,15 @@ with app:
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
+
+        with gr.Row():
+            pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
+        with gr.Row():
+            pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+            pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
+        with gr.Row():
+            s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
@@ -73,13 +89,20 @@ with app:
 
         in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
 
-        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
+        tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
 
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
             text_output_file = gr.File(label="Output files")
             text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
 
+        with gr.Row():
+            data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
+                choices=["The results were good", "The results were not good"], visible=False)
+        with gr.Row():
+            data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+            data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
+
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
         """
@@ -111,44 +134,55 @@ with app:
 
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
-
-    callback = gr.CSVLogger()
 
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
-                    outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
                     outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
 
     # If the output file count text box changes, keep going with redacting each document until done
-    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
-                    outputs=[output_summary, prepared_pdf_state]).\
-        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
+    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
+        then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
 
     # Tabular data redaction
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
+        then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
 
+    #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
+    #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
-    # This needs to be called at some point prior to the first call to callback.flag()
+    # Log usernames and times of access to file (to know who is using the app when running on AWS)
+    callback = gr.CSVLogger()
     callback.setup([session_hash_textbox], "logs")
-
-    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
     session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
+    # User submitted feedback for pdf redactions
+    pdf_callback = gr.CSVLogger()
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], "feedback")
+    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+    # User submitted feedback for data redactions
+    data_callback = gr.CSVLogger()
+    data_callback.setup([data_feedback_radio, data_further_details_text], "feedback")
+    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
-
     if os.environ['COGNITO_AUTH'] == "1":
-        app.queue().launch(show_error=True, auth=authenticate_user)
+        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='10mb')
     else:
-        app.queue().launch(show_error=True, inbrowser=True)
+        app.queue().launch(show_error=True, inbrowser=True, max_file_size='10mb')
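
Side note on the pattern above: the usage log and both feedback forms rely on Gradio's flagging API, where a CSVLogger is set up against a list of components and flag() appends their current values as a CSV row. A minimal, self-contained sketch of that pattern, assuming Gradio's built-in gr.CSVLogger (component names here are hypothetical):

import gradio as gr

with gr.Blocks() as demo:
    feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"])
    feedback_btn = gr.Button("Submit feedback")

    # setup() registers the components and the log folder;
    # flag() then appends one CSV row per call (here, feedback/log.csv)
    callback = gr.CSVLogger()
    callback.setup([feedback_radio], "feedback")
    # preprocess=False passes the raw component values straight to the logger
    feedback_btn.click(lambda *args: callback.flag(list(args)),
                       [feedback_radio], None, preprocess=False)

demo.launch()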
tools/aws_functions.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Type
+from typing import Type, List
 import pandas as pd
 import boto3
 import tempfile
@@ -6,12 +6,11 @@ import os
 from tools.helper_functions import get_or_create_env_var
 
 PandasDataFrame = Type[pd.DataFrame]
-bucket_name=""
 
 # Get AWS credentials if required
-
+bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
-aws_var_default = "0"
+aws_var_default = "1"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')
 
@@ -156,4 +155,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
         out_message = "No password provided. Please ask the data team for access if you need this."
         print(out_message)
 
-    return files, out_message
+    return files, out_message
+
+def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+    """
+    Uploads a file from local machine to Amazon S3.
+
+    Args:
+    - local_file_path: Local file path(s) of the file(s) to upload.
+    - s3_key: Key (path) to the file in the S3 bucket.
+    - s3_bucket: Name of the S3 bucket.
+
+    Returns:
+    - Message as variable/printed to console
+    """
+    final_out_message = []
+
+    s3_client = boto3.client('s3')
+
+    if isinstance(local_file_paths, str):
+        local_file_paths = [local_file_paths]
+
+    for file in local_file_paths:
+        try:
+            # Get file name off file path
+            file_name = os.path.basename(file)
+
+            s3_key_full = s3_key + file_name
+            print("S3 key: ", s3_key_full)
+
+            s3_client.upload_file(file, s3_bucket, s3_key_full)
+            out_message = "File " + file_name + " uploaded successfully to S3!"
+            print(out_message)
+
+        except Exception as e:
+            out_message = f"Error uploading file(s) to S3: {e}"
+            print(out_message)
+
+        final_out_message.append(out_message)
+    final_out_message_str = '\n'.join(final_out_message)
+
+    return final_out_message_str
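
For reference, the new upload_file_to_s3 helper accepts a single path or a list of paths and builds each S3 object key by appending the file's base name to the supplied prefix. A hypothetical call (bucket name and paths are placeholders; boto3 must be able to find AWS credentials in the environment):

# bucket_name defaults to "" above, so pass the bucket explicitly
status = upload_file_to_s3(local_file_paths=["feedback/log.csv"],
                           s3_key="feedback/",
                           s3_bucket="my-example-bucket")
print(status)  # one success/error line per file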
tools/data_anonymise.py CHANGED
@@ -5,11 +5,10 @@ import time
 import pandas as pd
 
 from faker import Faker
-
 from gradio import Progress
-from typing import List
+from typing import List, Dict, Any
 
-from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
@@ -24,6 +23,76 @@ fake = Faker("en_UK")
 def fake_first_name(x):
     return fake.first_name()
 
+# Writing decision making process to file
+def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
+    """
+    Generate a detailed output of the decision process for entity recognition.
+
+    This function takes the results from the analyzer and the original data dictionary,
+    and produces a string output detailing the decision process for each recognized entity.
+    It includes information such as entity type, position, confidence score, and the context
+    in which the entity was found.
+
+    Args:
+        analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
+        df_dict (Dict[str, List[Any]]): The original data in dictionary format.
+
+    Returns:
+        str: A string containing the detailed decision process output.
+    """
+    decision_process_output = []
+    keys_to_keep = ['entity_type', 'start', 'end']
+
+    def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
+        output = []
+
+        if hasattr(result, 'value'):
+            text = result.value[data_row]
+        else:
+            text = ""
+
+        if isinstance(recognizer_result, list):
+            for sub_result in recognizer_result:
+                if isinstance(text, str):
+                    found_text = text[sub_result.start:sub_result.end]
+                else:
+                    found_text = ''
+                analysis_explanation = {key: sub_result.__dict__[key] for key in keys_to_keep}
+                analysis_explanation.update({
+                    'data_row': str(data_row),
+                    'column': list(df_dict.keys())[dictionary_key],
+                    'entity': found_text
+                })
+                output.append(str(analysis_explanation))
+
+        return output
+
+    #print("Analyser results:", analyzer_results)
+
+    # Run through each column to analyse for PII
+    for i, result in enumerate(analyzer_results):
+        print("Looking at result:", str(i))
+
+        # If a single result
+        if isinstance(result, RecognizerResult):
+            decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
+
+        # If a list of results
+        elif isinstance(result, List):
+            for x, recognizer_result in enumerate(result.recognizer_results):
+                decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
+
+        else:
+            try:
+                decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
+            except Exception as e:
+                print(e)
+
+    decision_process_output_str = '\n'.join(decision_process_output)
+
+    return decision_process_output_str
+
 def anon_consistent_names(df):
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")
@@ -118,6 +187,9 @@ def anon_consistent_names(df):
 
 def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
 
+    print("Identifying personal information")
+    analyse_tic = time.perf_counter()
+
     key_string = ""
 
     # DataFrame to dict
@@ -133,34 +205,26 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
 
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
 
-    # analyzer_results = batch_analyzer.analyze_dict(df_dict, language=language,
-    #                                                entities=chosen_redact_entities,
-    #                                                score_threshold=score_threshold,
-    #                                                return_decision_process=False,
-    #                                                in_allow_list=in_allow_list_flat)
-
-    print("Identifying personal information")
-    analyse_tic = time.perf_counter()
-
-    print("Allow list:", in_allow_list)
+    #print("Allow list:", in_allow_list)
+    #print("Input data keys:", df_dict.keys())
 
     # Use custom analyzer to be able to track progress with Gradio
     analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                     entities=chosen_redact_entities,
                                     score_threshold=score_threshold,
-                                    return_decision_process=False,
+                                    return_decision_process=True,
                                     allow_list=in_allow_list_flat)
+
     analyzer_results = list(analyzer_results)
-    #analyzer_results
+
+    # Usage in the main function:
+    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
 
     analyse_toc = time.perf_counter()
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
 
-
-
     # Create faker function (note that it has to receive a value)
-
     fake = Faker("en_UK")
 
     def fake_first_name(x):
@@ -197,7 +261,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
 
-    return scrubbed_df, key_string
+    return scrubbed_df, key_string, decision_process_output_str
 
 def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
@@ -238,7 +302,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
     anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
 
     # Anonymise the selected columns
-    anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
+    anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
 
     # Rejoin the dataframe together
     anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -261,11 +325,20 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
         # Write each DataFrame to a different worksheet.
         anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
 
+        decision_process_log_output_file = anon_xlsx_export_file_name + "decision_process_output.txt"
+        with open(decision_process_log_output_file, "w") as f:
+            f.write(decision_process_output_str)
+
     else:
         anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
         anon_df_out.to_csv(anon_export_file_name, index = None)
 
+        decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
+        with open(decision_process_log_output_file, "w") as f:
+            f.write(decision_process_output_str)
+
     out_file_paths.append(anon_export_file_name)
+    out_file_paths.append(decision_process_log_output_file)
 
     # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
     out_file_paths = list(set(out_file_paths))
@@ -276,10 +349,16 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
 
     return out_file_paths, out_message, key_string
 
-def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str):
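
The return_decision_process=True flag switched on above is a standard presidio-analyzer option: each RecognizerResult then carries an analysis_explanation recording which recognizer fired and why, which is the raw material the new generate_decision_process_output log is built from. A minimal sketch against the plain AnalyzerEngine rather than the app's analyze_dict/BatchAnalyzerEngine wrapper:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(text="My name is David and I live in Miami",
                           language="en",
                           return_decision_process=True)
for res in results:
    # entity_type/start/end mirror the keys_to_keep fields written to the log file
    print(res.entity_type, res.start, res.end, res.score)
    print(res.analysis_explanation.textual_explanation)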
tools/file_conversion.py CHANGED
@@ -3,7 +3,7 @@ from tools.helper_functions import get_file_path_end, output_folder
 from PIL import Image
 import os
 from gradio import Progress
-from typing import List
+from typing import List, Optional
 
 def is_pdf_or_image(filename):
     """
@@ -55,6 +55,7 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
 
         # If no images are returned, break the loop
         if not image:
+            print("Conversion of page", str(page_num), "to file failed.")
             break
 
         images.extend(image)
@@ -74,6 +75,7 @@ def process_file(file_path):
         print(f"{file_path} is an image file.")
         # Perform image processing here
         img_object = [Image.open(file_path)]
+        # Load images from the file paths
 
     # Check if the file is a PDF
     elif file_extension == '.pdf':
@@ -85,37 +87,79 @@ def process_file(file_path):
         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
 
-    # print('Image object is:', img_object)
+    print('Image object is:', img_object)
 
     return img_object
 
-def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
+
+
+def prepare_image_or_text_pdf(
+    file_paths: List[str],
+    in_redact_method: str,
+    in_allow_list: Optional[List[List[str]]] = None,
+    latest_file_completed: int = 0,
+    out_message: List[str] = [],
+    first_loop_state: bool = False,
+    progress: Progress = Progress(track_tqdm=True)
+) -> tuple[List[str], List[str]]:
+    """
+    Prepare and process image or text PDF files for redaction.
+
+    This function takes a list of file paths, processes each file based on the specified redaction method,
+    and returns the output messages and processed file paths.
+
+    Args:
+        file_paths (List[str]): List of file paths to process.
+        in_redact_method (str): The redaction method to use.
+        in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
+        latest_file_completed (int): Index of the last completed file.
+        out_message (List[str]): List to store output messages.
+        first_loop_state (bool): Flag indicating if this is the first iteration.
+        progress (Progress): Progress tracker for the operation.
+
+    Returns:
+        tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
+    """
 
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #if isinstance(out_message, str):
     #    out_message = [out_message]
 
+
+
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+    else:
+        print("Now attempting file:", str(latest_file_completed + 1))
+        out_file_paths = []
+
     if not file_paths:
         file_paths = []
 
-    out_file_paths = file_paths
+    #out_file_paths = file_paths
 
     latest_file_completed = int(latest_file_completed)
 
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed == len(out_file_paths):
+    if latest_file_completed == len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
         #final_out_message = '\n'.join(out_message)
         return out_message, out_file_paths
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
-    file_paths_loop = [out_file_paths[int(latest_file_completed)]]
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
+    print("file_paths_loop:", str(file_paths_loop))
 
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
         file_path = file.name
 
+        print("file_path:", file_path)
+
         #if file_path:
         #    file_path_without_ext = get_file_path_end(file_path)
         if not file_path:
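
The first_loop_state flag added to this signature (and to the other processing functions) drives a counter-based loop across uploaded files: the button click runs once with first_loop_state=True to reset state, the function bumps a Number component when a file finishes, and that component's .change event re-runs the function with second_loop_state (False) until the counter equals the file count. A stripped-down sketch of the pattern with hypothetical names (the real app ends the loop by returning the counter unchanged, as here):

import gradio as gr

def process_next(files, files_done, first_loop):
    if first_loop:                      # button click: reset the loop
        files_done = 0
    files_done = int(files_done)
    if not files or files_done == len(files):
        return "All files processed", files_done   # unchanged counter ends the loop
    print("Processing:", files[files_done])        # the app reads file.name on each entry
    return f"Finished file {files_done + 1}", files_done + 1

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    run_btn = gr.Button("Run")
    status = gr.Textbox()
    files_done = gr.Number(value=0)
    first_loop_state = gr.State(True)
    second_loop_state = gr.State(False)

    run_btn.click(process_next, [in_files, files_done, first_loop_state], [status, files_done])
    # Each change to files_done re-triggers processing for the next file
    files_done.change(process_next, [in_files, files_done, second_loop_state], [status, files_done])

demo.launch()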
tools/file_redaction.py CHANGED
@@ -1,4 +1,4 @@
-from PIL import Image
+from PIL import Image, ImageChops, ImageDraw
 from typing import List
 import pandas as pd
 from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
@@ -14,13 +14,20 @@ from collections import defaultdict # For efficient grouping
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 from tools.helper_functions import get_file_path_end, output_folder
 from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
+from tools.data_anonymise import generate_decision_process_output
 import gradio as gr
 
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If this is the first time around, set variables to 0/blank
+    if first_loop_state==True:
+        latest_file_completed = 0
+        out_message = []
+        out_file_paths = []
+
     # If out message is string or out_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str):
         out_message = [out_message]
@@ -44,14 +51,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
 
-    print("File paths:", file_paths)
+    #print("File paths:", file_paths)
 
    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
        file_path = file.name
 
        if file_path:
            file_path_without_ext = get_file_path_end(file_path)
-           if is_pdf(file_path) == False:
+           is_a_pdf = is_pdf(file_path) == True
+           if is_a_pdf == False:
                # If user has not submitted a pdf, assume it's an image
                print("File is not a pdf, assuming that image analysis needs to be used.")
                in_redact_method = "Image analysis"
@@ -65,13 +73,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
            # if is_pdf_or_image(file_path) == False:
            #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
 
-           print("Redacting file as image-based pdf")
-           pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
+           print("Redacting file as image-based file")
+           pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf)
            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
 
            out_file_paths.append(out_image_file_path)
-           out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
+           out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file")
+
+           output_logs_str = str(output_logs)
+           logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
+           with open(logs_output_file_name, "w") as f:
+               f.write(output_logs_str)
+           out_file_paths.append(logs_output_file_name)
 
            # Increase latest file completed count unless we are at the last file
            if latest_file_completed != len(file_paths):
@@ -84,12 +98,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 
            # Analyse text-based pdf
            print('Redacting file as text-based PDF')
-           pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
+           pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
            out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
            pdf_text.save(out_text_file_path)
 
            #out_file_paths.append(out_text_file_path)
-           out_message_new = "File " + file_path_without_ext + " successfully redacted."
+           out_message_new = "File " + file_path_without_ext + " successfully redacted"
            out_message.append(out_message_new)
 
            # Convert message
@@ -101,6 +115,12 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
            img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
            out_file_paths.extend(img_output_file_path)
 
+           output_logs_str = str(output_logs)
+           logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
+           with open(logs_output_file_name, "w") as f:
+               f.write(output_logs_str)
+           out_file_paths.append(logs_output_file_name)
+
            # Add confirmation for converting to image if you want
            # out_message.append(img_output_summary)
 
@@ -138,7 +158,7 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
         merged_box = group[0]
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
-                print("Merging a box")
+                #print("Merging a box")
                 # Calculate new dimensions for the merged box
                 new_left = min(merged_box.left, next_box.left)
                 new_top = min(merged_box.top, next_box.top)
@@ -154,16 +174,14 @@ def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
         merged_bboxes.append(merged_box)
     return merged_bboxes
 
-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
+def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, progress=Progress(track_tqdm=True)):
     '''
     Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
-    from PIL import Image, ImageChops, ImageDraw
 
     fill = (0, 0, 0)
 
     if not image_paths:
-
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
         #progress(0, desc=out_message)
@@ -180,12 +198,12 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
     for i in range(0, number_of_pages):
 
-        print("Redacting page ", str(i + 1))
+        print("Redacting page", str(i + 1))
 
         # Get the image to redact using PIL lib (pillow)
-        image = image_paths[i] #Image.open(image_paths[i])
+        #print("image_paths:", image_paths)
 
-        image = ImageChops.duplicate(image)
+        image = ImageChops.duplicate(image_paths[i])
 
         # %%
         image_analyser = ImageAnalyzerEngine(nlp_analyser)
@@ -200,16 +218,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             "allow_list": allow_list,
             "language": language,
             "entities": chosen_redact_entities,
-            "score_threshold": score_threshold
+            "score_threshold": score_threshold,
+            "return_decision_process":True,
         })
 
+        # Text placeholder in this processing step, as the analyze method does not return the OCR text
+        if bboxes:
+            decision_process_output_str = str(bboxes)
+            print("Decision process:", decision_process_output_str)
+
         #print("For page: ", str(i), "Bounding boxes: ", bboxes)
 
         draw = ImageDraw.Draw(image)
 
         merged_bboxes = merge_img_bboxes(bboxes)
 
-        print("For page: ", str(i), "Merged bounding boxes: ", merged_bboxes)
+        #print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
 
         # 3. Draw the merged boxes (unchanged)
         for box in merged_bboxes:
@@ -221,7 +245,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 
         images.append(image)
 
-    return images
+    return images, decision_process_output_str
 
 def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
@@ -242,7 +266,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
 
     #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
     for page in pdf.pages:
-        print("Page number is: ", page_num + 1)
+        print("Page number is:", page_num + 1)
 
         annotations_on_page = []
         analyzed_bounding_boxes = []
@@ -261,8 +285,11 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                 language=language,
                 entities=chosen_redact_entities,
                 score_threshold=score_threshold,
-                return_decision_process=False,
+                return_decision_process=True,
                 allow_list=allow_list)
+
+
+
 
             characters = [char # This is what we want to include in the list
                 for line in text_container # Loop through each line in text_container
@@ -292,7 +319,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                     current_box = char_box
                     current_y = char_box[1]
                 else: # Now we have previous values to compare
-                    print("Comparing values")
+                    #print("Comparing values")
                     vertical_diff_bboxes = abs(char_box[1] - current_y)
                     horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
                     #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
@@ -303,9 +330,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                     ):
                         old_right_pos = current_box[2]
                         current_box[2] = char_box[2]
-
-                        print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
-
                     else:
                         merged_bounding_boxes.append(
                             {"boundingBox": current_box, "result": result})
@@ -324,13 +348,17 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
             combined_analyzer_results.extend(analyzer_results)
 
             if len(analyzer_results) > 0:
+                #decision_process_output_str = generate_decision_process_output(analyzer_results, {'text':text_to_analyze})
+                #print("Decision process:", decision_process_output_str)
                 # Create summary df of annotations to be made
                 analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
                 analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
                 analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
                 analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
                 analyzed_bounding_boxes_df_new['page'] = page_num + 1
-                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
+                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
+
+                print('analyzed_bounding_boxes_df:', analyzed_bounding_boxes_df)
 
             for analyzed_bounding_box in analyzed_bounding_boxes:
                 bounding_box = analyzed_bounding_box["boundingBox"]
@@ -352,11 +380,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
 
         annotations_all_pages.extend([annotations_on_page])
 
-        print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
+        print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
        page.Annots = pdf.make_indirect(annotations_on_page)
 
        page_num += 1
-
-    analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
 
-    return pdf
+    return pdf, analyzed_bounding_boxes_df
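
For orientation, the image branch above wraps presidio-image-redactor: ImageAnalyzerEngine.analyze() OCRs the page image and returns ImageRecognizerResult objects with pixel coordinates, which are then merged and painted over with PIL. A minimal sketch, assuming presidio-image-redactor with a working Tesseract OCR install (file names are hypothetical; the app passes its own nlp_analyser to the engine):

from PIL import Image, ImageDraw
from presidio_image_redactor import ImageAnalyzerEngine

image = Image.open("page_1.png")
image_analyser = ImageAnalyzerEngine()

# Extra keyword arguments are forwarded to the underlying text analyzer
bboxes = image_analyser.analyze(image, language="en",
                                entities=["PERSON", "PHONE_NUMBER"],
                                score_threshold=0.5)

draw = ImageDraw.Draw(image)
for box in bboxes:
    # Each result carries the pixel coordinates of a detected entity
    draw.rectangle([box.left, box.top, box.left + box.width, box.top + box.height],
                   fill=(0, 0, 0))
image.save("page_1_redacted.png")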
tools/helper_functions.py CHANGED
@@ -139,6 +139,23 @@ def add_folder_to_path(folder_path: str):
     else:
         print(f"Folder not found at {folder_path} - not added to PATH")
 
+# Upon running a process, the feedback buttons are revealed
+def reveal_feedback_buttons():
+    return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True)
+
+def wipe_logs(feedback_logs_loc, usage_logs_loc):
+    try:
+        os.remove(feedback_logs_loc)
+    except Exception as e:
+        print("Could not remove feedback logs file", e)
+    try:
+        os.remove(usage_logs_loc)
+    except Exception as e:
+        print("Could not remove usage logs file", e)
+
+
+
+
 async def get_connection_params(request: gr.Request):
     base_folder = ""
161