seanpedrickcase committed
Commit 8235bbb · 1 Parent(s): f0f9378

Improved logging

README.md CHANGED
@@ -11,13 +11,15 @@ license: agpl-3.0

# Document redaction

- Redact personal information from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost, so please only use for more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
+ Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally, you can choose the method for PII identification: 'Local' gives quick, lower-quality results; AWS Comprehend gives better results but has a cost.
+
+ See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.

You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.

NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.

- This app accepts a maximum file size of 50mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
+ This app accepts a maximum file size of 100mb. Please consider giving feedback on the quality of the answers underneath the redact buttons when the option appears; this will help to improve the app.

# USER GUIDE

app.py CHANGED
@@ -4,20 +4,20 @@ import socket
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

+ import gradio as gr
+ import pandas as pd
+ from datetime import datetime
from gradio_image_annotation import image_annotator

- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
- #from tools.aws_functions import load_data_from_aws
- import gradio as gr
- import pandas as pd

- from datetime import datetime
+
today_rev = datetime.now().strftime("%Y%m%d")

add_folder_to_path("tesseract/")
@@ -36,12 +36,10 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
language = 'en'

host_name = socket.gethostname()
-
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'

-
text_ocr_option = "Simple text analysis - PDFs with selectable text"
tesseract_ocr_option = "Quick image analysis - typed text"
textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
@@ -70,10 +68,6 @@ with app:
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
all_decision_process_table_state = gr.State(pd.DataFrame())

- def reset_state_vars():
-     return [], [], pd.DataFrame(), pd.DataFrame()
-
-
in_allow_list_state = gr.State(pd.DataFrame())

session_hash_state = gr.State()
@@ -88,25 +82,32 @@ with app:
output_image_files_state = gr.State([])
output_file_list_state = gr.State([])
text_output_file_list_state = gr.State([])
- log_files_output_list_state = gr.State([])
+ log_files_output_list_state = gr.State([])

+
# Logging state
- feedback_logs_state = gr.State(feedback_logs_folder + 'dataset1.csv') #'log.csv')
+ log_file_name = 'log.csv'
+
+ feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
- access_logs_state = gr.State(access_logs_folder + 'dataset1.csv') #'log.csv')
+ access_logs_state = gr.State(access_logs_folder + log_file_name)
access_s3_logs_loc_state = gr.State(access_logs_folder)
- usage_logs_state = gr.State(usage_logs_folder + 'dataset1.csv') #'log.csv')
+ usage_logs_state = gr.State(usage_logs_folder + log_file_name)
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
+ # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
+ session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
+ textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
+ comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
+
+ doc_file_name_textbox = gr.Textbox(label = "doc_file_name_textbox", value="", visible=False)
+ doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
+ data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
+
+ estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
+ annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on

- # Invisible elements effectively used as state variables
- session_hash_textbox = gr.Textbox(value="", visible=False) # Invisible text box to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
- textract_metadata_textbox = gr.Textbox(value="", visible=False)
- doc_file_name_textbox = gr.Textbox(value="", visible=False)
- doc_file_name_with_extension_textbox = gr.Textbox(value="", visible=False)
- data_file_name_textbox = gr.Textbox(value="", visible=False)
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
- estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
- annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on


###
@@ -114,17 +115,17 @@ with app:
###

gr.Markdown(
- """
- # Document redaction
+ """# Document redaction

- Redact personal information from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost, so please only use for more complex redaction tasks). Also see the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
+ Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally, you can choose the method for PII identification: 'Local' gives quick, lower-quality results; AWS Comprehend gives better results but has a cost.
+
+ See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.

You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.

NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.

- This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
- """)
+ This app accepts a maximum file size of 100mb. Please consider giving feedback on the quality of the answers underneath the redact buttons when the option appears; this will help to improve the app.""")

# PDF / IMAGES TAB
with gr.Tab("PDFs/images"):
@@ -148,7 +149,7 @@ with app:

# Feedback elements are invisible until revealed by redaction action
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
- pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
+ pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)

@@ -226,9 +227,6 @@ with app:
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")


-
-
-
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
    with gr.Row():
        in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
@@ -257,15 +255,15 @@ with app:
###
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])

- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")#.\
#then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])

# If the app has completed a batch of pages, it will run this until the end of all pages in the document
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])

# If a file has been completed, the function will continue onto the next document
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
@@ -321,27 +319,27 @@ with app:
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

# Log usernames and times of access to file (to know who is using the app when running on AWS)
- access_callback = gr.CSVLogger()
+ access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
access_callback.setup([session_hash_textbox], access_logs_folder)
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# User submitted feedback for pdf redactions
- pdf_callback = gr.CSVLogger()
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_doc_files], feedback_logs_folder)
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_doc_files], None, preprocess=False).\
+ pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], feedback_logs_folder)
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_textbox], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])

# User submitted feedback for data redactions
- data_callback = gr.CSVLogger()
- data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_logs_folder)
- data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
+ data_callback = gr.CSVLogger(dataset_file_name=log_file_name)
+ data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
+ data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])

# Log processing time/token usage when making a query
- usage_callback = gr.CSVLogger()
- usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], usage_logs_folder)
- estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox], None, preprocess=False).\
+ usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
+ usage_callback.setup([session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
+ estimated_time_taken_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# Launch the Gradio app
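
Editor's note: the logging pattern this commit standardises on is one `gr.CSVLogger` per log type, all writing to a shared `log.csv` that is then mirrored to S3. Below is a minimal standalone sketch of that pattern; the component names are illustrative rather than the app's full set, and it assumes a Gradio version whose `CSVLogger` accepts `dataset_file_name`, as the code above does.

```python
import gradio as gr

log_file_name = 'log.csv'      # shared dataset file name, as in the commit
usage_logs_folder = 'usage/'   # illustrative folder; the app builds this per day/host

with gr.Blocks() as demo:
    # Invisible components act as state whose values can be flagged to CSV.
    session_hash_textbox = gr.Textbox(label="session_hash_textbox", value="", visible=False)
    estimated_time_taken_number = gr.Number(label="estimated_time_taken_number", value=0.0, visible=False)

    # setup() registers the components whose values become CSV columns.
    usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
    usage_callback.setup([session_hash_textbox, estimated_time_taken_number], usage_logs_folder)

    # Append one CSV row whenever the tracked number changes.
    estimated_time_taken_number.change(
        lambda *args: usage_callback.flag(list(args)),
        [session_hash_textbox, estimated_time_taken_number],
        None,
        preprocess=False,
    )

demo.launch()
```

`CSVLogger` derives CSV headers from component labels, which is plausibly why this commit gives every invisible component an explicit label.
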
tools/custom_image_analyser_engine.py CHANGED
@@ -471,6 +471,7 @@ class CustomImageAnalyzerEngine:

horizontal_buffer = 0 # add pixels to right of width
height_buffer = 2 # add pixels to bounding box height
+ comprehend_query_number = 0

allow_list = text_analyzer_kwargs.get('allow_list', [])

@@ -494,6 +495,8 @@ class CustomImageAnalyzerEngine:
    LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
)

+ comprehend_query_number += 1
+
for result in response["Entities"]:
    result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]

@@ -577,7 +580,7 @@ class CustomImageAnalyzerEngine:

combined_results.extend(line_results)

- return combined_results
+ return combined_results, comprehend_query_number

@staticmethod
def map_analyzer_results_to_bounding_boxes(
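
Editor's note: the thread running through these changes is a per-document counter of AWS Comprehend calls, initialised once, bumped after each request, and returned alongside the analysis results. Stripped of the OCR machinery, the counting pattern looks roughly like this; `analyze_lines` is a hypothetical stand-in for `analyze_text`, and the boto3 client setup is an assumption (`detect_pii_entities` is the kind of Comprehend call the surrounding code appears to be making).

```python
import boto3

comprehend_client = boto3.client("comprehend")  # assumes AWS credentials are configured

def analyze_lines(lines, language="en"):
    comprehend_query_number = 0  # counter initialised once per document, as in analyze_text
    entities = []
    for text in lines:
        # One Comprehend request per OCR line; each request is billable, so count it.
        response = comprehend_client.detect_pii_entities(
            Text=text,
            LanguageCode=language,
        )
        comprehend_query_number += 1
        entities.extend(response["Entities"])
    # Return the counter alongside the results so callers can log usage.
    return entities, comprehend_query_number
```
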
tools/file_redaction.py CHANGED
@@ -91,6 +91,7 @@ def choose_and_run_redactor(file_paths:List[str],
91
  current_loop_page:int=0,
92
  page_break_return:bool=False,
93
  pii_identification_method:str="Local",
 
94
  progress=gr.Progress(track_tqdm=True)):
95
  '''
96
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -120,6 +121,7 @@ def choose_and_run_redactor(file_paths:List[str],
120
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
121
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
122
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
 
123
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
124
 
125
  The function returns a redacted document along with processing logs.
@@ -171,7 +173,7 @@ def choose_and_run_redactor(file_paths:List[str],
171
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
172
  print("Estimated total processing time:", str(estimate_total_processing_time))
173
 
174
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
175
 
176
  # If we have reached the last page, return message
177
  if current_loop_page >= number_of_pages:
@@ -181,7 +183,7 @@ def choose_and_run_redactor(file_paths:List[str],
181
  current_loop_page = 999
182
  combined_out_message = out_message
183
 
184
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
185
 
186
  # Create allow list
187
  if not in_allow_list.empty:
@@ -220,7 +222,7 @@ def choose_and_run_redactor(file_paths:List[str],
220
  out_message = "No file selected"
221
  print(out_message)
222
 
223
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
224
 
225
  if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
226
 
@@ -231,16 +233,16 @@ def choose_and_run_redactor(file_paths:List[str],
231
  except:
232
  out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
233
  print(out_message)
234
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
235
 
236
  #Analyse and redact image-based pdf or image
237
  if is_pdf_or_image(file_path) == False:
238
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
239
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages
240
 
241
  print("Redacting file " + file_path_without_ext + " as an image-based file")
242
 
243
- pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
244
  prepared_pdf_image_paths,
245
  language,
246
  chosen_redact_entities,
@@ -259,7 +261,8 @@ def choose_and_run_redactor(file_paths:List[str],
259
  all_line_level_ocr_results_df,
260
  all_decision_process_table,
261
  pymupdf_doc,
262
- pii_identification_method)
 
263
 
264
  # Save Textract request metadata (if exists)
265
  if new_request_metadata:
@@ -272,12 +275,12 @@ def choose_and_run_redactor(file_paths:List[str],
272
 
273
  if is_pdf(file_path) == False:
274
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
275
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
276
 
277
  # Analyse text-based pdf
278
  print('Redacting file as text-based PDF')
279
 
280
- pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
281
  prepared_pdf_image_paths,language,
282
  chosen_redact_entities,
283
  chosen_redact_comprehend_entities,
@@ -291,12 +294,13 @@ def choose_and_run_redactor(file_paths:List[str],
291
  all_line_level_ocr_results_df,
292
  all_decision_process_table,
293
  pymupdf_doc,
294
- pii_identification_method)
 
295
 
296
  else:
297
  out_message = "No redaction method selected"
298
  print(out_message)
299
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
300
 
301
  # If at last page, save to file
302
  if current_loop_page >= number_of_pages:
@@ -392,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
392
 
393
  # If textract requests made, write to logging file
394
  if all_request_metadata:
395
- all_request_metadata_str = '\n'.join(all_request_metadata)
396
 
397
  all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
398
 
@@ -412,7 +416,7 @@ def choose_and_run_redactor(file_paths:List[str],
412
  out_file_paths = list(set(out_file_paths))
413
 
414
 
415
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
416
 
417
  def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
418
  '''
@@ -769,9 +773,10 @@ def redact_image_pdf(file_path:str,
769
  all_decision_process_table = pd.DataFrame(),
770
  pymupdf_doc = [],
771
  pii_identification_method:str="Local",
 
772
  page_break_val:int=int(page_break_value),
773
  logging_file_paths:List=[],
774
- max_time:int=int(max_time_value),
775
  progress=Progress(track_tqdm=True)):
776
 
777
  '''
@@ -796,9 +801,10 @@ def redact_image_pdf(file_path:str,
796
  - all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
797
  - pymupdf_doc (List, optional): The document as a PyMupdf object.
798
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
 
799
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
800
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
801
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
802
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
803
 
804
  The function returns a fully or partially-redacted PDF document.
@@ -806,6 +812,7 @@ def redact_image_pdf(file_path:str,
806
  file_name = get_file_path_end(file_path)
807
  fill = (0, 0, 0) # Fill colour
808
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
809
 
810
  #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
811
 
@@ -836,7 +843,6 @@ def redact_image_pdf(file_path:str,
836
  if current_loop_page == 0: page_loop_start = 0
837
  else: page_loop_start = current_loop_page
838
 
839
- #progress_bar = progress.tqdm(range(page_loop_start, number_of_pages), unit="pages", desc="Redacting pages")
840
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
841
 
842
  for page_no in progress_bar:
@@ -872,8 +878,7 @@ def redact_image_pdf(file_path:str,
872
  page_width, page_height = image.size
873
 
874
  # Possibility to use different languages
875
- if language == 'en':
876
- ocr_lang = 'eng'
877
  else: ocr_lang = language
878
 
879
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
@@ -943,7 +948,7 @@ def redact_image_pdf(file_path:str,
943
 
944
  pii_identification_method= "AWS Comprehend" #"Local"
945
 
946
- redaction_bboxes = image_analyser.analyze_text(
947
  line_level_ocr_results,
948
  line_level_ocr_results_with_children,
949
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
@@ -954,6 +959,8 @@ def redact_image_pdf(file_path:str,
954
  score_threshold=score_threshold
955
  )
956
 
 
 
957
  # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
958
  # line_level_ocr_results_with_children,
959
  # language,
@@ -1063,7 +1070,7 @@ def redact_image_pdf(file_path:str,
1063
 
1064
  current_loop_page += 1
1065
 
1066
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1067
 
1068
  if is_pdf(file_path) == False:
1069
  images.append(image)
@@ -1079,9 +1086,9 @@ def redact_image_pdf(file_path:str,
1079
  progress.close(_tqdm=progress_bar)
1080
  tqdm._instances.clear()
1081
 
1082
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1083
 
1084
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1085
 
1086
 
1087
  ###
@@ -1299,7 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
1299
  '''
1300
  Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
1301
  '''
1302
-
1303
  analyser_results = []
1304
 
1305
  #text_to_analyse = initial_clean(text_container.text).strip()
@@ -1323,6 +1330,8 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
1323
  LanguageCode=language # Specify the language of the text
1324
  )
1325
 
 
 
1326
  for result in response["Entities"]:
1327
 
1328
  result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
@@ -1340,7 +1349,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
1340
  analyser_results = []
1341
 
1342
 
1343
- return analyser_results
1344
 
1345
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1346
  decision_process_table = pd.DataFrame()
@@ -1397,6 +1406,7 @@ def redact_text_pdf(
1397
  all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
1398
  pymupdf_doc: List = [], # List of PyMuPDF documents
1399
  pii_identification_method: str = "Local",
 
1400
  page_break_val: int = int(page_break_value), # Value for page break
1401
  max_time: int = int(max_time_value),
1402
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1422,12 +1432,14 @@ def redact_text_pdf(
1422
  - all_decision_process_table: DataFrame for decision process table
1423
  - pymupdf_doc: List of PyMuPDF documents
1424
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
 
1425
  - page_break_val: Value for page break
1426
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1427
  - progress: Progress tracking object
1428
  '''
1429
 
1430
  tic = time.perf_counter()
 
1431
 
1432
  # Open with Pikepdf to get text lines
1433
  pikepdf_pdf = Pdf.open(filename)
@@ -1517,7 +1529,9 @@ def redact_text_pdf(
1517
 
1518
  if chosen_redact_entities:
1519
 
1520
- text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
 
 
1521
 
1522
  else:
1523
  text_line_analyser_result = []
@@ -1576,7 +1590,7 @@ def redact_text_pdf(
1576
 
1577
  current_loop_page += 1
1578
 
1579
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
1580
 
1581
 
1582
  annotations_all_pages.append(image_annotations)
@@ -1588,7 +1602,7 @@ def redact_text_pdf(
1588
  page_break_return = True
1589
  progress.close(_tqdm=progress_bar)
1590
 
1591
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
1592
 
1593
 
1594
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
 
91
  current_loop_page:int=0,
92
  page_break_return:bool=False,
93
  pii_identification_method:str="Local",
94
+ comprehend_query_number:int=0,
95
  progress=gr.Progress(track_tqdm=True)):
96
  '''
97
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
121
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
122
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
123
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
124
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
125
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
126
 
127
  The function returns a redacted document along with processing logs.
 
173
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
174
  print("Estimated total processing time:", str(estimate_total_processing_time))
175
 
176
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
177
 
178
  # If we have reached the last page, return message
179
  if current_loop_page >= number_of_pages:
 
183
  current_loop_page = 999
184
  combined_out_message = out_message
185
 
186
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
187
 
188
  # Create allow list
189
  if not in_allow_list.empty:
 
222
  out_message = "No file selected"
223
  print(out_message)
224
 
225
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
226
 
227
  if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
228
 
 
233
  except:
234
  out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
235
  print(out_message)
236
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
237
 
238
  #Analyse and redact image-based pdf or image
239
  if is_pdf_or_image(file_path) == False:
240
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
241
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
242
 
243
  print("Redacting file " + file_path_without_ext + " as an image-based file")
244
 
245
+ pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
246
  prepared_pdf_image_paths,
247
  language,
248
  chosen_redact_entities,
 
261
  all_line_level_ocr_results_df,
262
  all_decision_process_table,
263
  pymupdf_doc,
264
+ pii_identification_method,
265
+ comprehend_query_number)
266
 
267
  # Save Textract request metadata (if exists)
268
  if new_request_metadata:
 
275
 
276
  if is_pdf(file_path) == False:
277
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
278
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
279
 
280
  # Analyse text-based pdf
281
  print('Redacting file as text-based PDF')
282
 
283
+ pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(file_path,
284
  prepared_pdf_image_paths,language,
285
  chosen_redact_entities,
286
  chosen_redact_comprehend_entities,
 
294
  all_line_level_ocr_results_df,
295
  all_decision_process_table,
296
  pymupdf_doc,
297
+ pii_identification_method,
298
+ comprehend_query_number)
299
 
300
  else:
301
  out_message = "No redaction method selected"
302
  print(out_message)
303
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
304
 
305
  # If at last page, save to file
306
  if current_loop_page >= number_of_pages:
 
396
 
397
  # If textract requests made, write to logging file
398
  if all_request_metadata:
399
+ all_request_metadata_str = '\n'.join(all_request_metadata).strip()
400
 
401
  all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
402
 
 
416
  out_file_paths = list(set(out_file_paths))
417
 
418
 
419
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
420
 
421
  def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
422
  '''
 
773
  all_decision_process_table = pd.DataFrame(),
774
  pymupdf_doc = [],
775
  pii_identification_method:str="Local",
776
+ comprehend_query_number:int=0,
777
  page_break_val:int=int(page_break_value),
778
  logging_file_paths:List=[],
779
+ max_time:int=int(max_time_value),
780
  progress=Progress(track_tqdm=True)):
781
 
782
  '''
 
801
  - all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
802
  - pymupdf_doc (List, optional): The document as a PyMupdf object.
803
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
804
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
805
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
806
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
807
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
808
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
809
 
810
  The function returns a fully or partially-redacted PDF document.
 
812
  file_name = get_file_path_end(file_path)
813
  fill = (0, 0, 0) # Fill colour
814
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
815
+ comprehend_query_number_new = 0
816
 
817
  #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
818
 
 
843
  if current_loop_page == 0: page_loop_start = 0
844
  else: page_loop_start = current_loop_page
845
 
 
846
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
847
 
848
  for page_no in progress_bar:
 
878
  page_width, page_height = image.size
879
 
880
  # Possibility to use different languages
881
+ if language == 'en': ocr_lang = 'eng'
 
882
  else: ocr_lang = language
883
 
884
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
 
948
 
949
  pii_identification_method= "AWS Comprehend" #"Local"
950
 
951
+ redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
952
  line_level_ocr_results,
953
  line_level_ocr_results_with_children,
954
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
 
959
  score_threshold=score_threshold
960
  )
961
 
962
+ comprehend_query_number = comprehend_query_number_new
963
+
964
  # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
965
  # line_level_ocr_results_with_children,
966
  # language,
 
1070
 
1071
  current_loop_page += 1
1072
 
1073
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1074
 
1075
  if is_pdf(file_path) == False:
1076
  images.append(image)
 
1086
  progress.close(_tqdm=progress_bar)
1087
  tqdm._instances.clear()
1088
 
1089
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1090
 
1091
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1092
 
1093
 
1094
  ###
 
1306
  '''
1307
  Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
1308
  '''
1309
+ comprehend_query_number = 0
1310
  analyser_results = []
1311
 
1312
  #text_to_analyse = initial_clean(text_container.text).strip()
 
1330
  LanguageCode=language # Specify the language of the text
1331
  )
1332
 
1333
+ comprehend_query_number += 1
1334
+
1335
  for result in response["Entities"]:
1336
 
1337
  result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]] # EndOffset is exclusive, so no +1
 
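The hunk above increments the counter immediately after each Comprehend call. The diff does not show which Comprehend operation is invoked; assuming the natural fit for PII work, `detect_pii_entities`, a self-contained sketch of the call-plus-count pattern looks like this (Comprehend offsets slice the entity as `text[BeginOffset:EndOffset]`):

```python
import boto3

comprehend_client = boto3.client("comprehend")  # assumes AWS credentials are configured

def detect_pii_with_count(text_to_analyse: str, language: str = "en"):
    """Run one Comprehend query and return (entities, queries_made)."""
    response = comprehend_client.detect_pii_entities(
        Text=text_to_analyse,
        LanguageCode=language,
    )
    comprehend_query_number = 1  # exactly one billable query made above

    entities = []
    for result in response["Entities"]:
        entities.append({
            "type": result["Type"],
            "score": result["Score"],
            # EndOffset points one past the last character of the entity
            "text": text_to_analyse[result["BeginOffset"]:result["EndOffset"]],
        })
    return entities, comprehend_query_number
```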
1349
  analyser_results = []
1350
 
1351
 
1352
+ return analyser_results, comprehend_query_number
1353
 
1354
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1355
  decision_process_table = pd.DataFrame()
 
1406
  all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
1407
  pymupdf_doc: List = [], # List of PyMuPDF documents
1408
  pii_identification_method: str = "Local",
1409
+ comprehend_query_number:int = 0,
1410
  page_break_val: int = int(page_break_value), # Value for page break
1411
  max_time: int = int(max_time_value),
1412
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1432
  - all_decision_process_table: DataFrame for decision process table
1433
  - pymupdf_doc: List of PyMuPDF documents
1434
  - pii_identification_method (str, optional): The method used to identify personal information for redaction. Either 'Local' (spaCy model) or 'AWS Comprehend' (AWS Comprehend API).
1435
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1436
  - page_break_val: Value for page break
1437
+ - max_time (int, optional): The maximum time in seconds that the function should run before it breaks, to avoid timeout errors with some APIs.
1438
  - progress: Progress tracking object
1439
  '''
1440
 
1441
  tic = time.perf_counter()
1442
+ comprehend_query_number_new = 0
1443
 
1444
  # Open with Pikepdf to get text lines
1445
  pikepdf_pdf = Pdf.open(filename)
 
1529
 
1530
  if chosen_redact_entities:
1531
 
1532
+ text_line_analyser_result, comprehend_query_number_new = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
1533
+
1534
+ comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1535
 
1536
  else:
1537
  text_line_analyser_result = []
 
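Unlike the image path, which overwrites its counter with the latest value, this text path adds each line's query count to a running total. A distilled sketch of that accumulation (the `analyse` callable stands in for identify_pii_in_text_container):

```python
from typing import Callable, List, Tuple

def analyse_all_lines(
    text_lines: List[str],
    analyse: Callable[[str], Tuple[list, int]],
) -> Tuple[list, int]:
    """Accumulate per-line analyser results and the running query count."""
    all_results: list = []
    comprehend_query_number = 0
    for text_line in text_lines:
        line_results, comprehend_query_number_new = analyse(text_line)
        all_results.extend(line_results)
        comprehend_query_number += comprehend_query_number_new
    return all_results, comprehend_query_number
```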
1590
 
1591
  current_loop_page += 1
1592
 
1593
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1594
 
1595
 
1596
  annotations_all_pages.append(image_annotations)
 
1602
  page_break_return = True
1603
  progress.close(_tqdm=progress_bar)
1604
 
1605
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1606
 
1607
 
1608
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
tools/helper_functions.py CHANGED
@@ -4,6 +4,9 @@ import gradio as gr
4
  import pandas as pd
5
  import unicodedata
6
 
 
 
 
7
  def get_or_create_env_var(var_name, default_value):
8
  # Get the environment variable if it exists
9
  value = os.environ.get(var_name)
@@ -183,64 +186,60 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
183
  async def get_connection_params(request: gr.Request):
184
  base_folder = ""
185
 
186
- if request:
187
- #print("request user:", request.username)
188
-
189
- #request_data = await request.json() # Parse JSON body
190
- #print("All request data:", request_data)
191
- #context_value = request_data.get('context')
192
- #if 'context' in request_data:
193
- # print("Request context dictionary:", request_data['context'])
194
-
195
- # print("Request headers dictionary:", request.headers)
196
- # print("All host elements", request.client)
197
- # print("IP address:", request.client.host)
198
- # print("Query parameters:", dict(request.query_params))
199
- # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
200
- #print("Request dictionary to object:", request.request.body())
201
- print("Session hash:", request.session_hash)
202
-
203
- # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
204
- CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
205
- #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
206
-
207
- # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
208
- CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
209
- #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
210
-
211
- if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
212
- if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
213
- supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
214
- if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
215
- print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
216
- else:
217
- raise(ValueError, "Custom Cloudfront header value does not match expected value.")
218
-
219
- # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
220
-
221
- if request.username:
222
- out_session_hash = request.username
223
- base_folder = "user-files/"
224
- print("Request username found:", out_session_hash)
225
-
226
- elif 'x-cognito-id' in request.headers:
227
- out_session_hash = request.headers['x-cognito-id']
228
- base_folder = "user-files/"
229
- print("Cognito ID found:", out_session_hash)
230
 
231
- else:
232
- out_session_hash = request.session_hash
233
- base_folder = "temp-files/"
234
- # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
235
 
236
- output_folder = base_folder + out_session_hash + "/"
237
- #if bucket_name:
238
- # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
239
 
240
- return out_session_hash, output_folder, out_session_hash
241
- else:
242
- print("No session parameters found.")
243
- return "",""
244
 
245
 
246
  def clean_unicode_text(text):
 
4
  import pandas as pd
5
  import unicodedata
6
 
7
+ def reset_state_vars():
8
+ return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""
9
+
10
  def get_or_create_env_var(var_name, default_value):
11
  # Get the environment variable if it exists
12
  value = os.environ.get(var_name)
 
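`reset_state_vars` returns one fresh value per Gradio state component so that a single event can clear them all. A hypothetical wiring (the component names are mine, not the app's; the return order must match the `outputs` list):

```python
import gradio as gr
import pandas as pd

def reset_state_vars():
    # Order must match the `outputs` list below.
    return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""

with gr.Blocks() as demo:
    annotations_state = gr.State([])
    images_state = gr.State([])
    decision_table_state = gr.State(pd.DataFrame())
    ocr_results_state = gr.State(pd.DataFrame())
    current_loop_page_state = gr.State(0)
    request_metadata_state = gr.State("")

    new_task_btn = gr.Button("Start new redaction task")
    new_task_btn.click(
        reset_state_vars,
        inputs=None,
        outputs=[annotations_state, images_state, decision_table_state,
                 ocr_results_state, current_loop_page_state, request_metadata_state],
    )
```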
186
  async def get_connection_params(request: gr.Request):
187
  base_folder = ""
188
 
189
+ #print("request user:", request.username)
190
+
191
+ #request_data = await request.json() # Parse JSON body
192
+ #print("All request data:", request_data)
193
+ #context_value = request_data.get('context')
194
+ #if 'context' in request_data:
195
+ # print("Request context dictionary:", request_data['context'])
196
+
197
+ # print("Request headers dictionary:", request.headers)
198
+ # print("All host elements", request.client)
199
+ # print("IP address:", request.client.host)
200
+ # print("Query parameters:", dict(request.query_params))
201
+ # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
202
+ #print("Request dictionary to object:", request.request.body())
203
+ print("Session hash:", request.session_hash)
204
+
205
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
206
+ CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
207
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
208
+
209
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
210
+ CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
211
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
212
+
213
+ if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
214
+ if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
215
+ supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
216
+ if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
217
+ print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
218
+ else:
219
+ raise ValueError("Custom Cloudfront header value does not match expected value.")
220
+
221
+ # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
222
+
223
+ if request.username:
224
+ out_session_hash = request.username
225
+ base_folder = "user-files/"
226
+ print("Request username found:", out_session_hash)
227
+
228
+ elif 'x-cognito-id' in request.headers:
229
+ out_session_hash = request.headers['x-cognito-id']
230
+ base_folder = "user-files/"
231
+ print("Cognito ID found:", out_session_hash)
 
232
 
233
+ else:
234
+ out_session_hash = request.session_hash
235
+ base_folder = "temp-files/"
236
+ # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
237
 
238
+ output_folder = base_folder + out_session_hash + "/"
239
+ #if bucket_name:
240
+ # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
241
 
242
+ return out_session_hash, output_folder, out_session_hash
 
 
 
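Stripped of the commented-out logging, the de-indented function resolves a per-user output folder through a three-step fallback. A condensed sketch of that logic (signature simplified; the real function is async and takes a gr.Request):

```python
from typing import Dict, Optional, Tuple

def resolve_output_folder(
    username: Optional[str],
    headers: Dict[str, str],
    session_hash: str,
) -> Tuple[str, str]:
    """Return (session_id, output_folder) using the diff's fallback order:
    Cognito username, then the x-cognito-id header, then the session hash."""
    if username:
        return username, "user-files/" + username + "/"
    if "x-cognito-id" in headers:
        cognito_id = headers["x-cognito-id"]
        return cognito_id, "user-files/" + cognito_id + "/"
    return session_hash, "temp-files/" + session_hash + "/"
```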
243
 
244
 
245
  def clean_unicode_text(text):