seanpedrickcase committed
Commit: e2aae24
Parent: e69ae00

Only shows AWS options when AWS functions are enabled. Previous review files can now be uploaded to continue a review later. Some review debugging.

Files changed:
- app.py (+43 -28)
- tools/aws_functions.py (+2 -6)
- tools/aws_textract.py (+7 -6)
- tools/custom_image_analyser_engine.py (+2 -2)
- tools/file_conversion.py (+96 -51)
- tools/file_redaction.py (+82 -105)
- tools/helper_functions.py (+10 -0)
- tools/redaction_review.py (+31 -24)
app.py
CHANGED
@@ -9,11 +9,11 @@ import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons,
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions,
+from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -45,12 +45,6 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
 usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
 
-text_ocr_option = "Simple text analysis - PDFs with selectable text"
-tesseract_ocr_option = "Quick image analysis - typed text"
-textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
-
-local_pii_detector = "Local"
-aws_pii_detector = "AWS Comprehend"
 
 if RUN_AWS_FUNCTIONS == "1":
     default_ocr_val = textract_option
@@ -104,7 +98,8 @@ with app:
     textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
     comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
 
-
+    doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+    doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
     doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
     data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
 
@@ -127,6 +122,9 @@ with app:
     zoom_true_bool = gr.State(True)
     zoom_false_bool = gr.State(False)
 
+    clear_all_page_redactions = gr.State(True)
+    prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
+
 
     ###
     # UI DESIGN
@@ -145,8 +143,12 @@ with app:
     with gr.Tab("PDFs/images"):
         with gr.Accordion("Redact document", open = True):
             in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
-
-
+            if RUN_AWS_FUNCTIONS == "1":
+                in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
+                pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+            else:
+                in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
+                pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
 
             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
             document_redact_btn = gr.Button("Redact document(s)", variant="primary")
@@ -178,6 +180,8 @@ with app:
             with gr.Row():
                 annotate_zoom_in = gr.Button("Zoom in")
                 annotate_zoom_out = gr.Button("Zoom out")
+            with gr.Row():
+                clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")
 
             annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
 
@@ -199,7 +203,8 @@ with app:
                 annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
                 annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
-            output_review_files = gr.File(label="Review output files")
+            output_review_files = gr.File(label="Review output files", file_count='multiple')
+            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")
 
         # TEXT / TABULAR DATA TAB
         with gr.Tab(label="Open text or Excel/csv files"):
@@ -231,6 +236,8 @@ with app:
             data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
             data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
+
+
         # SETTINGS TAB
         with gr.Tab(label="Redaction settings"):
             gr.Markdown(
@@ -272,10 +279,10 @@ with app:
     ###
     # PDF/IMAGE REDACTION
    ###
-    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[
+    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
-        then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
+        then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
            outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
@@ -283,10 +290,10 @@ with app:
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
-        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
+        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
        then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
     #     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -299,12 +306,12 @@ with app:
     # Page controls at top
     annotate_current_page.submit(
        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # Zoom in and out on annotator
     annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -313,20 +320,28 @@ with app:
     annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
        then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
 
-    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+    clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-    annotation_button_apply.click(apply_redactions, inputs=[annotator,
+    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
 
     # Page controls at bottom
     annotate_current_page_bottom.submit(
        modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
+
+    # Upload previous files for modifying redactions
+    upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
+        then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     ###
     # TABULAR DATA REDACTION
@@ -364,8 +379,8 @@ with app:
 
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text,
-    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text,
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
+    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
        then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
 
     # User submitted feedback for data redactions
@@ -376,8 +391,8 @@ with app:
 
     # Log processing time/token usage when making a query
     usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
-    usage_callback.setup([session_hash_textbox,
-    latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox,
+    usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
+    latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
        then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
     # Launch the Gradio app
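The conditional block added under the "Redact document" accordion is the core of the "only show AWS options when enabled" change: the AWS-priced choices are simply never constructed when RUN_AWS_FUNCTIONS is off. A minimal standalone sketch of the same pattern, with illustrative option labels rather than the app's exact strings:

import os
import gradio as gr

RUN_AWS_FUNCTIONS = os.environ.get("RUN_AWS_FUNCTIONS", "0")

local_ocr = "Local OCR"          # illustrative label
textract = "AWS Textract"        # illustrative label

with gr.Blocks() as demo:
    if RUN_AWS_FUNCTIONS == "1":
        # AWS enabled: offer the paid option alongside the local one
        extraction_method = gr.Radio(choices=[local_ocr, textract], value=textract, label="Text extraction method")
    else:
        # AWS disabled: build a radio without AWS choices at all
        extraction_method = gr.Radio(choices=[local_ocr], value=local_ocr, label="Text extraction method")

demo.launch()

Branching at construction time, as the commit does, means disabled AWS choices never reach the client, which is stronger than hiding an existing component with visible=False (the approach still used here for the PII dropdown in the non-AWS branch).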
tools/aws_functions.py
CHANGED
@@ -10,17 +10,13 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')
 
-
-    comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
-except Exception as e:
-    print(e)
-    comprehend_client = ""
+
 
 def get_assumed_role_info():
     sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
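Two things happen here: the RUN_AWS_FUNCTIONS default flips to "0" (AWS off unless explicitly enabled), and the module-level Comprehend client creation is removed, so importing this module no longer touches AWS. The body of get_or_create_env_var is not shown in this diff; a hypothetical sketch of what such a helper typically does, under that assumption:

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Read the variable; if unset, seed it with the default so later readers agree
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")  # off unless explicitly enabled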
tools/aws_textract.py
CHANGED
@@ -23,15 +23,16 @@ def extract_textract_metadata(response):
        #'NumberOfPages': number_of_pages
    })
 
-def analyse_page_with_textract(pdf_page_bytes, page_no):
+def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
    '''
    Analyse page with AWS Textract
    '''
-
-
-
-
-
+    if client == "":
+        try:
+            client = boto3.client('textract')
+        except:
+            print("Cannot connect to AWS Textract")
+            return [], ""  # Return an empty list and an empty string
 
    print("Analysing page with AWS Textract")
 
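The new signature lets a caller inject a Textract client, falling back to creating one lazily only when none is supplied. A standalone sketch of that pattern (the Textract call shown is illustrative; the real function's request parameters may differ):

import boto3

def analyse_page(pdf_page_bytes: bytes, page_no: int, client=""):
    # Create a client only if the caller did not inject one
    if client == "":
        try:
            client = boto3.client("textract")
        except Exception:
            print("Cannot connect to AWS Textract")
            return [], ""  # empty results and empty metadata, as above
    # Illustrative call; analyze_document accepts raw bytes and feature types
    response = client.analyze_document(
        Document={"Bytes": pdf_page_bytes},
        FeatureTypes=["SIGNATURES"],
    )
    return response["Blocks"], str(page_no)

Deferring client creation from import time to call time is what allows the app to start cleanly when AWS functions are disabled.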
tools/custom_image_analyser_engine.py
CHANGED
@@ -11,7 +11,6 @@ from PIL import ImageDraw, ImageFont, Image
 from typing import Optional, Tuple, Union
 from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
-from tools.aws_functions import comprehend_client
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
@@ -464,7 +463,8 @@ class CustomImageAnalyzerEngine:
        line_level_ocr_results: List[OCRResult],
        ocr_results_with_children: Dict[str, Dict],
        chosen_redact_comprehend_entities:List[str],
-        pii_identification_method:str="Local",
+        pii_identification_method:str="Local",
+        comprehend_client="",
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:
        # Define English as default language, if not specified
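This swaps a module-level comprehend_client import for a parameter with a sentinel default: dependency injection, so the analyser no longer requires AWS at import time. A distilled sketch of the move, with illustrative class and method names (detect_pii_entities is the real boto3 Comprehend operation):

class AnalyzerEngine:
    def analyze_text(self, text: str, pii_identification_method: str = "Local", comprehend_client=""):
        if pii_identification_method == "AWS Comprehend":
            # The caller must now supply the client instead of relying on a global
            if comprehend_client == "":
                raise ValueError("An AWS Comprehend client must be supplied for this method")
            return comprehend_client.detect_pii_entities(Text=text, LanguageCode="en")
        return []  # local detection path elided in this sketch

# Usage: the caller builds the client once and passes it down.
# import boto3
# engine = AnalyzerEngine()
# engine.analyze_text("...", "AWS Comprehend", boto3.client("comprehend", region_name="eu-west-2"))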
tools/file_conversion.py
CHANGED
@@ -1,9 +1,10 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end, output_folder,
+from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 import os
+import re
 import gradio as gr
 import time
 import json
@@ -96,8 +97,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi
 
     return images
 
-
-# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
+# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()
@@ -127,11 +127,15 @@ def get_input_file_names(file_input):
     '''
 
     all_relevant_files = []
+    file_name_with_extension = ""
+    full_file_name = ""
 
     #print("file_input:", file_input)
 
     if isinstance(file_input, str):
         file_input_list = [file_input]
+    else:
+        file_input_list = file_input
 
     for file in file_input_list:
         if isinstance(file, str):
@@ -141,21 +145,19 @@ def get_input_file_names(file_input):
 
             file_path_without_ext = get_file_path_end(file_path)
 
-            #print("file:", file_path)
-
             file_extension = os.path.splitext(file_path)[1].lower()
 
-            file_name_with_extension = file_path_without_ext + file_extension
-
             # Check if the file is an image type
             if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
                 all_relevant_files.append(file_path_without_ext)
+                file_name_with_extension = file_path_without_ext + file_extension
+                full_file_name = file_path
 
     all_relevant_files_str = ", ".join(all_relevant_files)
 
-
+    print("all_relevant_files_str:", all_relevant_files_str)
 
-    return all_relevant_files_str, file_name_with_extension
+    return all_relevant_files_str, file_name_with_extension, full_file_name
 
 def prepare_image_or_pdf(
     file_paths: List[str],
@@ -166,6 +168,8 @@ def prepare_image_or_pdf(
     first_loop_state: bool = False,
     number_of_pages:int = 1,
     current_loop_page_number:int=0,
+    all_annotations_object:List = [],
+    prepare_for_review:bool = False,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -182,7 +186,10 @@ def prepare_image_or_pdf(
        out_message (List[str]): List to store output messages.
        first_loop_state (bool): Flag indicating if this is the first iteration.
        number_of_pages (int): integer indicating the number of pages in the document
+        all_annotations_object(List of annotation objects): All annotations for current document
+        prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
        progress (Progress): Progress tracker for the operation.
+
 
     Returns:
        tuple[List[str], List[str]]: A tuple containing the output messages and processed file paths.
@@ -194,7 +201,8 @@ def prepare_image_or_pdf(
     if first_loop_state==True:
        print("first_loop_state is True")
        latest_file_completed = 0
-        out_message = []
+        out_message = []
+        all_annotations_object = []
     else:
        print("Now attempting file:", str(latest_file_completed))
 
@@ -222,7 +230,7 @@ def prepare_image_or_pdf(
     else:
        file_path_number = len(file_paths)
 
-    print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
+    #print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
     print("Number of file paths:", file_path_number)
     print("Latest_file_completed:", latest_file_completed)
 
@@ -235,7 +243,7 @@ def prepare_image_or_pdf(
            final_out_message = '\n'.join(out_message)
        else:
            final_out_message = out_message
-        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
@@ -245,13 +253,16 @@ def prepare_image_or_pdf(
        file_paths_list = [file_paths]
        file_paths_loop = file_paths_list
     else:
-
-
-
-
-
+        if prepare_for_review == False:
+            file_paths_list = file_paths
+            file_paths_loop = [file_paths_list[int(latest_file_completed)]]
+        else:
+            file_paths_list = file_paths
+            file_paths_loop = file_paths
+            # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
+            file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
 
-    #
+    # Loop through files to load in
     for file in file_paths_loop:
        if isinstance(file, str):
            file_path = file
@@ -259,50 +270,87 @@ def prepare_image_or_pdf(
            file_path = file.name
        file_path_without_ext = get_file_path_end(file_path)
 
-
+        if not file_path:
+            out_message = "Please select a file."
+            print(out_message)
+            return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
        file_extension = os.path.splitext(file_path)[1].lower()
 
        # Check if the file is an image type
        if file_extension in ['.jpg', '.jpeg', '.png']:
-            in_redact_method =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            in_redact_method = tesseract_ocr_option
+
+
+        # If the file name ends with redactions.json, assume it is an annotations object, overwrite the current variable
+        if file_path.endswith(".json"):
+
+            if prepare_for_review == True:
+                if isinstance(file_path, str):
+                    with open(file_path, 'r') as json_file:
+                        all_annotations_object = json.load(json_file)
+                else:
+                    # Assuming file_path is a NamedString or similar
+                    all_annotations_object = json.loads(file_path) # Use loads for string content
+
+                # Get list of page numbers
+                image_file_paths_pages = [
+                    int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
+                    for s in image_file_paths
+                    if re.search(r'_(\d+)\.png$', os.path.basename(s))
+                ]
+                image_file_paths_pages = [int(i) for i in image_file_paths_pages]
+
+
+                # If PDF pages have been converted to image files, replace the current image paths in the json to this
+                if image_file_paths:
+                    for i, annotation in enumerate(all_annotations_object):
+                        annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+
+                        # Check if the annotation page number exists in the image file paths pages
+                        if annotation_page_number in image_file_paths_pages:
+
+                            # Set the correct image page directly since we know it's in the list
+                            correct_image_page = annotation_page_number
+                            annotation["image"] = image_file_paths[correct_image_page]
+                        else:
+                            print("Page not found.")
+
+                #print("all_annotations_object:", all_annotations_object)
+
+                # Write the response to a JSON file in output folder
+                out_folder = output_folder + file_path_without_ext + file_extension
+                with open(out_folder, 'w') as json_file:
+                    json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                continue
+
+            else:
+                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
+                json_contents = json.load(file_path)
+                # Write the response to a JSON file in output folder
+                out_folder = output_folder + file_path_without_ext + file_extension
+                with open(out_folder, 'w') as json_file:
+                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                continue
+
+        # Convert pdf/image file to correct format for redaction
+        if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
            if is_pdf_or_image(file_path) == False:
                out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                print(out_message)
-                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
            converted_file_path = process_file(file_path)
            image_file_path = converted_file_path
-            #print("Out file path at image conversion step:", converted_file_path)
 
-        elif in_redact_method ==
+        elif in_redact_method == text_ocr_option:
            if is_pdf(file_path) == False:
                out_message = "Please upload a PDF file for text analysis."
                print(out_message)
-                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
            converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
            image_file_path = process_file(file_path)
-
 
        converted_file_paths.append(converted_file_path)
        image_file_paths.extend(image_file_path)
@@ -310,7 +358,7 @@ def prepare_image_or_pdf(
        # If a pdf, load as a pymupdf document
        if is_pdf(file_path):
            pymupdf_doc = pymupdf.open(file_path)
-
+
        elif is_pdf_or_image(file_path): # Alternatively, if it's an image
            # Convert image to a pymupdf document
            pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -318,9 +366,7 @@ def prepare_image_or_pdf(
            rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
            page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
            page.insert_image(rect, filename=file_path) # Insert the image into the page
-
-            #pymupdf_doc.save(output_path) # Uncomment and specify output_path if needed
-            #pymupdf_doc.close() # Close the PDF document
+
 
        toc = time.perf_counter()
        out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -332,9 +378,8 @@ def prepare_image_or_pdf(
 
     number_of_pages = len(image_file_paths)
 
-
-
-    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
+
+    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_path_end(in_file_path)
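Two small techniques in the review-upload path above are worth seeing in isolation. First, the sorted() key exploits the fact that False orders before True, so '.pdf' files come first, '.json' second, everything else last; this guarantees the PDF is loaded (and page images exist) before the redactions JSON is re-linked to them. Second, the page number is recovered from image file names ending "_<page>.png". A self-contained sketch with made-up file names:

import os
import re

# 1) PDFs first, then JSON files, then the rest
files = ["doc_redactions.json", "notes.txt", "doc.pdf"]
ordered = sorted(files, key=lambda x: (os.path.splitext(x)[1] != '.pdf',
                                       os.path.splitext(x)[1] != '.json'))
print(ordered)  # ['doc.pdf', 'doc_redactions.json', 'notes.txt']

# 2) Extract trailing page numbers from converted page-image names
image_paths = ["out/doc_0.png", "out/doc_1.png", "out/readme.md"]
pages = [int(m.group(1))
         for p in image_paths
         if (m := re.search(r'_(\d+)\.png$', os.path.basename(p)))]
print(pages)  # [0, 1]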
tools/file_redaction.py
CHANGED
@@ -8,7 +8,6 @@ import boto3
|
|
8 |
from tqdm import tqdm
|
9 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
10 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
11 |
-
|
12 |
from typing import List, Dict, Tuple
|
13 |
import pandas as pd
|
14 |
|
@@ -19,32 +18,27 @@ from pikepdf import Pdf, Dictionary, Name
|
|
19 |
import pymupdf
|
20 |
from pymupdf import Rect
|
21 |
from fitz import Document, Page
|
22 |
-
|
23 |
import gradio as gr
|
24 |
from gradio import Progress
|
25 |
from collections import defaultdict # For efficient grouping
|
26 |
|
27 |
from presidio_analyzer import RecognizerResult
|
28 |
-
|
29 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
30 |
from tools.file_conversion import process_file, image_dpi
|
31 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
|
32 |
-
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
|
33 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
34 |
-
# from tools.data_anonymise import generate_decision_process_output
|
35 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
36 |
-
from tools.aws_functions import comprehend_client
|
37 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
38 |
|
39 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
40 |
-
|
41 |
page_break_value = get_or_create_env_var('page_break_value', '500')
|
42 |
print(f'The value of page_break_value is {page_break_value}')
|
43 |
|
44 |
max_time_value = get_or_create_env_var('max_time_value', '105')
|
45 |
print(f'The value of max_time_value is {max_time_value}')
|
46 |
|
47 |
-
|
48 |
def sum_numbers_before_seconds(string:str):
|
49 |
"""Extracts numbers that precede the word 'seconds' from a string and adds them up.
|
50 |
|
@@ -192,8 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
192 |
else:
|
193 |
in_allow_list_flat = []
|
194 |
|
195 |
-
progress(0.5, desc="Redacting file")
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
if isinstance(file_paths, str):
|
199 |
file_paths_list = [file_paths]
|
@@ -217,28 +236,21 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
217 |
if is_a_pdf == False:
|
218 |
# If user has not submitted a pdf, assume it's an image
|
219 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
220 |
-
in_redact_method =
|
221 |
else:
|
222 |
out_message = "No file selected"
|
223 |
print(out_message)
|
224 |
|
225 |
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
226 |
|
227 |
-
if in_redact_method ==
|
228 |
|
229 |
-
|
230 |
-
# Try accessing Textract through boto3
|
231 |
-
try:
|
232 |
-
boto3.client('textract')
|
233 |
-
except:
|
234 |
-
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
235 |
-
print(out_message)
|
236 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
237 |
|
238 |
#Analyse and redact image-based pdf or image
|
239 |
if is_pdf_or_image(file_path) == False:
|
240 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
241 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, comprehend_query_number
|
242 |
|
243 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
244 |
|
@@ -262,14 +274,16 @@ def choose_and_run_redactor(file_paths:List[str],
                 all_decision_process_table,
                 pymupdf_doc,
                 pii_identification_method,
-                comprehend_query_number
+                comprehend_query_number,
+                comprehend_client,
+                textract_client)
 
         # Save Textract request metadata (if exists)
         if new_request_metadata:
             print("Request metadata:", new_request_metadata)
             all_request_metadata.append(new_request_metadata)
 
-    elif in_redact_method ==
+    elif in_redact_method == text_ocr_option:
 
         logging_file_paths = ""
@@ -287,7 +301,7 @@ def choose_and_run_redactor(file_paths:List[str],
                 in_allow_list_flat,
                 page_min,
                 page_max,
-
+                text_ocr_option,
                 current_loop_page,
                 page_break_return,
                 annotations_all_pages,
@@ -295,7 +309,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 all_decision_process_table,
                 pymupdf_doc,
                 pii_identification_method,
-                comprehend_query_number
+                comprehend_query_number,
+                comprehend_client)
 
     else:
         out_message = "No redaction method selected"
@@ -328,14 +343,21 @@ def choose_and_run_redactor(file_paths:List[str],
 
         logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
         all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
-        #log_files_output_paths.append(logs_output_file_name)
         out_file_paths.append(logs_output_file_name)
 
         all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
         all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
-        #log_files_output_paths.append(all_text_output_file_name)
        out_file_paths.append(all_text_output_file_name)
 
+        # Save the gradio_annotation_boxes to a JSON file
+        try:
+            out_annotation_file_path = out_image_file_path + '_redactions.json'
+            with open(out_annotation_file_path, 'w') as f:
+                json.dump(annotations_all_pages, f)
+            out_file_paths.append(out_annotation_file_path)
+        except:
+            print("Could not save annotations to json file.")
+
         # Make a combined message for the file
         if isinstance(out_message, list):
             combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
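This JSON dump is the save half of the new "continue review later" workflow: reloading the *_redactions.json restores the same per-page structure. A sketch of the round trip, assuming a hypothetical loader name:

import json

def save_review_state(annotations_all_pages, out_path):
    # One dict per page: the rendered page image plus its redaction boxes.
    with open(out_path, 'w') as f:
        json.dump(annotations_all_pages, f)

def load_review_state(json_path):
    # Hypothetical loader: re-reading the file restores the same structure,
    # so a review can be resumed in a later session.
    with open(json_path) as f:
        return json.load(f)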
@@ -351,38 +373,6 @@ def choose_and_run_redactor(file_paths:List[str],
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
 
-        #out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
-        #combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
-
-        # Increase latest file completed count unless we are at the last file
-        # if latest_file_completed != len(file_paths):
-        #     print("Completed file number:", str(latest_file_completed), "more files to do")
-
-        # if current_loop_page >= number_of_pages:
-
-        #     print("Current page loop", current_loop_page, "is greater than or equal to number of pages:", number_of_pages)
-        #     latest_file_completed += 1
-
-        #     # Set to 999 to be a big number not to interrupt processing of large files by user
-        #     current_loop_page = 999
-
-        #     out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
-        #     pymupdf_doc.save(out_text_file_path)
-        #     out_file_paths.append(out_text_file_path)
-
-        #     # Write logs to file
-        #     decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
-        #     all_decision_process_table.to_csv(decision_logs_output_file_name)
-        #     log_files_output_paths.append(decision_logs_output_file_name)
-
-        #     all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
-        #     all_line_level_ocr_results_df.to_csv(all_text_output_file_name)
-        #     log_files_output_paths.append(all_text_output_file_name)
-
-        #     out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
-
-        #     if isinstance(out_message, list):
-        #         out_message.append(out_message_new) # Ensure out_message is a list of strings
     else:
         toc = time.perf_counter()
         time_taken = toc - tic
@@ -501,27 +491,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 
     return x1, new_y1, x2, new_y2
 
-# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
-#     '''
-#     Converts coordinates from pymupdf format to image coordinates.
-#     '''
-
-#     rect_height = pymupdf_page.rect.height
-#     rect_width = pymupdf_page.rect.width
-
-#     image_page_width, image_page_height = image.size
-
-#     # Calculate scaling factors between pymupdf and PIL image
-#     scale_width = image_page_width / rect_width
-#     scale_height = image_page_height / rect_height
-
-#     x1_image = x1 * scale_width
-#     y1_image = ((rect_height - y2) * scale_height)
-#     x2_image = x2 * scale_width
-#     y2_image = ((rect_height - y1) * scale_height)
-
-#     return x1_image, y1_image, x2_image, y2_image
-
 def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
     '''
     Converts coordinates from pymupdf format to image coordinates,
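The commented-out variant removed above still documents the arithmetic: scale by the image-to-page size ratio and flip the y-axis, since PDF coordinates run bottom-up while image coordinates run top-down. As a standalone sketch:

def pdf_to_image_coords(rect_width, rect_height, image_width, image_height, x1, y1, x2, y2):
    # Scaling factors between the PyMuPDF page rect and the PIL image
    scale_width = image_width / rect_width
    scale_height = image_height / rect_height
    # y is flipped: the PDF origin is bottom-left, the image origin top-left
    x1_image = x1 * scale_width
    y1_image = (rect_height - y2) * scale_height
    x2_image = x2 * scale_width
    y2_image = (rect_height - y1) * scale_height
    return x1_image, y1_image, x2_image, y2_image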
@@ -625,10 +594,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
         # Should already be in correct format if img_annotator_box is an input
         if isinstance(annot, dict):
             img_annotation_box = annot
-            #try:
-            #    img_annotation_box["label"] = annot["label"]
-            #except:
-            #    img_annotation_box["label"] = "Redaction"
 
             x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
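For orientation, the annot dicts handled here are per-page gradio_image_annotation entries: an "image" plus a "boxes" list. The exact box fields below are an assumption based on the component, not confirmed by this diff:

page_annotation = {
    "image": "output/example_doc_page_1.png",  # hypothetical path
    "boxes": [
        # Hypothetical field names; "label" is the only one this diff uses
        {"label": "Redaction", "xmin": 102, "ymin": 58, "xmax": 240, "ymax": 75},
    ],
}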
@@ -823,7 +788,7 @@ def redact_image_pdf(file_path:str,
                     is_a_pdf:bool=True,
                     page_min:int=0,
                     page_max:int=999,
-                    analysis_type:str=
+                    analysis_type:str=tesseract_ocr_option,
                     handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
                     request_metadata:str="", current_loop_page:int=0,
                     page_break_return:bool=False,
@@ -834,6 +799,8 @@ def redact_image_pdf(file_path:str,
                     pymupdf_doc = [],
                     pii_identification_method:str="Local",
                     comprehend_query_number:int=0,
+                    comprehend_client="",
+                    textract_client="",
                     page_break_val:int=int(page_break_value),
                     logging_file_paths:List=[],
                     max_time:int=int(max_time_value),
@@ -851,7 +818,7 @@ def redact_image_pdf(file_path:str,
     - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
     - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
     - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
-    - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to
+    - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
     - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
     - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
     - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
@@ -862,6 +829,8 @@ def redact_image_pdf(file_path:str,
     - pymupdf_doc (List, optional): The document as a PyMupdf object.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
+    - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
     - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -874,7 +843,15 @@ def redact_image_pdf(file_path:str,
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
     comprehend_query_number_new = 0
 
-
+    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        print("Connection to AWS Comprehend service unsuccessful.")
+
+        return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
+
+    if analysis_type == textract_option and textract_client == "":
+        print("Connection to AWS Textract service unsuccessful.")
+
+        return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
     tic = time.perf_counter()
@@ -897,8 +874,8 @@ def redact_image_pdf(file_path:str,
     print("Page range:", str(page_min + 1), "to", str(page_max))
     #print("Current_loop_page:", current_loop_page)
 
-    if analysis_type ==
-    elif analysis_type ==
+    if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
+    elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
 
     if current_loop_page == 0: page_loop_start = 0
     else: page_loop_start = current_loop_page
@@ -942,7 +919,7 @@ def redact_image_pdf(file_path:str,
         else: ocr_lang = language
 
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
-        if analysis_type ==
+        if analysis_type == tesseract_ocr_option:
 
             word_level_ocr_results = image_analyser.perform_ocr(image)
@@ -951,7 +928,7 @@ def redact_image_pdf(file_path:str,
 
         # Import results from json and convert
-        if analysis_type ==
+        if analysis_type == textract_option:
 
             # Convert the image to bytes using an in-memory buffer
             image_buffer = io.BytesIO()
@@ -962,7 +939,7 @@ def redact_image_pdf(file_path:str,
             json_file_path = output_folder + file_name + "_textract.json"
 
             if not os.path.exists(json_file_path):
-                text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
+                text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client) # Analyse page with Textract
                 logging_file_paths.append(json_file_path)
                 request_metadata = request_metadata + "\n" + new_request_metadata
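The surrounding control flow means Textract is only called when no cached *_textract.json exists for the document. A condensed sketch of that cache-or-call step (a simplification of the module's real logic; the wrapper name below is hypothetical):

import json
import os
from tools.aws_textract import analyse_page_with_textract

def textract_blocks_for_page(json_file_path, page_bytes, page_no, textract_client):
    # Call Textract only when no cached JSON exists for this document;
    # otherwise reuse the saved response and skip the paid API call.
    if not os.path.exists(json_file_path):
        return analyse_page_with_textract(page_bytes, page_no, textract_client)
    with open(json_file_path) as f:
        return json.load(f), ""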
@@ -1010,7 +987,8 @@ def redact_image_pdf(file_path:str,
                 line_level_ocr_results,
                 line_level_ocr_results_with_children,
                 chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
-                pii_identification_method = pii_identification_method,
+                pii_identification_method = pii_identification_method,
+                comprehend_client=comprehend_client,
                 language=language,
                 entities=chosen_redact_entities,
                 allow_list=allow_list,
@@ -1018,21 +996,13 @@ def redact_image_pdf(file_path:str,
             )
 
             comprehend_query_number = comprehend_query_number + comprehend_query_number_new
-
-            # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
-            #                                                 line_level_ocr_results_with_children,
-            #                                                 language,
-            #                                                 chosen_redact_entities,
-            #                                                 allow_list,
-            #                                                 score_threshold,
-            #                                                 pii_identification_method)
 
         else:
             redaction_bboxes = []
 
-        if analysis_type ==
-        elif analysis_type ==
+        if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+        elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 
         # Save decision making process
         bboxes_str = str(redaction_bboxes)
@@ -1409,7 +1379,7 @@ def redact_text_pdf(
     allow_list: List[str] = None, # Optional list of allowed entities
     page_min: int = 0, # Minimum page number to start redaction
     page_max: int = 999, # Maximum page number to end redaction
-    analysis_type: str =
+    analysis_type: str = text_ocr_option, # Type of analysis to perform
     current_loop_page: int = 0, # Current page being processed in the loop
     page_break_return: bool = False, # Flag to indicate if a page break should be returned
     annotations_all_pages: List = [], # List of annotations across all pages
@@ -1418,6 +1388,7 @@ def redact_text_pdf(
     pymupdf_doc: List = [], # List of PyMuPDF documents
     pii_identification_method: str = "Local",
     comprehend_query_number:int = 0,
+    comprehend_client="",
     page_break_val: int = int(page_break_value), # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1443,12 +1414,18 @@ def redact_text_pdf(
     - all_decision_process_table: DataFrame for decision process table
     - pymupdf_doc: List of PyMuPDF documents
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
-    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
     '''
 
+    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
+        print("Connection to AWS Comprehend service not found.")
+
+        return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
+
     tic = time.perf_counter()
 
     # Open with Pikepdf to get text lines
@@ -1500,7 +1477,7 @@ def redact_text_pdf(
     decision_process_table_on_page = pd.DataFrame()
     page_text_outputs = pd.DataFrame()
 
-    if analysis_type ==
+    if analysis_type == text_ocr_option:
         for n, text_container in enumerate(page_layout):
 
             text_container_analyser_results = []
tools/helper_functions.py
CHANGED
@@ -29,6 +29,16 @@ def get_or_create_env_var(var_name, default_value):
 
     return value
 
+
+# Names for options labels
+text_ocr_option = "Simple text analysis - docs with selectable text"
+tesseract_ocr_option = "OCR analysis for documents without selectable text - best for typed text"
+textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
+
+local_pii_detector = "Local"
+aws_pii_detector = "AWS Comprehend"
+
+
 # Retrieving or setting output folder
 env_var_name = 'GRADIO_OUTPUT_FOLDER'
 default_value = 'output/'
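With the labels centralised here, other modules compare against one shared definition instead of repeating the strings, as the file_redaction.py hunks above now do:

from tools.helper_functions import tesseract_ocr_option, textract_option, text_ocr_option

# Same comparison pattern as choose_and_run_redactor
in_redact_method = tesseract_ocr_option
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
    print("Using the image-based redaction path")
elif in_redact_method == text_ocr_option:
    print("Using the selectable-text redaction path")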
tools/redaction_review.py
CHANGED
@@ -47,24 +47,32 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
 
     return current_zoom_level, annotate_current_page
 
-
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
-
+    '''
+    Update a gradio_image_annotation object with new annotation data
+    '''
 
     zoom_str = str(zoom) + '%'
 
     if not image_annotator_object:
-
+        out_image_annotator = image_annotator(
             label="Modify redaction boxes",
             #label_list=["Redaction"],
             #label_colors=[(0, 0, 0)],
+            height=zoom_str,
+            width=zoom_str,
             show_label=False,
-            sources=
+            sources=None,
             show_clear_button=False,
             show_share_button=False,
             show_remove_button=False,
-            interactive=False
-
+            interactive=False)
+
+        number_reported = gr.Number(label = "Page (press enter to change)", value=1, precision=0)
+
+        return out_image_annotator, number_reported, number_reported
+
+    print("page_num at start of update_annotator function:", page_num)
 
     if page_num is None:
         page_num = 0
@@ -72,8 +80,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
 
     # Check bounding values for current page and page max
     if page_num > 0:
         page_num_reported = page_num
-
+
     elif page_num == 0: page_num_reported = 1
+
     else:
         page_num = 0
         page_num_reported = 1
@@ -83,7 +92,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
 
     if page_num_reported > page_max_reported:
         page_num_reported = page_max_reported
 
-
+    out_image_annotator = image_annotator(
+        value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         box_thickness=1,
         #label_list=["Redaction"],
@@ -104,30 +115,26 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
 
     number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
 
-    return out_image_annotator, number_reported, number_reported
+    return out_image_annotator, number_reported, number_reported, page_num_reported
 
-def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
+def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
     '''
     Overwrite current image annotations with modifications
     '''
-
     if not current_page:
         current_page = 1
 
     #If no previous page or is 0, i.e. first time run, then rewrite current page
-    if not previous_page:
-
-        #return all_image_annotations, current_page, current_page
-
-    #print("all_image_annotations before:",all_image_annotations)
+    #if not previous_page:
+    #    previous_page = current_page
 
     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
-
-
-
-    #print("all_image_annotations after:",all_image_annotations)
+    if clear_all == False:
+        all_image_annotations[previous_page - 1] = image_annotated
+    else:
+        all_image_annotations[previous_page - 1]["boxes"] = []
 
     return all_image_annotations, current_page, current_page
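The new clear_all flag gives callers a one-call way to wipe a page's boxes (presumably wired to a clear-redactions control in app.py); in isolation the branch behaves like:

def apply_page_edits(all_image_annotations, previous_page, image_annotated, clear_all=False):
    # Mirror of the branch added above: either store the edited page back,
    # or empty that page's box list while keeping its image.
    if clear_all == False:
        all_image_annotations[previous_page - 1] = image_annotated
    else:
        all_image_annotations[previous_page - 1]["boxes"] = []
    return all_image_annotations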
@@ -178,7 +185,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
 
             draw.rectangle(coords, fill=fill)
 
-        image.save(output_folder + file_base + "
+        image.save(output_folder + file_base + "_redacted.png")
 
         doc = [image]
@@ -213,13 +220,13 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
 
             pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
 
         #try:
-        out_pdf_file_path = output_folder + file_base + "
+        out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
         unredacted_doc.save(out_pdf_file_path)
         output_files.append(out_pdf_file_path)
 
         # Save the gradio_annotation_boxes to a JSON file
         try:
-            out_annotation_file_path = output_folder + file_base + '
+            out_annotation_file_path = output_folder + file_base + '_redactions.json'
             with open(out_annotation_file_path, 'w') as f:
                 json.dump(all_image_annotations, f)
             output_files.append(out_annotation_file_path)