Commit
·
4276db1
1
Parent(s):
b805ec6
Added a workaround for an issue with SelectData on filtered dataframes. Rearranged some components.
Browse files- app.py +64 -45
- tools/config.py +1 -1
- tools/file_conversion.py +21 -13
- tools/file_redaction.py +17 -13
- tools/helper_functions.py +7 -1
- tools/redaction_review.py +17 -14
app.py
CHANGED
@@ -4,11 +4,11 @@ import gradio as gr
|
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
|
6 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
|
7 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken
|
8 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3
|
9 |
from tools.file_redaction import choose_and_run_redactor
|
10 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
|
11 |
-
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df
|
12 |
from tools.data_anonymise import anonymise_data_files
|
13 |
from tools.auth import authenticate_user
|
14 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
@@ -60,7 +60,7 @@ with app:
|
|
60 |
pdf_doc_state = gr.State([])
|
61 |
all_image_annotations_state = gr.State([])
|
62 |
|
63 |
-
|
64 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
65 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
|
66 |
|
@@ -110,6 +110,7 @@ with app:
|
|
110 |
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
111 |
doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
|
112 |
latest_review_file_path = gr.Textbox(label = "latest_review_file_path", value="", visible=False) # Latest review file path output from redaction
|
|
|
113 |
|
114 |
data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
|
115 |
data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
|
@@ -152,8 +153,10 @@ with app:
|
|
152 |
default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
|
153 |
enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
|
154 |
|
155 |
-
# Base
|
156 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
|
|
|
|
|
157 |
|
158 |
# Duplicate page detection
|
159 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
@@ -198,11 +201,11 @@ with app:
|
|
198 |
|
199 |
text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
200 |
|
201 |
-
with gr.Row(equal_height=True):
|
202 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
|
203 |
|
204 |
with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
|
205 |
-
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
|
206 |
|
207 |
if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
|
208 |
with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
|
@@ -216,22 +219,23 @@ with app:
|
|
216 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
|
217 |
|
218 |
if SHOW_COSTS == "True":
|
219 |
-
with gr.Accordion("Estimated costs and time taken", open =
|
220 |
with gr.Row(equal_height=True):
|
221 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
222 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
|
223 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
|
224 |
-
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
|
225 |
-
|
226 |
-
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
227 |
|
228 |
-
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
229 |
-
|
230 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
231 |
with gr.Accordion("Apply cost code", open = True, visible=True):
|
232 |
-
with gr.Row(
|
233 |
-
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="
|
234 |
-
|
|
|
|
|
|
|
|
|
|
|
235 |
|
236 |
with gr.Row():
|
237 |
output_summary = gr.Textbox(label="Output summary", scale=1)
|
@@ -253,7 +257,7 @@ with app:
|
|
253 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
254 |
|
255 |
with gr.Accordion(label = "Review PDF redactions", open=True):
|
256 |
-
output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
|
257 |
upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="secondary")
|
258 |
with gr.Row():
|
259 |
annotate_zoom_in = gr.Button("Zoom in", visible=False)
|
@@ -269,7 +273,8 @@ with app:
|
|
269 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
270 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
271 |
with gr.Column(scale=1):
|
272 |
-
annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
|
|
|
273 |
|
274 |
with gr.Row():
|
275 |
with gr.Column(scale=2):
|
@@ -293,21 +298,27 @@ with app:
|
|
293 |
interactive=False
|
294 |
)
|
295 |
with gr.Column(scale=1):
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
|
|
|
|
301 |
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
|
|
|
|
307 |
|
308 |
-
|
309 |
-
|
310 |
-
|
|
|
|
|
311 |
|
312 |
with gr.Row():
|
313 |
with gr.Column(scale=2):
|
@@ -317,7 +328,7 @@ with app:
|
|
317 |
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
318 |
annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
|
319 |
with gr.Column(scale=1):
|
320 |
-
blank_markdown_bot = gr.Markdown(value="", label="")
|
321 |
|
322 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
323 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
@@ -447,30 +458,34 @@ with app:
|
|
447 |
# Allow user to select items from cost code dataframe for cost code
|
448 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
449 |
cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
|
|
|
450 |
|
451 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
452 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
|
453 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
454 |
|
455 |
# Run redaction function
|
456 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state,
|
457 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
|
458 |
-
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state,
|
459 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return,
|
460 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
461 |
|
462 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
463 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state,
|
464 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return,
|
465 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
466 |
|
467 |
# If a file has been completed, the function will continue onto the next document
|
468 |
-
latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state,
|
469 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return,
|
470 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
471 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
472 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
473 |
|
|
|
|
|
|
|
474 |
###
|
475 |
# REVIEW PDF REDACTIONS
|
476 |
###
|
@@ -478,7 +493,7 @@ with app:
|
|
478 |
# Upload previous files for modifying redactions
|
479 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
480 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
481 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state], api_name="prepare_doc").\
|
482 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
483 |
|
484 |
# Page number controls
|
@@ -531,14 +546,18 @@ with app:
|
|
531 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
532 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
533 |
|
|
|
|
|
|
|
|
|
534 |
# Convert review file to xfdf Adobe format
|
535 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
536 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
|
537 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
538 |
|
539 |
# Convert xfdf Adobe file back to review_file.csv
|
540 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
541 |
-
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
|
542 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
543 |
|
544 |
###
|
@@ -599,11 +618,11 @@ with app:
|
|
599 |
if GET_COST_CODES == "True" and COST_CODES_PATH:
|
600 |
if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
|
601 |
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
|
602 |
-
success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
|
603 |
print("Successfully loaded cost codes from S3")
|
604 |
elif os.path.exists(COST_CODES_PATH):
|
605 |
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
|
606 |
-
app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
|
607 |
else: print("Could not load in cost code data")
|
608 |
|
609 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
@@ -650,7 +669,7 @@ if __name__ == "__main__":
|
|
650 |
|
651 |
main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
|
652 |
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
|
653 |
-
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [],
|
654 |
|
655 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
656 |
# with gr.Tab(label="Advanced options"):
|
|
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
|
6 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
|
7 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
|
8 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3
|
9 |
from tools.file_redaction import choose_and_run_redactor
|
10 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
|
11 |
+
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
|
12 |
from tools.data_anonymise import anonymise_data_files
|
13 |
from tools.auth import authenticate_user
|
14 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
|
|
60 |
pdf_doc_state = gr.State([])
|
61 |
all_image_annotations_state = gr.State([])
|
62 |
|
63 |
+
|
64 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
65 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
|
66 |
|
|
|
110 |
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
111 |
doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
|
112 |
latest_review_file_path = gr.Textbox(label = "latest_review_file_path", value="", visible=False) # Latest review file path output from redaction
|
113 |
+
latest_ocr_file_path = gr.Textbox(label = "latest_ocr_file_path", value="", visible=False) # Latest ocr file path output from text extraction
|
114 |
|
115 |
data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
|
116 |
# Hidden textbox for the data file name without its extension. The label now
# matches the variable name (it previously duplicated the label of
# data_full_file_name_textbox), in line with the other hidden state textboxes.
data_file_name_no_extension_textbox = gr.Textbox(label = "data_file_name_no_extension_textbox", value="", visible=False)
|
|
|
153 |
default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
|
154 |
enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
|
155 |
|
156 |
+
# Base tables that are not modified subsequent to load
|
157 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
|
158 |
+
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
|
159 |
+
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
|
160 |
|
161 |
# Duplicate page detection
|
162 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
|
|
201 |
|
202 |
text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
203 |
|
204 |
+
with gr.Row(equal_height=True):
|
205 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
|
206 |
|
207 |
with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
|
208 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
|
209 |
|
210 |
if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
|
211 |
with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
|
|
|
219 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
|
220 |
|
221 |
if SHOW_COSTS == "True":
|
222 |
+
with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
|
223 |
with gr.Row(equal_height=True):
|
224 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
225 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
|
226 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
|
227 |
+
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
|
|
|
|
|
228 |
|
|
|
|
|
229 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
230 |
with gr.Accordion("Apply cost code", open = True, visible=True):
|
231 |
+
with gr.Row():
|
232 |
+
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
|
233 |
+
with gr.Column():
|
234 |
+
# Button shown beside the cost code table; its caption describes resetting the
# table's filter. Fixed the caption typo ("code code" -> "cost code").
# NOTE(review): the click handler is not visible in this view - presumably it
# restores cost_code_dataframe from cost_code_dataframe_base; confirm.
reset_cost_code_dataframe_button = gr.Button(value="Reset cost code table filter")
|
235 |
+
cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
|
236 |
+
|
237 |
+
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
238 |
+
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
239 |
|
240 |
with gr.Row():
|
241 |
output_summary = gr.Textbox(label="Output summary", scale=1)
|
|
|
257 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
258 |
|
259 |
with gr.Accordion(label = "Review PDF redactions", open=True):
|
260 |
+
output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=file_input_height)
|
261 |
upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="secondary")
|
262 |
with gr.Row():
|
263 |
annotate_zoom_in = gr.Button("Zoom in", visible=False)
|
|
|
273 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
274 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
275 |
with gr.Column(scale=1):
|
276 |
+
annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
|
277 |
+
|
278 |
|
279 |
with gr.Row():
|
280 |
with gr.Column(scale=2):
|
|
|
298 |
interactive=False
|
299 |
)
|
300 |
with gr.Column(scale=1):
|
301 |
+
update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
|
302 |
+
with gr.Accordion("Search suggested redactions", open=True):
|
303 |
+
with gr.Row(equal_height=True):
|
304 |
+
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
305 |
+
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
306 |
+
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
307 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
|
308 |
|
309 |
+
with gr.Row(equal_height=True):
|
310 |
+
exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
|
311 |
+
exclude_selected_btn = gr.Button(value="Exclude all items in table from redactions")
|
312 |
+
with gr.Row(equal_height=True):
|
313 |
+
reset_dropdowns_btn = gr.Button(value="Reset filters")
|
314 |
+
|
315 |
+
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
316 |
|
317 |
+
selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
|
318 |
+
|
319 |
+
with gr.Accordion("Search all extracted text", open=True):
|
320 |
+
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
321 |
+
reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
|
322 |
|
323 |
with gr.Row():
|
324 |
with gr.Column(scale=2):
|
|
|
328 |
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
329 |
annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
|
330 |
with gr.Column(scale=1):
|
331 |
+
blank_markdown_bot = gr.Markdown(value="", label="")
|
332 |
|
333 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
334 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
|
|
458 |
# Allow user to select items from cost code dataframe for cost code
|
459 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
460 |
cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
|
461 |
+
reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
462 |
|
463 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
464 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
|
465 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
466 |
|
467 |
# Run redaction function
|
468 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
|
469 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
|
470 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
471 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
|
472 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
473 |
|
474 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
475 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
476 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
|
477 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
478 |
|
479 |
# If a file has been completed, the function will continue onto the next document
|
480 |
+
latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
481 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
|
482 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
483 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
484 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
485 |
|
486 |
+
# If the line-level OCR results are changed, either by the user loading in a file or by a new redaction task, replace the OCR results displayed in the table
|
487 |
+
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
488 |
+
|
489 |
###
|
490 |
# REVIEW PDF REDACTIONS
|
491 |
###
|
|
|
493 |
# Upload previous files for modifying redactions
|
494 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
495 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
496 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
|
497 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
498 |
|
499 |
# Page number controls
|
|
|
546 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
547 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
548 |
|
549 |
+
# Review OCR text button
|
550 |
+
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
|
551 |
+
reset_all_ocr_results_btn.click(reset_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
552 |
+
|
553 |
# Convert review file to xfdf Adobe format
|
554 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
555 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
|
556 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
557 |
|
558 |
# Convert xfdf Adobe file back to review_file.csv
|
559 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
560 |
+
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
|
561 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
562 |
|
563 |
###
|
|
|
618 |
if GET_COST_CODES == "True" and COST_CODES_PATH:
|
619 |
if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
|
620 |
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
|
621 |
+
success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
622 |
print("Successfully loaded cost codes from S3")
|
623 |
elif os.path.exists(COST_CODES_PATH):
|
624 |
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
|
625 |
+
app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
626 |
else: print("Could not load in cost code data")
|
627 |
|
628 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
|
|
669 |
|
670 |
main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
|
671 |
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
|
672 |
+
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
|
673 |
|
674 |
# AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
|
675 |
# with gr.Tab(label="Advanced options"):
|
tools/config.py
CHANGED
@@ -161,7 +161,7 @@ COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CE
|
|
161 |
|
162 |
S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
|
163 |
|
164 |
-
ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed,
|
165 |
|
166 |
if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
|
167 |
if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
|
|
|
161 |
|
162 |
S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
|
163 |
|
164 |
+
ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
|
165 |
|
166 |
if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
|
167 |
if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
|
tools/file_conversion.py
CHANGED
@@ -251,7 +251,7 @@ def get_input_file_names(file_input:List[str]):
|
|
251 |
file_extension = os.path.splitext(file_path)[1].lower()
|
252 |
|
253 |
# Check if the file is in acceptable types
|
254 |
-
if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
|
255 |
all_relevant_files.append(file_path_without_ext)
|
256 |
file_name_with_extension = file_path_without_ext + file_extension
|
257 |
full_file_name = file_path
|
@@ -480,6 +480,7 @@ def prepare_image_or_pdf(
|
|
480 |
pymupdf_doc = []
|
481 |
all_img_details = []
|
482 |
review_file_csv = pd.DataFrame()
|
|
|
483 |
|
484 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
485 |
if not in_fully_redacted_list.empty:
|
@@ -512,7 +513,7 @@ def prepare_image_or_pdf(
|
|
512 |
final_out_message = '\n'.join(out_message)
|
513 |
else:
|
514 |
final_out_message = out_message
|
515 |
-
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
|
516 |
|
517 |
progress(0.1, desc='Preparing file')
|
518 |
|
@@ -600,11 +601,17 @@ def prepare_image_or_pdf(
|
|
600 |
pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
|
601 |
|
602 |
elif file_extension in ['.csv']:
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
608 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
609 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
610 |
|
@@ -623,11 +630,10 @@ def prepare_image_or_pdf(
|
|
623 |
|
624 |
# Use shutil to copy the file directly
|
625 |
shutil.copy2(file_path, out_textract_path) # Preserves metadata
|
626 |
-
|
627 |
-
textract_output_found = True
|
628 |
-
|
629 |
continue
|
630 |
|
|
|
631 |
# If you have an annotations object from the above code
|
632 |
if all_annotations_object:
|
633 |
|
@@ -669,7 +675,6 @@ def prepare_image_or_pdf(
|
|
669 |
print("Page", annotation_page_number, "image file not found.")
|
670 |
|
671 |
all_annotations_object[i] = annotation
|
672 |
-
|
673 |
|
674 |
if isinstance(in_fully_redacted_list, list):
|
675 |
in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
|
@@ -717,6 +722,9 @@ def prepare_image_or_pdf(
|
|
717 |
else:
|
718 |
print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
|
719 |
|
|
|
|
|
|
|
720 |
# Must be something else, return with error message
|
721 |
else:
|
722 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
@@ -744,7 +752,7 @@ def prepare_image_or_pdf(
|
|
744 |
|
745 |
number_of_pages = len(image_file_paths)
|
746 |
|
747 |
-
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
|
748 |
|
749 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
|
750 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
@@ -1196,7 +1204,7 @@ def create_annotation_dicts_from_annotation_df(
|
|
1196 |
|
1197 |
# Check if the DataFrame is empty or lacks necessary columns
|
1198 |
if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
|
1199 |
-
print("Warning: Annotation DataFrame is empty or missing 'image' column.")
|
1200 |
return list(image_dict.values()) # Return based on page_sizes only
|
1201 |
|
1202 |
# 2. Define columns to extract for boxes and check availability
|
|
|
251 |
file_extension = os.path.splitext(file_path)[1].lower()
|
252 |
|
253 |
# Check if the file is in acceptable types
|
254 |
+
if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext):
|
255 |
all_relevant_files.append(file_path_without_ext)
|
256 |
file_name_with_extension = file_path_without_ext + file_extension
|
257 |
full_file_name = file_path
|
|
|
480 |
pymupdf_doc = []
|
481 |
all_img_details = []
|
482 |
review_file_csv = pd.DataFrame()
|
483 |
+
all_line_level_ocr_results_df = pd.DataFrame()
|
484 |
|
485 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
486 |
if not in_fully_redacted_list.empty:
|
|
|
513 |
final_out_message = '\n'.join(out_message)
|
514 |
else:
|
515 |
final_out_message = out_message
|
516 |
+
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
|
517 |
|
518 |
progress(0.1, desc='Preparing file')
|
519 |
|
|
|
601 |
pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
|
602 |
|
603 |
elif file_extension in ['.csv']:
|
604 |
+
if '_review_file' in file_path_without_ext:
|
605 |
+
#print("file_path:", file_path)
|
606 |
+
review_file_csv = read_file(file_path)
|
607 |
+
all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
|
608 |
+
json_from_csv = True
|
609 |
+
print("Converted CSV review file to image annotation object")
|
610 |
+
elif '_ocr_output' in file_path_without_ext:
|
611 |
+
all_line_level_ocr_results_df = read_file(file_path)
|
612 |
+
json_from_csv = False
|
613 |
+
|
614 |
+
# NEW IF STATEMENT
|
615 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
616 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
617 |
|
|
|
630 |
|
631 |
# Use shutil to copy the file directly
|
632 |
shutil.copy2(file_path, out_textract_path) # Preserves metadata
|
633 |
+
textract_output_found = True
|
|
|
|
|
634 |
continue
|
635 |
|
636 |
+
# NEW IF STATEMENT
|
637 |
# If you have an annotations object from the above code
|
638 |
if all_annotations_object:
|
639 |
|
|
|
675 |
print("Page", annotation_page_number, "image file not found.")
|
676 |
|
677 |
all_annotations_object[i] = annotation
|
|
|
678 |
|
679 |
if isinstance(in_fully_redacted_list, list):
|
680 |
in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
|
|
|
722 |
else:
|
723 |
print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
|
724 |
|
725 |
+
elif file_extension in ['.csv'] and "ocr_output" in file_path:
|
726 |
+
continue
|
727 |
+
|
728 |
# Must be something else, return with error message
|
729 |
else:
|
730 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
|
|
752 |
|
753 |
number_of_pages = len(image_file_paths)
|
754 |
|
755 |
+
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
|
756 |
|
757 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
|
758 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
|
|
1204 |
|
1205 |
# Check if the DataFrame is empty or lacks necessary columns
|
1206 |
if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
|
1207 |
+
#print("Warning: Annotation DataFrame is empty or missing 'image' column.")
|
1208 |
return list(image_dict.values()) # Return based on page_sizes only
|
1209 |
|
1210 |
# 2. Define columns to extract for boxes and check availability
|
tools/file_redaction.py
CHANGED
@@ -100,6 +100,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
100 |
review_file_path:str="",
|
101 |
input_folder:str=INPUT_FOLDER,
|
102 |
textract_query_number:int=0,
|
|
|
103 |
prepare_images:bool=True,
|
104 |
progress=gr.Progress(track_tqdm=True)):
|
105 |
'''
|
@@ -148,6 +149,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
148 |
- review_file_path (str, optional): The latest review file path created by the app
|
149 |
- input_folder (str, optional): The custom input path, if provided
|
150 |
- textract_query_number (int, optional): The number of textract queries up until this point.
|
|
|
151 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
152 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
153 |
|
@@ -211,9 +213,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
211 |
print("Completed last file")
|
212 |
current_loop_page = 0
|
213 |
|
214 |
-
if isinstance(out_message, list):
|
215 |
combined_out_message = combined_out_message + '\n'.join(out_message)
|
216 |
-
|
217 |
combined_out_message = combined_out_message + '\n' + out_message
|
218 |
|
219 |
# Only send across review file if redaction has been done
|
@@ -226,7 +228,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
226 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
227 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
228 |
|
229 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
|
230 |
|
231 |
#if first_loop_state == False:
|
232 |
# Prepare documents and images as required if they don't already exist
|
@@ -257,7 +259,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
257 |
# Call prepare_image_or_pdf only if needed
|
258 |
if prepare_images_flag is not None:# and first_loop_state==True:
|
259 |
#print("Calling preparation function. prepare_images_flag:", prepare_images_flag)
|
260 |
-
out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state = prepare_image_or_pdf(
|
261 |
file_paths_loop, text_extraction_method, 0, out_message, True,
|
262 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
263 |
output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
|
@@ -279,7 +281,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
279 |
|
280 |
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
281 |
current_loop_page = 999
|
282 |
-
|
|
|
283 |
|
284 |
# Only send across review file if redaction has been done
|
285 |
if pii_identification_method != no_redaction_option:
|
@@ -288,7 +291,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
288 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
289 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
290 |
|
291 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
|
292 |
|
293 |
# Load/create allow list
|
294 |
# If string, assume file path
|
@@ -513,14 +516,14 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
513 |
all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
|
514 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
515 |
|
516 |
-
|
517 |
|
518 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
519 |
|
520 |
-
all_line_level_ocr_results_df.to_csv(
|
521 |
-
out_file_paths.append(
|
522 |
|
523 |
-
duplication_file_path_outputs.append(
|
524 |
|
525 |
# Convert the gradio annotation boxes to relative coordinates
|
526 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
@@ -543,9 +546,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
543 |
out_file_paths.append(review_file_path)
|
544 |
|
545 |
# Make a combined message for the file
|
546 |
-
if isinstance(out_message, list):
|
547 |
combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
|
548 |
-
|
|
|
549 |
|
550 |
toc = time.perf_counter()
|
551 |
time_taken = toc - tic
|
@@ -588,7 +592,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
588 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
589 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
590 |
|
591 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number
|
592 |
|
593 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
594 |
'''
|
|
|
100 |
review_file_path:str="",
|
101 |
input_folder:str=INPUT_FOLDER,
|
102 |
textract_query_number:int=0,
|
103 |
+
ocr_file_path:str="",
|
104 |
prepare_images:bool=True,
|
105 |
progress=gr.Progress(track_tqdm=True)):
|
106 |
'''
|
|
|
149 |
- review_file_path (str, optional): The latest review file path created by the app
|
150 |
- input_folder (str, optional): The custom input path, if provided
|
151 |
- textract_query_number (int, optional): The number of textract queries up until this point.
|
152 |
+
- ocr_file_path (str, optional): The latest ocr file path created by the app
|
153 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
154 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
155 |
|
|
|
213 |
print("Completed last file")
|
214 |
current_loop_page = 0
|
215 |
|
216 |
+
if isinstance(out_message, list) and out_message:
|
217 |
combined_out_message = combined_out_message + '\n'.join(out_message)
|
218 |
+
elif out_message:
|
219 |
combined_out_message = combined_out_message + '\n' + out_message
|
220 |
|
221 |
# Only send across review file if redaction has been done
|
|
|
228 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
229 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
230 |
|
231 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
|
232 |
|
233 |
#if first_loop_state == False:
|
234 |
# Prepare documents and images as required if they don't already exist
|
|
|
259 |
# Call prepare_image_or_pdf only if needed
|
260 |
if prepare_images_flag is not None:# and first_loop_state==True:
|
261 |
#print("Calling preparation function. prepare_images_flag:", prepare_images_flag)
|
262 |
+
out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df = prepare_image_or_pdf(
|
263 |
file_paths_loop, text_extraction_method, 0, out_message, True,
|
264 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
265 |
output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
|
|
|
281 |
|
282 |
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
283 |
current_loop_page = 999
|
284 |
+
if out_message:
|
285 |
+
combined_out_message = combined_out_message + "\n" + out_message
|
286 |
|
287 |
# Only send across review file if redaction has been done
|
288 |
if pii_identification_method != no_redaction_option:
|
|
|
291 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
292 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
293 |
|
294 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
|
295 |
|
296 |
# Load/create allow list
|
297 |
# If string, assume file path
|
|
|
516 |
all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
|
517 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
518 |
|
519 |
+
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
520 |
|
521 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
522 |
|
523 |
+
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
524 |
+
out_file_paths.append(ocr_file_path)
|
525 |
|
526 |
+
duplication_file_path_outputs.append(ocr_file_path)
|
527 |
|
528 |
# Convert the gradio annotation boxes to relative coordinates
|
529 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
|
|
546 |
out_file_paths.append(review_file_path)
|
547 |
|
548 |
# Make a combined message for the file
|
549 |
+
if isinstance(out_message, list) and out_message:
|
550 |
combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
|
551 |
+
elif out_message:
|
552 |
+
combined_out_message = combined_out_message + '\n' + out_message
|
553 |
|
554 |
toc = time.perf_counter()
|
555 |
time_taken = toc - tic
|
|
|
592 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
593 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
594 |
|
595 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
|
596 |
|
597 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
598 |
'''
|
tools/helper_functions.py
CHANGED
@@ -53,7 +53,7 @@ def load_in_default_cost_codes(cost_codes_path:str):
|
|
53 |
|
54 |
out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
|
55 |
|
56 |
-
return cost_codes_df, out_dropdown
|
57 |
|
58 |
def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
|
59 |
if enforce_cost_code_textbox == "True":
|
@@ -485,4 +485,10 @@ def calculate_time_taken(number_of_pages:str,
|
|
485 |
calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
|
486 |
|
487 |
return calculated_time_taken
|
|
|
|
|
|
|
|
|
|
|
|
|
488 |
|
|
|
53 |
|
54 |
out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
|
55 |
|
56 |
+
return cost_codes_df, cost_codes_df, out_dropdown
|
57 |
|
58 |
def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
|
59 |
if enforce_cost_code_textbox == "True":
|
|
|
485 |
calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
|
486 |
|
487 |
return calculated_time_taken
|
488 |
+
|
489 |
+
def reset_base_dataframe(df:pd.DataFrame):
|
490 |
+
return df
|
491 |
+
|
492 |
+
def reset_ocr_base_dataframe(df:pd.DataFrame):
|
493 |
+
return df.iloc[:, [0,1]]
|
494 |
|
tools/redaction_review.py
CHANGED
@@ -114,7 +114,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
114 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
115 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
116 |
|
117 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
|
118 |
|
119 |
recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
|
120 |
|
@@ -151,7 +151,7 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
|
|
151 |
|
152 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
153 |
|
154 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
|
155 |
|
156 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
157 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
@@ -180,10 +180,6 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
|
|
180 |
out_image_annotations_state = current_image_annotations_state
|
181 |
out_current_page_annotator = current_page_annotator
|
182 |
|
183 |
-
print("page_sizes:", page_sizes)
|
184 |
-
|
185 |
-
review_df.to_csv(OUTPUT_FOLDER + "review_df_in_update_annotator.csv")
|
186 |
-
|
187 |
if not review_df.empty:
|
188 |
|
189 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
@@ -195,9 +191,6 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
|
|
195 |
|
196 |
return out_current_page_annotator, out_image_annotations_state
|
197 |
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
202 |
selected_rows_df: pd.DataFrame,
|
203 |
image_file_paths:List[str],
|
@@ -241,7 +234,7 @@ def update_annotator_object_and_filter_df(
|
|
241 |
recogniser_entities_dropdown_value:str="ALL",
|
242 |
page_dropdown_value:str="ALL",
|
243 |
text_dropdown_value:str="ALL",
|
244 |
-
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True),
|
245 |
zoom:int=100,
|
246 |
review_df:pd.DataFrame=[],
|
247 |
page_sizes:List[dict]=[],
|
@@ -584,6 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
584 |
output_files.append(orig_pdf_file_path)
|
585 |
|
586 |
try:
|
|
|
587 |
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
|
588 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
589 |
|
@@ -765,12 +759,21 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
765 |
def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
|
766 |
|
767 |
row_value_code = evt.row_value[0] # This is the value for cost code
|
768 |
-
row_value_label = evt.row_value[1] # This is the label number value
|
769 |
|
770 |
#row_value_df = pd.DataFrame(data={"page":[row_value_code], "label":[row_value_label]})
|
771 |
|
772 |
return row_value_code
|
773 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
774 |
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
|
775 |
'''
|
776 |
Update the colour of a single redaction box based on the values in a selection row
|
@@ -889,12 +892,12 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
|
|
889 |
annots = SubElement(xfdf, 'annots')
|
890 |
|
891 |
# Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
|
892 |
-
if page_sizes:
|
|
|
893 |
page_sizes_df = pd.DataFrame(page_sizes)
|
894 |
|
895 |
# If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
|
896 |
-
#
|
897 |
-
print("Using pymupdf coordinates for conversion.")
|
898 |
|
899 |
pages_are_images = False
|
900 |
|
|
|
114 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
115 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
116 |
|
117 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
|
118 |
|
119 |
recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
|
120 |
|
|
|
151 |
|
152 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
153 |
|
154 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
|
155 |
|
156 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
157 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
|
|
180 |
out_image_annotations_state = current_image_annotations_state
|
181 |
out_current_page_annotator = current_page_annotator
|
182 |
|
|
|
|
|
|
|
|
|
183 |
if not review_df.empty:
|
184 |
|
185 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
|
|
191 |
|
192 |
return out_current_page_annotator, out_image_annotations_state
|
193 |
|
|
|
|
|
|
|
194 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
195 |
selected_rows_df: pd.DataFrame,
|
196 |
image_file_paths:List[str],
|
|
|
234 |
recogniser_entities_dropdown_value:str="ALL",
|
235 |
page_dropdown_value:str="ALL",
|
236 |
text_dropdown_value:str="ALL",
|
237 |
+
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400),
|
238 |
zoom:int=100,
|
239 |
review_df:pd.DataFrame=[],
|
240 |
page_sizes:List[dict]=[],
|
|
|
577 |
output_files.append(orig_pdf_file_path)
|
578 |
|
579 |
try:
|
580 |
+
print("Saving review file.")
|
581 |
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
|
582 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
583 |
|
|
|
759 |
def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
|
760 |
|
761 |
row_value_code = evt.row_value[0] # This is the value for cost code
|
762 |
+
#row_value_label = evt.row_value[1] # This is the label number value
|
763 |
|
764 |
#row_value_df = pd.DataFrame(data={"page":[row_value_code], "label":[row_value_label]})
|
765 |
|
766 |
return row_value_code
|
767 |
|
768 |
+
def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
|
769 |
+
|
770 |
+
row_value_page = evt.row_value[0] # This is the page_number value
|
771 |
+
row_value_text = evt.row_value[1] # This is the text contents
|
772 |
+
|
773 |
+
row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
|
774 |
+
|
775 |
+
return row_value_page, row_value_df
|
776 |
+
|
777 |
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
|
778 |
'''
|
779 |
Update the colour of a single redaction box based on the values in a selection row
|
|
|
892 |
annots = SubElement(xfdf, 'annots')
|
893 |
|
894 |
# Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
|
895 |
+
if page_sizes:
|
896 |
+
|
897 |
page_sizes_df = pd.DataFrame(page_sizes)
|
898 |
|
899 |
# If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
|
900 |
+
#print("Using pymupdf coordinates for conversion.")
|
|
|
901 |
|
902 |
pages_are_images = False
|
903 |
|