seanpedrickcase committed
Commit ed5f8c7 · Parent(s): 4276db1

Implemented Textract document API calls and associated output tracking/download. Fixes to config and cost code implementation. General minor bug fixes.
Dockerfile CHANGED
@@ -1,5 +1,5 @@
 # Stage 1: Build dependencies and download models
-FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder

 # Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
@@ -27,7 +27,7 @@ COPY lambda_entrypoint.py .
 COPY entrypoint.sh .

 # Stage 2: Final runtime image
-FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm

 # Define a build argument with a default value
 ARG APP_MODE=gradio
app.py CHANGED
@@ -1,19 +1,21 @@
 import os
+import logging
 import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator

-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
-from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
+from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
+from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id

 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
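Note: tools/textract_batch_call.py itself is not part of this diff. For orientation, a minimal sketch of the asynchronous Textract flow that analyse_document_with_textract_api presumably wraps (the helper's real signature is not shown; bucket/key names here are assumptions):

```python
import boto3

def start_textract_job(bucket: str, key: str, output_prefix: str,
                       analyse_signatures: bool = True) -> str:
    """Submit a document already uploaded to S3; return the Textract job ID."""
    textract = boto3.client("textract")
    document = {"S3Object": {"Bucket": bucket, "Name": key}}
    output = {"S3Bucket": bucket, "S3Prefix": output_prefix}
    if analyse_signatures:
        # "document_analysis" job type: supports feature types such as SIGNATURES
        response = textract.start_document_analysis(
            DocumentLocation=document, FeatureTypes=["SIGNATURES"], OutputConfig=output)
    else:
        # "document_text_detection" job type: plain text extraction only
        response = textract.start_document_text_detection(
            DocumentLocation=document, OutputConfig=output)
    return response["JobId"]
```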
@@ -58,14 +60,16 @@ with app:

     # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
     pdf_doc_state = gr.State([])
-    all_image_annotations_state = gr.State([])
+    all_image_annotations_state = gr.State([])


     all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
     review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)

     session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
+    host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
     s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
+    session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
     output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
     input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
@@ -133,6 +137,7 @@ with app:

     clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
     prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
+    prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
     prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)

     ## Settings page variables
@@ -149,18 +154,29 @@ with app:
     s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)

+    s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
+    s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+    s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
+    successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
+
+    load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+    s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+    local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
+
     s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
     default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
     enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
+    default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)

     # Base tables that are not modified subsequent to load
     recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
     all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
+    all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
     cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)

     # Duplicate page detection
     in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
-    duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
+    duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)

     # Tracking variables for current page (not visible)
     current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -168,7 +184,7 @@ with app:

     # Placeholders for elements that may be made visible later below depending on environment variables
     cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
-    cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=False)
+    cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)

     textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
     total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
@@ -177,6 +193,22 @@ with app:

     only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)

+    # Textract API call placeholders in case option not selected in config
+
+    job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
+    send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
+
+    job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
+    check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
+    job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
+    job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
+    textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
+    selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
+    is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
+    job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+
+    textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)

     ###
     # UI DESIGN
     ###
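The placeholders above mirror the job-tracking workflow this commit adds: submit a job, record its ID, then check its status and download the output when ready. A sketch of the poll-and-download step that poll_bulk_textract_analysis_progress_and_download presumably performs (the function shape is an assumption; only the boto3 calls are standard):

```python
import json
import boto3

def check_and_fetch(job_id: str, job_type: str, out_path: str) -> str:
    """Return the job status; when SUCCEEDED, save all result pages locally."""
    textract = boto3.client("textract")
    getter = (textract.get_document_analysis if job_type == "document_analysis"
              else textract.get_document_text_detection)
    response = getter(JobId=job_id)
    status = response["JobStatus"]  # IN_PROGRESS, SUCCEEDED or FAILED
    if status == "SUCCEEDED":
        pages = [response]
        while "NextToken" in response:  # Textract paginates large results
            response = getter(JobId=job_id, NextToken=response["NextToken"])
            pages.append(response)
        with open(out_path, "w") as f:
            json.dump(pages, f)
    return status
```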
@@ -199,32 +231,21 @@ with app:
         with gr.Accordion("Redact document", open = True):
             in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)

-            text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
-
-            with gr.Row(equal_height=True):
-                pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
+            text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])

             with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
-                handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
+                handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])

-            if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
-                with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
-                    with gr.Row(equal_height=True):
-                        job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=True)
-                        send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=True)
-                    with gr.Row(equal_height=True):
-                        check_state_of_textract_api__call_btn = gr.Button("Check state of Textract job", variant="secondary", visible=True)
-                        job_current_status = gr.Textbox(value="", label="job_current_status", visible=True)
-                    with gr.Row(equal_height=True):
-                        textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
+            with gr.Row(equal_height=True):
+                pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])

             if SHOW_COSTS == "True":
                 with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
-                    with gr.Row(equal_height=True):
-                        textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
-                        total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
-                        estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
-                        estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
+                    with gr.Row(equal_height=True):
+                        textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
+                        total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
+                        estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
+                        estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)

             if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
                 with gr.Accordion("Apply cost code", open = True, visible=True):
@@ -232,19 +253,32 @@ with app:
                     cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
                     with gr.Column():
                         reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
-                        cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
+                        cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
+
+            if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
+                with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
+                    with gr.Row(equal_height=True):
+                        gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
+                    with gr.Row(equal_height=True):
+                        send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
+                    with gr.Row(equal_height=False):
+                        with gr.Column(scale=2):
+                            textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
+                        with gr.Column(scale=1):
+                            job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
+                            check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
+                    with gr.Row():
+                        job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+                        textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)

             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
             document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)

             with gr.Row():
-                output_summary = gr.Textbox(label="Output summary", scale=1)
+                redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
                 output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
                 latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)

-            with gr.Row():
-                convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
-
             # Feedback elements are invisible until revealed by redaction action
             pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
             pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
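enforce_cost_codes sits first in the redaction event chain (see the `@@ -460` hunk below): Gradio only runs the following .success() steps if the previous step did not raise. A hypothetical sketch of that gate (the real helper is in tools/helper_functions.py and is not shown here):

```python
import gradio as gr

def enforce_cost_codes_sketch(enforce_cost_codes_flag: str, chosen_code: str) -> None:
    """Raise before redaction starts if cost codes are enforced but none is chosen."""
    if enforce_cost_codes_flag == "True" and not chosen_code:
        raise gr.Error("Please select a cost code before running a redaction.")
```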
@@ -263,21 +297,16 @@ with app:
                 annotate_zoom_in = gr.Button("Zoom in", visible=False)
                 annotate_zoom_out = gr.Button("Zoom out", visible=False)
             with gr.Row():
-                clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+                clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)

-            with gr.Row(equal_height=True):
+            with gr.Row():
                 with gr.Column(scale=2):
                     with gr.Row(equal_height=True):
                         annotation_last_page_button = gr.Button("Previous page", scale = 4)
-                        annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
-                        annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
+                        annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
+                        annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
                         annotation_next_page_button = gr.Button("Next page", scale = 4)
-                    with gr.Column(scale=1):
-                        annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
-

-            with gr.Row():
-                with gr.Column(scale=2):
                     zoom_str = str(annotator_zoom_number) + '%'

                     annotator = image_annotator(
@@ -297,7 +326,15 @@ with app:
                         handles_cursor=True,
                         interactive=False
                     )
+
+                    with gr.Row(equal_height=True):
+                        annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
+                        annotate_current_page_bottom = gr.Number(value=0, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
+                        annotate_max_pages_bottom = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
+                        annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
+
                 with gr.Column(scale=1):
+                    annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
                     update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
                     with gr.Accordion("Search suggested redactions", open=True):
                         with gr.Row(equal_height=True):
@@ -318,17 +355,7 @@ with app:

                     with gr.Accordion("Search all extracted text", open=True):
                         all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
-                        reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    with gr.Row(equal_height=True):
-                        annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
-                        annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
-                        annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
-                        annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
-                    with gr.Column(scale=1):
-                        blank_markdown_bot = gr.Markdown(value="", label="")
+                        reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")

             with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
                 convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -432,7 +459,9 @@ with app:
         all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
         all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)

+    ###
     ### UI INTERACTION ###
+    ###

     ###
     # PDF/IMAGE REDACTION
@@ -440,7 +469,7 @@ with app:
     # Recalculate estimated costs based on changes to inputs
     if SHOW_COSTS == 'True':
         # Calculate costs
-        total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
+        total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
         text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
         pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
         handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
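All of these .change() hooks share one input/output signature, so the wiring could equally be expressed as a loop over the trigger components (a sketch using names defined earlier in app.py; the hunk shows four of the triggers, and whether every component in the list is wired this way in the full file is an assumption):

```python
cost_inputs = [total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox,
               pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio]
for trigger in cost_inputs:
    # Any change to an input that affects price re-runs the estimate
    trigger.change(calculate_aws_costs, inputs=cost_inputs, outputs=[estimated_aws_costs_number])
```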
@@ -460,31 +489,42 @@ with app:
        cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
        reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])

+    cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
+
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-    success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
+    success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])

     # Run redaction function
-    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
-    success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
-    success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
-    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
+    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
+    success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
+    success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
     success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

     # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
-    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
     success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
-    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
     success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
     success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])

     # If the line level ocr results are changed by load in by user or by a new redaction task, replace the ocr results displayed in the table
     all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

     ###
     # REVIEW PDF REDACTIONS
@@ -493,7 +533,7 @@ with app:
     # Upload previous files for modifying redactions
     upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
     success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-    success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

     # Page number controls
@@ -501,11 +541,11 @@ with app:
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
     success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])

-    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page])
-    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page])

-    annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])
-    annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])

     annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
@@ -552,12 +592,12 @@ with app:

     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-    success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
     success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])

     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-    success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
     success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)

     ###
@@ -601,7 +641,15 @@ with app:
     ###

     # Get connection details on app load
-    app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox])

     # If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
     if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
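download_file_from_s3 (tools/aws_functions.py) is outside this diff; the standard boto3 pattern it presumably wraps for fetching the default allow list or cost code file:

```python
import boto3

def download_file_from_s3_sketch(bucket: str, key: str, local_path: str) -> None:
    """Fetch e.g. the default allow list or cost code CSV from S3 to a local path."""
    s3 = boto3.client("s3")
    s3.download_file(bucket, key, local_path)
    print(f"Downloaded s3://{bucket}/{key} to {local_path}")
```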
@@ -615,20 +663,23 @@ with app:
     else: print("Could not load in default allow list")

     # If relevant environment variable is set, load in the default cost code file from S3 or locally
-    if GET_COST_CODES == "True" and COST_CODES_PATH:
         if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
             app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
-            success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
             print("Successfully loaded cost codes from S3")
         elif os.path.exists(COST_CODES_PATH):
             print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
-            app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
         else: print("Could not load in cost code data")

     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
-    access_callback.setup([session_hash_textbox], ACCESS_LOGS_FOLDER)
-    session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
     success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

     # User submitted feedback for pdf redactions
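The access-log wiring above follows Gradio's CSVLogger pattern: set up a dataset file for a list of components, append one row per event via flag(), then push the CSV to S3. A generic sketch using Gradio's built-in logger (CSVLogger_custom itself is not shown in this diff, so its exact behaviour is an assumption):

```python
from gradio import CSVLogger

access_logger = CSVLogger()
access_logger.setup([session_hash_textbox], "logs/access/")   # writes the CSV header
session_hash_textbox.change(
    lambda *args: access_logger.flag(list(args)),             # append one row per session
    [session_hash_textbox], None, preprocess=False,
)
```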
@@ -647,16 +698,23 @@ with app:
     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)

     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
-        usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
-        usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

-if __name__ == "__main__":

     if RUN_DIRECT_MODE == "0":

         if os.environ['COGNITO_AUTH'] == "1":
@@ -667,7 +725,7 @@ if __name__ == "__main__":
     else:
         from tools.cli_redact import main

-        main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
         current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
 
1
  import os
2
+ import logging
3
  import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
8
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
9
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
12
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
13
  from tools.data_anonymise import anonymise_data_files
14
  from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
 
60
 
61
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
62
  pdf_doc_state = gr.State([])
63
+ all_image_annotations_state = gr.State([])
64
 
65
 
66
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
67
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
68
 
69
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
70
+ host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
71
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
72
+ session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
73
  output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
74
  input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
75
 
 
137
 
138
  clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
139
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
140
+ prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
141
  prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)
142
 
143
  ## Settings page variables
 
154
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
155
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
156
 
157
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
158
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
159
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
160
+ successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
161
+
162
+ load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
163
+ s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
164
+ local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
165
+
166
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
167
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
168
  enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
169
+ default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
170
 
171
  # Base tables that are not modified subsequent to load
172
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
173
  all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
174
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
175
  cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
176
 
177
  # Duplicate page detection
178
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
179
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
180
 
181
  # Tracking variables for current page (not visible)
182
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
184
 
185
  # Placeholders for elements that may be made visible later below depending on environment variables
186
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
187
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
188
 
189
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
190
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
 
193
 
194
  only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)
195
 
196
+ # Textract API call placeholders in case option not selected in config
197
+
198
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
199
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
200
+
201
+ job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
202
+ check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
203
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
204
+ job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
205
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
206
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
207
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
208
+ job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
209
+
210
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
211
+
212
  ###
213
  # UI DESIGN
214
  ###
 
231
  with gr.Accordion("Redact document", open = True):
232
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
233
 
234
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 
 
235
 
236
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
237
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
238
 
239
+ with gr.Row(equal_height=True):
240
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 
 
 
 
 
 
 
241
 
242
  if SHOW_COSTS == "True":
243
  with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
244
+ with gr.Row(equal_height=True):
245
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
246
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
247
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
248
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
249
 
250
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
251
  with gr.Accordion("Apply cost code", open = True, visible=True):
 
253
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
254
  with gr.Column():
255
  reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
256
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
257
+
258
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
259
+ with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
260
+ with gr.Row(equal_height=True):
261
+ gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
262
+ with gr.Row(equal_height=True):
263
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
264
+ with gr.Row(equal_height=False):
265
+ with gr.Column(scale=2):
266
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
267
+ with gr.Column(scale=1):
268
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
269
+ check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
270
+ with gr.Row():
271
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
272
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
273
 
274
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
275
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
276
 
277
  with gr.Row():
278
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
279
  output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
280
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
281
 
 
 
 
282
  # Feedback elements are invisible until revealed by redaction action
283
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
284
  pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
 
297
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
298
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
299
  with gr.Row():
300
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
301
 
302
+ with gr.Row():
303
  with gr.Column(scale=2):
304
  with gr.Row(equal_height=True):
305
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
306
+ annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
307
+ annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
308
  annotation_next_page_button = gr.Button("Next page", scale = 4)
 
 
 
309
 
 
 
310
  zoom_str = str(annotator_zoom_number) + '%'
311
 
312
  annotator = image_annotator(
 
326
  handles_cursor=True,
327
  interactive=False
328
  )
329
+
330
+ with gr.Row(equal_height=True):
331
+ annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
332
+ annotate_current_page_bottom = gr.Number(value=0, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
333
+ annotate_max_pages_bottom = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
334
+ annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
335
+
336
  with gr.Column(scale=1):
337
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
338
  update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
339
  with gr.Accordion("Search suggested redactions", open=True):
340
  with gr.Row(equal_height=True):
 
355
 
356
  with gr.Accordion("Search all extracted text", open=True):
357
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
358
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
 
 
 
 
 
 
 
 
 
359
 
360
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
361
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
459
  all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
460
  all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)
461
 
462
+ ###
463
  ### UI INTERACTION ###
464
+ ###
465
 
466
  ###
467
  # PDF/IMAGE REDACTION
 
469
  # Recalculate estimated costs based on changes to inputs
470
  if SHOW_COSTS == 'True':
471
  # Calculate costs
472
+ total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
473
  text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
474
  pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
475
  handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
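
The four .change hooks above all feed the same estimator, so any input that affects the price refreshes the displayed figure. A minimal sketch of the idea behind such a page-based estimator (the per-page rates below are illustrative placeholders, not the app's real figures):

    # Hypothetical cost estimator; rates are placeholders for illustration only.
    def estimate_aws_costs(page_count: int, use_textract: bool, use_comprehend: bool,
                           textract_rate: float = 0.0015, comprehend_rate: float = 0.0001) -> float:
        cost = 0.0
        if use_textract:
            cost += page_count * textract_rate    # per-page text extraction charge
        if use_comprehend:
            cost += page_count * comprehend_rate  # rough per-page PII detection charge
        return round(cost, 4)
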
 
489
  cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
490
  reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
491
 
492
+ cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
493
+
494
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
495
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
496
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
497
 
498
  # Run redaction function
499
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
500
+ success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
501
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
502
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
503
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
504
 
505
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
506
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
507
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
508
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
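
The current_loop_page_number.change hook below turns choose_and_run_redactor into a loop: when the function stops at a page break it writes a new page number, which re-fires the same handler until the document is finished. A stripped-down sketch of that Gradio pattern (component names here are illustrative):

    import gradio as gr

    def process_batch(current_page, max_pages):
        # Do a batch of work, then report where we stopped; returning an
        # unchanged value stops the .change event from firing again.
        next_page = min(int(current_page) + 10, int(max_pages))
        status = "done" if next_page >= max_pages else "in progress"
        return next_page, status

    with gr.Blocks() as demo:
        current_page = gr.Number(value=0)
        max_pages = gr.Number(value=100)
        status = gr.Textbox()
        current_page.change(process_batch, [current_page, max_pages], [current_page, status])
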
509
 
510
  # If a file has been completed, the function will continue onto the next document
511
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
512
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
513
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
514
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
515
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
516
 
517
  # If the line-level OCR results are changed by a user load or by a new redaction task, replace the OCR results displayed in the table
518
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
519
+
520
+ # Send whole document to Textract for text extraction
521
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
522
+
523
+ check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
524
+ success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
525
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
526
+
527
+ textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
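
These handlers wrap Textract's asynchronous document API: a job is started against a file in S3, a JobId is returned immediately, and status and results are polled for afterwards. A minimal boto3 sketch of that flow (bucket, key and feature choices are placeholders, not the app's exact call):

    import time
    import boto3

    textract = boto3.client("textract", region_name="eu-west-2")

    # Start an asynchronous analysis job for a PDF already uploaded to S3.
    start = textract.start_document_analysis(
        DocumentLocation={"S3Object": {"Bucket": "my-bucket", "Name": "input/my_doc.pdf"}},
        FeatureTypes=["SIGNATURES"],  # e.g. when signature extraction is requested
    )
    job_id = start["JobId"]

    # Poll until the job leaves the IN_PROGRESS state, then fetch the results.
    while True:
        result = textract.get_document_analysis(JobId=job_id)
        if result["JobStatus"] in ("SUCCEEDED", "FAILED"):
            break
        time.sleep(5)
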
528
 
529
  ###
530
  # REVIEW PDF REDACTIONS
 
533
  # Upload previous files for modifying redactions
534
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
535
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
536
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
537
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
538
 
539
  # Page number controls
 
541
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
542
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
543
 
544
+ annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
545
+ annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
546
 
547
+ annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
548
+ annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
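
Both rows of navigation buttons share the same two handlers, which keep the top and bottom page-number boxes in sync. A plausible sketch of what decrease_page/increase_page could look like (a guess for illustration, not the actual implementations in tools/redaction_review):

    def decrease_page(current_page: int):
        # Clamp at the first page; return the same value for both number boxes.
        new_page = max(1, int(current_page) - 1)
        return new_page, new_page

    def increase_page(current_page: int, all_image_annotations: list):
        # Clamp at the last annotated page.
        new_page = min(len(all_image_annotations), int(current_page) + 1)
        return new_page, new_page
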
549
 
550
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
551
 
 
592
 
593
  # Convert review file to xfdf Adobe format
594
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
595
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
596
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
597
 
598
  # Convert xfdf Adobe file back to review_file.csv
599
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
600
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
601
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
602
 
603
  ###
 
641
  ###
642
 
643
  # Get connection details on app load
644
+
645
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
646
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
647
+ success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
648
+ else:
649
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder])
650
+
651
+
652
+ # If relevant environment variable is set, load in the Textract job details
653
 
654
  # If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting the S3 path, a local path is still needed to provide a download location
655
  if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
 
663
  else: print("Could not load in default allow list")
664
 
665
  # If relevant environment variable is set, load in the default cost code file from S3 or locally
666
+ if GET_COST_CODES == "True" and COST_CODES_PATH:
667
  if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
668
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
669
+ success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
670
  print("Successfully loaded cost codes from S3")
671
  elif os.path.exists(COST_CODES_PATH):
672
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
673
+ app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
674
  else: print("Could not load in cost code data")
675
 
676
+ ### LOGGING
677
+
678
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
679
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
680
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
681
+
682
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
683
  success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
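
The access-logging wiring above follows a log-then-sync pattern: each new session hash appends a row to a local CSV, and the success handler pushes that CSV to S3. A minimal sketch of the setup()/flag() shape this relies on (illustrative only, not the app's CSVLogger_custom):

    import csv
    import os
    from datetime import datetime

    class MiniCSVLogger:
        def setup(self, headers, folder):
            os.makedirs(folder, exist_ok=True)
            self.path = os.path.join(folder, "log.csv")
            if not os.path.exists(self.path):
                with open(self.path, "w", newline="") as f:
                    csv.writer(f).writerow(list(headers) + ["timestamp"])

        def flag(self, row):
            # Append one row per event; the S3 upload happens in a separate step.
            with open(self.path, "a", newline="") as f:
                csv.writer(f).writerow(list(row) + [datetime.now().isoformat()])
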
684
 
685
  # User submitted feedback for pdf redactions
 
698
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
699
 
700
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
701
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
702
+
703
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
704
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
705
+
706
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
707
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
708
  else:
709
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
710
+
711
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
712
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
713
 
714
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
715
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
716
 
717
+ if __name__ == "__main__":
718
  if RUN_DIRECT_MODE == "0":
719
 
720
  if os.environ['COGNITO_AUTH'] == "1":
 
725
  else:
726
  from tools.cli_redact import main
727
 
728
+ main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
729
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
730
  current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
731
 
requirements.txt CHANGED
@@ -13,7 +13,7 @@ spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
  gradio==5.23.3
16
- boto3==1.37.17
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
 
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
  gradio==5.23.3
16
+ boto3==1.37.29
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
tools/aws_functions.py CHANGED
@@ -30,129 +30,90 @@ if RUN_AWS_FUNCTIONS == "1":
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
- print("Assumed Role ARN:", assumed_role_arn)
34
- print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
- def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str):
41
 
42
- s3 = boto3.client('s3', region_name=AWS_REGION)
43
- s3.download_file(bucket_name, key, local_file_path_and_name)
44
- print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
45
 
46
- def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
47
  """
48
  Download all files from an S3 folder to a local folder.
49
  """
50
- s3 = boto3.client('s3', region_name=AWS_REGION)
 
51
 
52
- # List objects in the specified S3 folder
53
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
54
 
55
- # Download each object
56
- for obj in response.get('Contents', []):
57
- # Extract object key and construct local file path
58
- object_key = obj['Key']
59
- local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
60
 
61
- # Create directories if necessary
62
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
63
 
64
- # Download the object
65
- try:
66
- s3.download_file(bucket_name, object_key, local_file_path)
67
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
68
- except Exception as e:
69
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
70
 
71
- def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
72
  """
73
  Download specific files from an S3 folder to a local folder.
74
  """
75
- s3 = boto3.client('s3', region_name=AWS_REGION)
76
-
77
- print("Trying to download file: ", filenames)
78
-
79
- if filenames == '*':
80
- # List all objects in the S3 folder
81
- print("Trying to download all files in AWS folder: ", s3_folder)
82
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
83
-
84
- print("Found files in AWS folder: ", response.get('Contents', []))
85
 
86
- filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
 
87
 
88
- print("Found filenames in AWS folder: ", filenames)
89
 
90
- for filename in filenames:
91
- object_key = os.path.join(s3_folder, filename)
92
- local_file_path = os.path.join(local_folder, filename)
93
 
94
- # Create directories if necessary
95
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
 
96
 
97
- # Download the object
98
- try:
99
- s3.download_file(bucket_name, object_key, local_file_path)
100
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
101
- except Exception as e:
102
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
103
-
104
- def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
105
-
106
- temp_dir = tempfile.mkdtemp()
107
- local_address_stub = temp_dir + '/doc-redaction/'
108
- files = []
109
-
110
- if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
111
- out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
112
- return files, out_message
113
-
114
- if aws_password:
115
- if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:
116
 
117
- s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'
118
 
119
- local_folder_path = local_address_stub
120
 
121
- # Check if folder exists
122
- if not os.path.exists(local_folder_path):
123
- print(f"Folder {local_folder_path} does not exist! Making folder.")
124
 
125
- os.mkdir(local_folder_path)
 
126
 
127
- # Check if folder is empty
128
- if len(os.listdir(local_folder_path)) == 0:
129
- print(f"Folder {local_folder_path} is empty")
130
- # Download data
131
- download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
 
132
 
133
- print("AWS data downloaded")
134
 
135
- else:
136
- print(f"Folder {local_folder_path} is not empty")
137
-
138
- #files = os.listdir(local_folder_stub)
139
- #print(files)
140
-
141
- files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
142
-
143
- out_message = "Data successfully loaded from AWS"
144
- print(out_message)
145
-
146
- else:
147
- out_message = "Data not loaded from AWS"
148
- print(out_message)
149
- else:
150
- out_message = "No password provided. Please ask the data team for access if you need this."
151
- print(out_message)
152
-
153
- return files, out_message
154
-
155
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
156
  """
157
  Uploads a file from local machine to Amazon S3.
158
 
@@ -165,33 +126,44 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
165
  - Message as variable/printed to console
166
  """
167
  final_out_message = []
 
168
 
169
- s3_client = boto3.client('s3', region_name=AWS_REGION)
 
 
170
 
171
- if isinstance(local_file_paths, str):
172
- local_file_paths = [local_file_paths]
173
 
174
- for file in local_file_paths:
175
- if s3_client:
176
- #print(s3_client)
177
- try:
178
- # Get file name off file path
179
- file_name = os.path.basename(file)
180
 
181
- s3_key_full = s3_key + file_name
182
- print("S3 key: ", s3_key_full)
183
 
184
- s3_client.upload_file(file, s3_bucket, s3_key_full)
185
- out_message = "File " + file_name + " uploaded successfully!"
186
- print(out_message)
187
-
188
- except Exception as e:
189
- out_message = f"Error uploading file(s): {e}"
190
- print(out_message)
191
 
192
- final_out_message.append(out_message)
193
- final_out_message_str = '\n'.join(final_out_message)
194
 
195
- else: final_out_message_str = "Could not connect to AWS."
196
 
197
  return final_out_message_str
 
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
+ #print("Assumed Role ARN:", assumed_role_arn)
34
+ #print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
+ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
41
+
42
+ if RUN_AWS_FUNCTIONS == "1":
43
+ s3 = boto3.client('s3', region_name=AWS_REGION)
44
+ s3.download_file(bucket_name, key, local_file_path_and_name)
45
+ print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
46
 
47
 
48
+ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
49
  """
50
  Download all files from an S3 folder to a local folder.
51
  """
52
+ if RUN_AWS_FUNCTIONS == "1":
53
+ if bucket_name and s3_folder and local_folder:
54
 
55
+ s3 = boto3.client('s3', region_name=AWS_REGION)
 
56
 
57
+ # List objects in the specified S3 folder
58
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
59
 
60
+ # Download each object
61
+ for obj in response.get('Contents', []):
62
+ # Extract object key and construct local file path
63
+ object_key = obj['Key']
64
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
65
 
66
+ # Create directories if necessary
67
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
68
+
69
+ # Download the object
70
+ try:
71
+ s3.download_file(bucket_name, object_key, local_file_path)
72
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
73
+ except Exception as e:
74
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
75
+ else: print("One or more required variables are empty, could not download from S3")
76
 
77
+ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str], RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
78
  """
79
  Download specific files from an S3 folder to a local folder.
80
  """
81
 
82
+ if RUN_AWS_FUNCTIONS == "1":
83
+ if bucket_name and s3_folder and local_folder and filenames:
84
 
85
+ s3 = boto3.client('s3', region_name=AWS_REGION)
86
 
87
+ print("Trying to download file: ", filenames)
88
 
89
+ if filenames == '*':
90
+ # List all objects in the S3 folder
91
+ print("Trying to download all files in AWS folder: ", s3_folder)
92
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
93
 
94
+ print("Found files in AWS folder: ", response.get('Contents', []))
95
 
96
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
97
 
98
+ print("Found filenames in AWS folder: ", filenames)
99
 
100
+ for filename in filenames:
101
+ object_key = os.path.join(s3_folder, filename)
102
+ local_file_path = os.path.join(local_folder, filename)
103
 
104
+ # Create directories if necessary
105
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
106
 
107
+ # Download the object
108
+ try:
109
+ s3.download_file(bucket_name, object_key, local_file_path)
110
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
111
+ except Exception as e:
112
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
113
 
114
+ else: print("One or more required variables are empty, could not download from S3")
115
 
116
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
117
  """
118
  Uploads a file from local machine to Amazon S3.
119
 
 
126
  - Message as variable/printed to console
127
  """
128
  final_out_message = []
129
+ final_out_message_str = ""
130
 
131
+ if RUN_AWS_FUNCTIONS == "1":
132
+ try:
133
+ if s3_bucket and s3_key and local_file_paths:
134
 
135
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
 
136
 
137
+ if isinstance(local_file_paths, str):
138
+ local_file_paths = [local_file_paths]
139
 
140
+ for file in local_file_paths:
141
+ if s3_client:
142
+ #print(s3_client)
143
+ try:
144
+ # Get file name off file path
145
+ file_name = os.path.basename(file)
146
 
147
+ s3_key_full = s3_key + file_name
148
+ print("S3 key: ", s3_key_full)
149
 
150
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
151
+ out_message = "File " + file_name + " uploaded successfully!"
152
+ print(out_message)
153
+
154
+ except Exception as e:
155
+ out_message = f"Error uploading file(s): {e}"
156
+ print(out_message)
157
 
158
+ final_out_message.append(out_message)
159
+ final_out_message_str = '\n'.join(final_out_message)
160
+
161
+ else: final_out_message_str = "Could not connect to AWS."
162
+ else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
163
+ except Exception as e:
164
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
165
+ print(final_out_message_str)
166
+ else:
167
+ final_out_message_str = "App not set to run AWS functions"
168
 
169
  return final_out_message_str
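
With the new RUN_AWS_FUNCTIONS and empty-variable guards, the upload helper now degrades gracefully instead of raising when AWS is switched off. A hypothetical call (bucket and key values are placeholders):

    # Returns "App not set to run AWS functions" rather than raising.
    msg = upload_file_to_s3(
        local_file_paths=["output/redacted_doc.pdf"],
        s3_key="usage/20250101/host/",
        s3_bucket="my-redaction-bucket",
        RUN_AWS_FUNCTIONS="0",
    )
    print(msg)
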
tools/aws_textract.py CHANGED
@@ -6,6 +6,7 @@ import json
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
 
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
@@ -275,7 +276,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
275
 
276
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
 
278
- def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
279
  """
280
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
  """
@@ -307,7 +308,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
 
310
- textract_data = restructure_textract_output(textract_data)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
 
313
  except Exception as e:
@@ -318,7 +319,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
- def restructure_textract_output(textract_output: dict):
322
  """
323
  Reorganise Textract output from the bulk Textract analysis option on AWS
324
  into a format that works in this redaction app, reducing size.
@@ -328,10 +329,62 @@ def restructure_textract_output(textract_output: dict):
328
  # Extract total pages from DocumentMetadata
329
  document_metadata = textract_output.get("DocumentMetadata", {})
330
 
 
331
  for block in textract_output.get("Blocks", []):
332
  page_no = block.get("Page", 1) # Default to 1 if missing
333
 
334
- # Initialize page structure if not already present
 
335
  if page_no not in pages_dict:
336
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
337
 
 
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
9
+ import pandas as pd
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
11
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
12
 
 
276
 
277
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
278
 
279
+ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
280
  """
281
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
282
  """
 
308
  print("Need to convert Textract JSON to app format.")
309
  try:
310
 
311
+ textract_data = restructure_textract_output(textract_data, page_sizes_df)
312
  return textract_data, False, log_files_output_paths # Successfully converted
313
 
314
  except Exception as e:
 
319
  print("textract data:", textract_data)
320
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
321
 
322
+ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFrame):
323
  """
324
  Reorganise Textract output from the bulk Textract analysis option on AWS
325
  into a format that works in this redaction app, reducing size.
 
329
  # Extract total pages from DocumentMetadata
330
  document_metadata = textract_output.get("DocumentMetadata", {})
331
 
332
+ # For efficient lookup, set 'page' as index if it's not already
333
+ if 'page' in page_sizes_df.columns:
334
+ page_sizes_df = page_sizes_df.set_index('page')
335
+
336
  for block in textract_output.get("Blocks", []):
337
  page_no = block.get("Page", 1) # Default to 1 if missing
338
 
339
+ # --- Geometry Conversion Logic ---
340
+ try:
341
+ page_info = page_sizes_df.loc[page_no]
342
+ cb_width = page_info['cropbox_width']
343
+ cb_height = page_info['cropbox_height']
344
+ mb_width = page_info['mediabox_width']
345
+ mb_height = page_info['mediabox_height']
346
+ cb_x_offset = page_info['cropbox_x_offset']
347
+ cb_y_offset_top = page_info['cropbox_y_offset_from_top']
348
+
349
+ # Check if conversion is needed (and avoid division by zero)
350
+ needs_conversion = (
351
+ abs(cb_width - mb_width) > 1e-6 or \
352
+ abs(cb_height - mb_height) > 1e-6
353
+ ) and mb_width > 1e-6 and mb_height > 1e-6 # Avoid division by zero
354
+
355
+ if needs_conversion and 'Geometry' in block:
356
+ geometry = block['Geometry'] # Work directly on the block's geometry
357
+
358
+ # --- Convert BoundingBox ---
359
+ if 'BoundingBox' in geometry:
360
+ bbox = geometry['BoundingBox']
361
+ old_left = bbox['Left']
362
+ old_top = bbox['Top']
363
+ old_width = bbox['Width']
364
+ old_height = bbox['Height']
365
+
366
+ # Calculate absolute coordinates within CropBox
367
+ abs_cb_x = old_left * cb_width
368
+ abs_cb_y = old_top * cb_height
369
+ abs_cb_width = old_width * cb_width
370
+ abs_cb_height = old_height * cb_height
371
+
372
+ # Calculate absolute coordinates relative to MediaBox top-left
373
+ abs_mb_x = cb_x_offset + abs_cb_x
374
+ abs_mb_y = cb_y_offset_top + abs_cb_y
375
+
376
+ # Convert back to normalized coordinates relative to MediaBox
377
+ bbox['Left'] = abs_mb_x / mb_width
378
+ bbox['Top'] = abs_mb_y / mb_height
379
+ bbox['Width'] = abs_cb_width / mb_width
380
+ bbox['Height'] = abs_cb_height / mb_height
381
+ except KeyError:
382
+ print(f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block.")
383
+ # Decide how to handle missing page info: skip conversion, raise error, etc.
384
+ except ZeroDivisionError:
385
+ print(f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block.")
386
+
387
+ # Initialise page structure if not already present
388
  if page_no not in pages_dict:
389
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
390
 
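
The geometry conversion above rescales CropBox-normalised Textract coordinates into MediaBox-normalised ones: multiply out to absolute points inside the CropBox, shift by the CropBox offsets, then renormalise by the MediaBox size. A small numeric check of that arithmetic (all values invented for illustration):

    # A 500x700 CropBox sitting 50pt right and 70pt down inside a 600x840 MediaBox.
    cb_width, cb_height = 500.0, 700.0
    mb_width, mb_height = 600.0, 840.0
    cb_x_offset, cb_y_offset_top = 50.0, 70.0

    left, top = 0.2, 0.1                          # Textract bbox, relative to the CropBox
    abs_mb_x = cb_x_offset + left * cb_width      # 50 + 100 = 150pt
    abs_mb_y = cb_y_offset_top + top * cb_height  # 70 + 70 = 140pt

    print(abs_mb_x / mb_width)   # 0.25: new Left, relative to the MediaBox
    print(abs_mb_y / mb_height)  # ~0.1667: new Top, relative to the MediaBox
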
tools/config.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
  import tempfile
3
  import socket
 
4
  from datetime import datetime
5
  from dotenv import load_dotenv
6
  from tldextract import TLDExtract
7
 
8
  today_rev = datetime.now().strftime("%Y%m%d")
9
- host_name = socket.gethostname()
10
 
11
  # Set or retrieve configuration variables for the redaction app
12
 
@@ -28,28 +29,40 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
28
  return value
29
 
30
 
31
- # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
32
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
33
 
34
 
35
- if os.path.exists(APP_CONFIG_PATH):
36
- print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
37
- load_dotenv(APP_CONFIG_PATH)
 
 
 
38
 
39
  ###
40
  # AWS CONFIG
41
  ###
42
 
43
- # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
44
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
45
 
46
- if os.path.exists(AWS_CONFIG_PATH):
47
- print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
48
- load_dotenv(AWS_CONFIG_PATH)
 
 
 
49
 
50
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
51
 
52
- AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
53
 
54
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
55
 
@@ -65,14 +78,28 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
65
 
66
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Custom headers e.g. if routing traffic through Cloudfront
69
  # Retrieving or setting CUSTOM_HEADER
70
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
71
- if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
72
 
73
  # Retrieving or setting CUSTOM_HEADER_VALUE
74
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
75
- if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
76
 
77
  ###
78
  # Images config
@@ -84,8 +111,7 @@ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to No
84
  ###
85
  # File I/O config
86
  ###
87
-
88
- SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
89
 
90
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
91
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
@@ -99,12 +125,14 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
99
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
100
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
101
 
102
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + host_name + '/')
 
103
 
104
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'logs/' + today_rev + '/' + host_name + '/')
105
-
106
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'usage/' + today_rev + '/' + host_name + '/')
107
 
 
108
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
109
 
110
  ###
@@ -114,7 +142,6 @@ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
114
 
115
  POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
116
 
117
- SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
118
 
119
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
120
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
@@ -153,15 +180,16 @@ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_
153
 
154
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
155
 
156
- SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'True')
157
 
158
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
159
 
 
 
160
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
161
 
162
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
 
164
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
165
 
166
- if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
- if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
 
1
  import os
2
  import tempfile
3
  import socket
4
+ import logging
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
 
9
  today_rev = datetime.now().strftime("%Y%m%d")
10
+ HOST_NAME = socket.gethostname()
11
 
12
  # Set or retrieve configuration variables for the redaction app
13
 
 
29
  return value
30
 
31
 
32
+ # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
33
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env')
34
 
35
+ if APP_CONFIG_PATH:
36
+ if os.path.exists(APP_CONFIG_PATH):
37
+ print(f"Loading app variables from config file {APP_CONFIG_PATH}")
38
+ load_dotenv(APP_CONFIG_PATH)
39
+ else:
40
+ print("App config file not found at location:", APP_CONFIG_PATH)
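
Note the precedence this loading order gives: load_dotenv leaves variables that are already set untouched, so shell-set values beat the .env file, and both beat the in-code default passed to get_or_create_env_var. A small sketch of that interaction (file path and variable name illustrative):

    import os
    from dotenv import load_dotenv

    # Existing environment variables are not overridden by default,
    # so shell > .env file > hard-coded fallback below.
    load_dotenv("config/app_config.env")

    print(os.environ.get("SESSION_OUTPUT_FOLDER", "False"))
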
41
 
42
+ # Report logging to console?
43
+ LOGGING = get_or_create_env_var('LOGGING', 'False')
44
+
45
+ if LOGGING == 'True':
46
+ # Configure logging
47
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
48
 
49
  ###
50
  # AWS CONFIG
51
  ###
52
 
53
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'config/aws_config.env'
54
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', 'config/aws_config.env')
55
 
56
+ if AWS_CONFIG_PATH:
57
+ if os.path.exists(AWS_CONFIG_PATH):
58
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
59
+ load_dotenv(AWS_CONFIG_PATH)
60
+ else:
61
+ print("AWS config file not found at location:", AWS_CONFIG_PATH)
62
 
63
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
64
 
65
+ AWS_REGION = get_or_create_env_var('AWS_REGION', '')
66
 
67
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
68
 
 
78
 
79
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
80
 
81
+ SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
82
+
83
+ TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
84
+
85
+ TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
86
+
87
+ TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
88
+
89
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
90
+
91
+ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
92
+
93
+ TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
94
+
95
  # Custom headers e.g. if routing traffic through Cloudfront
96
  # Retrieving or setting CUSTOM_HEADER
97
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
98
+ #if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
99
 
100
  # Retrieving or setting CUSTOM_HEADER_VALUE
101
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
102
+ #if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
103
 
104
  ###
105
  # Images config
 
111
  ###
112
  # File I/O config
113
  ###
114
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
 
115
 
116
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
117
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
 
125
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
126
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
127
 
128
+ # By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to minimise the chance of log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, the log folders do not need to be so specific.
129
+ # Another way to address this issue would be to write logs to another type of storage, e.g. a database such as DynamoDB. I may look into this in future.
130
 
131
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + HOST_NAME + '/')
132
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + today_rev + '/' + HOST_NAME + '/')
133
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + today_rev + '/' + HOST_NAME + '/')
134
 
135
+ # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
136
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
137
 
138
  ###
 
142
 
143
  POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
144
 
 
145
 
146
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
147
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
 
180
 
181
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
182
 
183
+ SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
184
 
185
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
186
 
187
+ DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
188
+
189
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
190
 
191
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
192
 
193
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
194
 
195
+ if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
 
tools/file_conversion.py CHANGED
@@ -181,7 +181,7 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
- print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
@@ -208,7 +208,7 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
- print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
@@ -417,12 +417,29 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
- # Create a page_sizes_object.
421
- # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
422
  if image_sizes_width and image_sizes_height:
423
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
424
- else:
425
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":pd.NA, "image_height":pd.NA, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
426
 
427
  page_sizes.append(out_page_image_sizes)
428
 
@@ -434,7 +451,7 @@ def prepare_image_or_pdf(
434
  latest_file_completed: int = 0,
435
  out_message: List[str] = [],
436
  first_loop_state: bool = False,
437
- number_of_pages:int = 1,
438
  all_annotations_object:List = [],
439
  prepare_for_review:bool = False,
440
  in_fully_redacted_list:List[int]=[],
@@ -481,6 +498,9 @@ def prepare_image_or_pdf(
481
  all_img_details = []
482
  review_file_csv = pd.DataFrame()
483
  all_line_level_ocr_results_df = pd.DataFrame()
484
 
485
  if isinstance(in_fully_redacted_list, pd.DataFrame):
486
  if not in_fully_redacted_list.empty:
@@ -494,7 +514,7 @@ def prepare_image_or_pdf(
494
  else:
495
  print("Now redacting file", str(latest_file_completed))
496
 
497
- # If out message or converted_file_paths are blank, change to a list so it can be appended to
498
  if isinstance(out_message, str): out_message = [out_message]
499
 
500
  if not file_paths: file_paths = []
@@ -521,15 +541,9 @@ def prepare_image_or_pdf(
521
  file_paths_list = [file_paths]
522
  file_paths_loop = file_paths_list
523
  else:
524
- if prepare_for_review == False:
525
- file_paths_list = file_paths
526
- file_paths_loop = [file_paths_list[int(latest_file_completed)]]
527
- else:
528
- file_paths_list = file_paths
529
- file_paths_loop = file_paths
530
- # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
531
- file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
532
-
533
  # Loop through files to load in
534
  for file in file_paths_loop:
535
  converted_file_path = []
@@ -592,7 +606,6 @@ def prepare_image_or_pdf(
592
 
593
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
594
 
595
-
596
  # Create a page_sizes_object
597
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
598
 
@@ -612,7 +625,8 @@ def prepare_image_or_pdf(
612
  json_from_csv = False
613
 
614
  # NEW IF STATEMENT
615
- # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
 
616
  if (file_extension in ['.json']) | (json_from_csv == True):
617
 
618
  if (file_extension in ['.json']) & (prepare_for_review == True):
@@ -624,9 +638,14 @@ def prepare_image_or_pdf(
624
  all_annotations_object = json.loads(file_path) # Use loads for string content
625
 
626
  # Assume it's a textract json
627
- elif (file_extension == '.json') and (prepare_for_review is not True):
 
628
  # Copy it to the output folder so it can be used later.
629
- out_textract_path = os.path.join(output_folder, file_path_without_ext + "_textract.json")
630
 
631
  # Use shutil to copy the file directly
632
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
@@ -748,11 +767,11 @@ def prepare_image_or_pdf(
748
  print(out_time)
749
 
750
  out_message.append(out_time)
751
- out_message_out = '\n'.join(out_message)
752
 
753
- number_of_pages = len(image_file_paths)
754
 
755
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
756
 
757
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
758
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
+ #print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
+ # print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
 
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
+ # Create a page_sizes_object. If images have been created, the image width and height come from those images. Otherwise, they are set to the cropbox size
421
+ out_page_image_sizes = {
422
+ "page":reported_page_no,
423
+ "mediabox_width":pymupdf_page.mediabox.width,
424
+ "mediabox_height": pymupdf_page.mediabox.height,
425
+ "cropbox_width":pymupdf_page.cropbox.width,
426
+ "cropbox_height":pymupdf_page.cropbox.height,
427
+ "original_cropbox":original_cropboxes[-1],
428
+ "image_path":image_file_paths[page_no]}
429
+
430
+ # cropbox_x_offset: Distance from MediaBox left edge to CropBox left edge
431
+ # This is simply the difference in their x0 coordinates.
432
+ out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
433
+
434
+ # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
435
+ # MediaBox top y = mediabox.y1
436
+ # CropBox top y = cropbox.y1
437
+ # The difference is mediabox.y1 - cropbox.y1
438
+ out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
439
+
440
  if image_sizes_width and image_sizes_height:
441
+ out_page_image_sizes["image_width"] = image_sizes_width[page_no]
442
+ out_page_image_sizes["image_height"] = image_sizes_height[page_no]
 
443
 
444
  page_sizes.append(out_page_image_sizes)
445
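
A worked example of the cropbox offset arithmetic above, using hypothetical box values:

# MediaBox = (x0=0, y0=0, x1=612, y1=792); CropBox = (x0=10, y0=20, x1=602, y1=782)
cropbox_x_offset = 10 - 0              # cropbox.x0 - mediabox.x0 -> 10
cropbox_y_offset_from_top = 792 - 782  # mediabox.y1 - cropbox.y1 -> 10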
 
 
451
  latest_file_completed: int = 0,
452
  out_message: List[str] = [],
453
  first_loop_state: bool = False,
454
+ number_of_pages:int = 0,
455
  all_annotations_object:List = [],
456
  prepare_for_review:bool = False,
457
  in_fully_redacted_list:List[int]=[],
 
498
  all_img_details = []
499
  review_file_csv = pd.DataFrame()
500
  all_line_level_ocr_results_df = pd.DataFrame()
501
+ out_textract_path = ""
502
+ combined_out_message = ""
503
+ final_out_message = ""
504
 
505
  if isinstance(in_fully_redacted_list, pd.DataFrame):
506
  if not in_fully_redacted_list.empty:
 
514
  else:
515
  print("Now redacting file", str(latest_file_completed))
516
 
517
+ # If out_message or converted_file_paths are single strings, wrap them in lists so that they can be appended to
518
  if isinstance(out_message, str): out_message = [out_message]
519
 
520
  if not file_paths: file_paths = []
 
541
  file_paths_list = [file_paths]
542
  file_paths_loop = file_paths_list
543
  else:
544
+ file_paths_list = file_paths
545
+ file_paths_loop = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
546
+
547
  # Loop through files to load in
548
  for file in file_paths_loop:
549
  converted_file_path = []
 
606
 
607
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
608
 
 
609
  # Create a page_sizes_object
610
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
611
 
 
625
  json_from_csv = False
626
 
627
  # NEW IF STATEMENT
628
+ # If the file name ends with .json, check whether we are loading for review. If so, assume it is an annotations object and overwrite the current annotations object. If not, assume it is a Textract output file and load it in as such
629
+
630
  if (file_extension in ['.json']) | (json_from_csv == True):
631
 
632
  if (file_extension in ['.json']) & (prepare_for_review == True):
 
638
  all_annotations_object = json.loads(file_path) # Use loads for string content
639
 
640
  # Assume it's a textract json
641
+ elif (file_extension in ['.json']) and (prepare_for_review != True):
642
+ print("Saving Textract output")
643
  # Copy it to the output folder so it can be used later.
644
+ output_textract_json_file_name = file_path_without_ext
645
+ if not file_path.endswith("_textract.json"): output_textract_json_file_name = file_path_without_ext + "_textract.json"
646
+ else: output_textract_json_file_name = file_path_without_ext + ".json"
647
+
648
+ out_textract_path = os.path.join(output_folder, output_textract_json_file_name)
649
 
650
  # Use shutil to copy the file directly
651
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
 
767
  print(out_time)
768
 
769
  out_message.append(out_time)
770
+ combined_out_message = '\n'.join(out_message)
771
 
772
+ number_of_pages = len(page_sizes)#len(image_file_paths)
773
 
774
+ return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
775
 
776
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
777
  file_path_without_ext = get_file_name_without_type(in_file_path)
tools/file_redaction.py CHANGED
@@ -205,7 +205,7 @@ def choose_and_run_redactor(file_paths:List[str],
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
- else: number_of_files = len(file_paths)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
@@ -764,28 +764,66 @@ def move_page_info(file_path: str) -> str:
764
 
765
  return new_file_path
766
 
767
- def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image):
768
  '''
769
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770
  '''
771
 
772
  img_annotation_box = {}
773
 
774
  if image:
775
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
776
  else:
777
- pymupdf_x1 = annot.left
778
- pymupdf_x2 = annot.left + annot.width
779
- pymupdf_y1 = annot.top
780
- pymupdf_y2 = annot.top + annot.height
781
-
782
- x1 = pymupdf_x1
783
- x2 = pymupdf_x2
784
-
785
- img_annotation_box["xmin"] = annot.left
786
- img_annotation_box["ymin"] = annot.top
787
- img_annotation_box["xmax"] = annot.left + annot.width
788
- img_annotation_box["ymax"] = annot.top + annot.height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  img_annotation_box["color"] = (0,0,0)
790
  try:
791
  img_annotation_box["label"] = str(annot.entity_type)
@@ -795,12 +833,11 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
795
  if hasattr(annot, 'text') and annot.text:
796
  img_annotation_box["text"] = str(annot.text)
797
  else:
798
- img_annotation_box["text"] = ""
799
-
800
- rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
801
 
802
  return img_annotation_box, rect
803
 
 
804
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
805
  '''
806
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -951,8 +988,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
951
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
952
 
953
  # Else should be CustomImageRecognizerResult
954
- elif isinstance(annot, CustomImageRecognizerResult):
955
- img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image)
 
956
 
957
  # Else it should be a pikepdf annotation object
958
  else:
@@ -1211,7 +1249,7 @@ def redact_image_pdf(file_path:str,
1211
  # If running Textract, check if file already exists. If it does, load in existing data
1212
  if text_extraction_method == textract_option:
1213
  textract_json_file_path = output_folder + file_name + "_textract.json"
1214
- textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
1215
  original_textract_data = textract_data.copy()
1216
 
1217
  ###
 
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
+ else: number_of_files = len(file_paths_list)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
 
764
 
765
  return new_file_path
766
 
767
+ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image, page_sizes_df:pd.DataFrame):
768
  '''
769
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770
  '''
771
 
772
  img_annotation_box = {}
773
 
774
+ # For efficient lookup, set 'page' as index if it's not already
775
+ if 'page' in page_sizes_df.columns:
776
+ page_sizes_df = page_sizes_df.set_index('page')
777
+ # PyMuPDF page numbers are 0-based, DataFrame index assumed 1-based
778
+ page_num_one_based = page.number + 1
779
+
780
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = 0, 0, 0, 0 # Initialize defaults
781
+
782
+
783
  if image:
784
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
785
+
786
  else:
787
+ # --- Calculate coordinates when no image is present ---
788
+ # Assumes annot coords are normalized relative to MediaBox (top-left origin)
789
+ try:
790
+ # 1. Get MediaBox dimensions from the DataFrame
791
+ page_info = page_sizes_df.loc[page_num_one_based]
792
+ mb_width = page_info['mediabox_width']
793
+ mb_height = page_info['mediabox_height']
794
+ x_offset = page_info['cropbox_x_offset']
795
+ y_offset = page_info['cropbox_y_offset_from_top']
796
+
797
+
798
+ # Check for invalid dimensions
799
+ if mb_width <= 0 or mb_height <= 0:
800
+ print(f"Warning: Invalid MediaBox dimensions ({mb_width}x{mb_height}) for page {page_num_one_based}. Setting coords to 0.")
801
+ else:
802
+ pymupdf_x1 = annot.left - x_offset
803
+ pymupdf_x2 = annot.left + annot.width - x_offset
804
+ pymupdf_y1 = annot.top - y_offset
805
+ pymupdf_y2 = annot.top + annot.height - y_offset
806
+
807
+ except KeyError:
808
+ print(f"Warning: Page number {page_num_one_based} not found in page_sizes_df. Cannot get MediaBox dimensions. Setting coords to 0.")
809
+ except AttributeError as e:
810
+ print(f"Error accessing attributes ('left', 'top', etc.) on 'annot' object for page {page_num_one_based}: {e}")
811
+ except Exception as e:
812
+ print(f"Error during coordinate calculation for page {page_num_one_based}: {e}")
813
+
814
+ rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
815
+
816
+ # Now creating image annotation object
817
+ image_x1 = annot.left
818
+ image_x2 = annot.left + annot.width
819
+ image_y1 = annot.top
820
+ image_y2 = annot.top + annot.height
821
+
822
+ # Create image annotation boxes
823
+ img_annotation_box["xmin"] = image_x1
824
+ img_annotation_box["ymin"] = image_y1
825
+ img_annotation_box["xmax"] = image_x2 # annot.left + annot.width
826
+ img_annotation_box["ymax"] = image_y2 # annot.top + annot.height
827
  img_annotation_box["color"] = (0,0,0)
828
  try:
829
  img_annotation_box["label"] = str(annot.entity_type)
 
833
  if hasattr(annot, 'text') and annot.text:
834
  img_annotation_box["text"] = str(annot.text)
835
  else:
836
+ img_annotation_box["text"] = ""
 
 
837
 
838
  return img_annotation_box, rect
839
 
840
+
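
To illustrate the no-image branch above with hypothetical numbers: an annotation at left=100, top=50, width=200, height=20 on a page whose cropbox offsets are both 10 maps to PyMuPDF rect coordinates as follows:

# Hypothetical values mirroring the offset subtraction above
annot_left, annot_top, annot_width, annot_height = 100, 50, 200, 20
x_offset, y_offset = 10, 10
rect_coords = (annot_left - x_offset, annot_top - y_offset,
               annot_left + annot_width - x_offset, annot_top + annot_height - y_offset)
# -> (90, 40, 290, 60)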
841
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
842
  '''
843
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
 
988
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
989
 
990
  # Else should be CustomImageRecognizerResult
991
+ elif isinstance(annot, CustomImageRecognizerResult):
992
+ #print("annot is a CustomImageRecognizerResult")
993
+ img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image, page_sizes_df)
994
 
995
  # Else it should be a pikepdf annotation object
996
  else:
 
1249
  # If running Textract, check if file already exists. If it does, load in existing data
1250
  if text_extraction_method == textract_option:
1251
  textract_json_file_path = output_folder + file_name + "_textract.json"
1252
+ textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1253
  original_textract_data = textract_data.copy()
1254
 
1255
  ###
tools/helper_functions.py CHANGED
@@ -9,7 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
@@ -31,7 +31,7 @@ def reset_state_vars():
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
@@ -44,23 +44,54 @@ def load_in_default_allow_list(allow_list_file_path):
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
- def load_in_default_cost_codes(cost_codes_path:str):
48
  cost_codes_df = pd.read_csv(cost_codes_path)
49
-
50
- dropdown_choices = cost_codes_df.iloc[:,0].to_list()
51
- dropdown_choices.insert(0, "")
52
-
53
-
54
- out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
 
56
  return cost_codes_df, cost_codes_df, out_dropdown
57
 
58
- def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59
  if enforce_cost_code_textbox == "True":
60
  if not cost_code_choice:
61
  raise Exception("Please choose a cost code before continuing")
 
 
 
 
 
 
 
 
 
62
  return
63
 
64
  def update_dataframe(df:pd.DataFrame):
65
  df_copy = df.copy()
66
  return df_copy
@@ -271,7 +302,14 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
271
 
272
  return output_files
273
 
274
- async def get_connection_params(request: gr.Request, output_folder_textbox:str=OUTPUT_FOLDER, input_folder_textbox:str=INPUT_FOLDER, session_output_folder:str=SESSION_OUTPUT_FOLDER):
275
 
276
  #print("Session hash:", request.session_hash)
277
 
@@ -323,6 +361,13 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
323
  if session_output_folder == 'True':
324
  output_folder = output_folder_textbox + out_session_hash + "/"
325
  input_folder = input_folder_textbox + out_session_hash + "/"
326
  else:
327
  output_folder = output_folder_textbox
328
  input_folder = input_folder_textbox
@@ -330,8 +375,7 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
330
  if not os.path.exists(output_folder): os.mkdir(output_folder)
331
  if not os.path.exists(input_folder): os.mkdir(input_folder)
332
 
333
-
334
- return out_session_hash, output_folder, out_session_hash, input_folder
335
 
336
  def clean_unicode_text(text:str):
337
  # Step 1: Normalise unicode characters to decompose any special forms
@@ -374,6 +418,8 @@ def calculate_aws_costs(number_of_pages:str,
374
  pii_identification_method:str,
375
  textract_output_found_checkbox:bool,
376
  only_extract_text_radio:bool,
 
 
377
  textract_page_cost:float=1.5/1000,
378
  textract_signature_cost:float=2.0/1000,
379
  comprehend_unit_cost:float=0.0001,
@@ -391,6 +437,8 @@ def calculate_aws_costs(number_of_pages:str,
391
  - pii_identification_method_drop: The method of personally-identifiable information removal.
392
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
 
 
394
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
@@ -419,6 +467,9 @@ def calculate_aws_costs(number_of_pages:str,
419
 
420
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
 
422
  return calculated_aws_cost
423
 
424
  def calculate_time_taken(number_of_pages:str,
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
 
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
+ def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
48
+ '''
49
+ Load in the cost codes list from file.
50
+ '''
51
  cost_codes_df = pd.read_csv(cost_codes_path)
52
+ dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
53
+
54
+ # Avoid inserting duplicate or empty cost code values
55
+ if default_cost_code and default_cost_code not in dropdown_choices:
56
+ dropdown_choices.insert(0, default_cost_code)
57
+
58
+ # Always have a blank option at the top
59
+ if "" not in dropdown_choices:
60
+ dropdown_choices.insert(0, "")
61
+
62
+ out_dropdown = gr.Dropdown(
63
+ value=default_cost_code if default_cost_code in dropdown_choices else "",
64
+ label="Choose cost code for analysis",
65
+ choices=dropdown_choices,
66
+ allow_custom_value=False
67
+ )
68
 
69
  return cost_codes_df, cost_codes_df, out_dropdown
70
 
71
+ def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost_code_df:pd.DataFrame, verify_cost_codes:bool=True):
72
+ '''
73
+ Check if the enforce cost codes variable is set to true, and then check that a cost code has been chosen. If not, raise an error. Then, check against the values in the cost code dataframe to ensure that the cost code exists.
74
+ '''
75
+
76
  if enforce_cost_code_textbox == "True":
77
  if not cost_code_choice:
78
  raise Exception("Please choose a cost code before continuing")
79
+
80
+ if verify_cost_codes == True:
81
+ if cost_code_df.empty:
82
+ raise Exception("No cost codes present in dataframe for verification")
83
+ else:
84
+ valid_cost_codes_list = list(cost_code_df.iloc[:,0].unique())
85
+
86
+ if not cost_code_choice in valid_cost_codes_list:
87
+ raise Exception("Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions.")
88
  return
89
 
90
+ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
91
+ cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :]
93
+ return cost_code_df
94
+
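
A quick sketch of how the verification path in enforce_cost_codes above behaves, using hypothetical cost codes:

import pandas as pd

cost_code_df = pd.DataFrame({"code": ["AB123", "CD456"], "description": ["Team A", "Team B"]})
enforce_cost_codes("True", "AB123", cost_code_df)    # passes silently
# enforce_cost_codes("True", "ZZ999", cost_code_df)  # raises: code not found in the list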
95
  def update_dataframe(df:pd.DataFrame):
96
  df_copy = df.copy()
97
  return df_copy
 
302
 
303
  return output_files
304
 
305
+ async def get_connection_params(request: gr.Request,
306
+ output_folder_textbox:str=OUTPUT_FOLDER,
307
+ input_folder_textbox:str=INPUT_FOLDER,
308
+ session_output_folder:str=SESSION_OUTPUT_FOLDER,
309
+ textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
310
+ textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
311
+ s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312
+ local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
 
314
  #print("Session hash:", request.session_hash)
315
 
 
361
  if session_output_folder == 'True':
362
  output_folder = output_folder_textbox + out_session_hash + "/"
363
  input_folder = input_folder_textbox + out_session_hash + "/"
364
+
365
+ textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
366
+ textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
367
+
368
+ s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
369
+ local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
370
+
371
  else:
372
  output_folder = output_folder_textbox
373
  input_folder = input_folder_textbox
 
375
  if not os.path.exists(output_folder): os.mkdir(output_folder)
376
  if not os.path.exists(input_folder): os.mkdir(input_folder)
377
 
378
+ return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
 
379
 
380
  def clean_unicode_text(text:str):
381
  # Step 1: Normalise unicode characters to decompose any special forms
 
418
  pii_identification_method:str,
419
  textract_output_found_checkbox:bool,
420
  only_extract_text_radio:bool,
421
+ convert_to_gbp:bool=True,
422
+ usd_gbp_conversion_rate:float=0.76,
423
  textract_page_cost:float=1.5/1000,
424
  textract_signature_cost:float=2.0/1000,
425
  comprehend_unit_cost:float=0.0001,
 
437
  - pii_identification_method_drop: The method of personally-identifiable information removal.
438
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
439
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
440
+ - convert_to_gbp (bool, optional): Should suggested costs be converted from USD to GBP.
441
+ - usd_gbp_conversion_rate (float, optional): Conversion rate used for USD to GBP. Last changed 14th April 2025.
442
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
443
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
444
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
 
467
 
468
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
469
 
470
+ if convert_to_gbp == True:
471
+ calculated_aws_cost *= usd_gbp_conversion_rate
472
+
473
  return calculated_aws_cost
474
 
475
  def calculate_time_taken(number_of_pages:str,
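
A worked example of the cost arithmetic in calculate_aws_costs with the defaults above (hypothetical job: 100 pages through Textract with signature extraction and no existing output):

pages = 100
cost_usd = pages * (1.5 / 1000) + pages * (2.0 / 1000)  # text extraction plus signature surcharge
cost_gbp = cost_usd * 0.76                              # default USD -> GBP conversion
# cost_usd = 0.35, cost_gbp = 0.266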
tools/redaction_review.py CHANGED
@@ -577,7 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
- print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
@@ -756,6 +756,18 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
756
 
757
  return row_value_page, row_value_df
758
 
759
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
760
 
761
  row_value_code = evt.row_value[0] # This is the value for cost code
 
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
+ #print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
 
756
 
757
  return row_value_page, row_value_df
758
 
759
+ def df_select_callback_textract_api(df: pd.DataFrame, evt: gr.SelectData):
760
+
761
+ #print("evt.data:", evt._data)
762
+
763
+ row_value_job_id = evt.row_value[0] # This is the job ID value
764
+ # row_value_label = evt.row_value[1] # This is the label number value
765
+ row_value_job_type = evt.row_value[2] # This is the job type value
766
+
767
+ row_value_df = pd.DataFrame(data={"job_id":[row_value_job_id], "label":[row_value_job_type]})
768
+
769
+ return row_value_job_id, row_value_job_type, row_value_df
770
+
771
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
772
 
773
  row_value_code = evt.row_value[0] # This is the value for cost code
tools/textract_batch_call.py CHANGED
@@ -1,22 +1,36 @@
1
  import boto3
2
  import time
3
  import os
 
4
  import json
5
  import logging
 
 
 
6
  from urllib.parse import urlparse
 
7
 
8
- # Configure logging
9
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
11
- def analyze_pdf_with_textract(
 
12
  local_pdf_path: str,
13
- s3_bucket_name: str,
14
  s3_input_prefix: str,
15
  s3_output_prefix: str,
16
- local_output_dir: str,
17
- aws_region: str = None, # Optional: specify region if not default
18
- poll_interval_seconds: int = 5,
19
- max_polling_attempts: int = 120 # ~10 minutes total wait time
20
  ):
21
  """
22
  Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
@@ -27,10 +41,12 @@ def analyze_pdf_with_textract(
27
  s3_bucket_name (str): Name of the S3 bucket to use.
28
  s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
29
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
30
- local_output_dir (str): Local directory to save the downloaded JSON results.
31
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
32
- poll_interval_seconds (int): Seconds to wait between polling Textract status.
33
- max_polling_attempts (int): Maximum number of times to poll Textract status.
34
 
35
  Returns:
36
  str: Path to the downloaded local JSON output file, or None if failed.
@@ -41,12 +57,21 @@ def analyze_pdf_with_textract(
41
  Exception: For other AWS errors or job failures.
42
  """
43
 
44
  if not os.path.exists(local_pdf_path):
45
- raise FileNotFoundError(f"Input PDF not found: {local_pdf_path}")
46
 
47
  if not os.path.exists(local_output_dir):
48
  os.makedirs(local_output_dir)
49
- logging.info(f"Created local output directory: {local_output_dir}")
 
 
50
 
51
  # Initialize boto3 clients
52
  session = boto3.Session(region_name=aws_region)
@@ -57,216 +82,407 @@ def analyze_pdf_with_textract(
57
  pdf_filename = os.path.basename(local_pdf_path)
58
  s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
59
 
60
- logging.info(f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'...")
 
 
61
  try:
62
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
63
- logging.info("Upload successful.")
 
 
64
  except Exception as e:
65
- logging.error(f"Failed to upload PDF to S3: {e}")
 
 
66
  raise
67
 
68
  # --- 2. Start Textract Document Analysis ---
69
- logging.info("Starting Textract document analysis job...")
 
 
 
70
  try:
71
- response = textract_client.start_document_analysis(
72
- DocumentLocation={
73
- 'S3Object': {
74
- 'Bucket': s3_bucket_name,
75
- 'Name': s3_input_key
76
  }
77
- },
78
- FeatureTypes=['SIGNATURES', 'FORMS', 'TABLES'], # Analyze for signatures, forms, and tables
79
- OutputConfig={
80
- 'S3Bucket': s3_bucket_name,
81
- 'S3Prefix': s3_output_prefix
82
- }
83
- # Optional: Add NotificationChannel for SNS topic notifications
84
- # NotificationChannel={
85
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
86
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
87
- # }
88
- )
89
- job_id = response['JobId']
90
- logging.info(f"Textract job started with JobId: {job_id}")
91
-
92
- except Exception as e:
93
- logging.error(f"Failed to start Textract job: {e}")
94
- raise
95
-
96
- # --- 3. Poll for Job Completion ---
97
- job_status = 'IN_PROGRESS'
98
- attempts = 0
99
- logging.info("Polling Textract for job completion status...")
100
 
101
- while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
102
- attempts += 1
103
- try:
104
- response = textract_client.get_document_analysis(JobId=job_id)
105
- job_status = response['JobStatus']
106
- logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
107
-
108
- if job_status == 'IN_PROGRESS':
109
- time.sleep(poll_interval_seconds)
110
- elif job_status == 'SUCCEEDED':
111
- logging.info("Textract job succeeded.")
112
- break
113
- elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
114
- status_message = response.get('StatusMessage', 'No status message provided.')
115
- warnings = response.get('Warnings', [])
116
- logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
117
- if warnings:
118
- logging.warning(f"Warnings: {warnings}")
119
- # Decide if PARTIAL_SUCCESS should proceed or raise error
120
- # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
121
- raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
122
- else:
123
- # Should not happen based on documentation, but handle defensively
124
- raise Exception(f"Unexpected Textract job status: {job_status}")
125
 
126
- except textract_client.exceptions.InvalidJobIdException:
127
- logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
128
- raise
129
- except Exception as e:
130
- logging.error(f"Error while polling Textract status for job {job_id}: {e}")
131
- raise
132
 
133
- if job_status != 'SUCCEEDED':
134
- raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
135
 
136
- # --- 4. Download Output JSON from S3 ---
137
- # Textract typically creates output under s3_output_prefix/job_id/
138
- # There might be multiple JSON files if pagination occurred during writing.
139
- # Usually, for smaller docs, there's one file, often named '1'.
140
- # For robust handling, list objects and find the JSON(s).
141
 
142
- s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
143
- logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
 
 
 
144
 
145
- downloaded_file_path = None
146
- try:
147
  list_response = s3_client.list_objects_v2(
148
  Bucket=s3_bucket_name,
149
  Prefix=s3_output_key_prefix
150
  )
151
-
152
  output_files = list_response.get('Contents', [])
153
- if not output_files:
154
- # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
155
- logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
156
- time.sleep(5)
157
- list_response = s3_client.list_objects_v2(
158
- Bucket=s3_bucket_name,
159
- Prefix=s3_output_key_prefix
160
- )
161
- output_files = list_response.get('Contents', [])
162
-
163
- if not output_files:
164
- logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
165
- # You could alternatively try getting results via get_document_analysis pagination here
166
- # but sticking to the request to download from S3 output path.
167
- raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
168
-
169
- # Usually, we only need the first/main JSON output file(s)
170
- # For simplicity, download the first one found. A more complex scenario might merge multiple files.
171
- # Filter out potential directory markers if any key ends with '/'
172
- json_files_to_download = [f for f in output_files if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')]
173
-
174
- if not json_files_to_download:
175
- logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
176
- raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
177
-
178
- # Let's download the first JSON found. Often it's the only one or the main one.
179
- s3_output_key = json_files_to_download[0]['Key']
180
- output_filename_base = os.path.basename(pdf_filename).replace('.pdf', '')
181
- local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
182
- local_output_path = os.path.join(local_output_dir, local_output_filename)
183
-
184
- logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
185
- s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
186
- logging.info("Download successful.")
187
- downloaded_file_path = local_output_path
188
-
189
- # Log if multiple files were found, as user might need to handle them
190
- if len(json_files_to_download) > 1:
191
- logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
192
 
193
- except Exception as e:
194
- logging.error(f"Failed to download or process Textract output from S3: {e}")
195
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  return downloaded_file_path
198
 
199
- # --- Example Usage ---
200
- if __name__ == '__main__':
201
- # --- Configuration --- (Replace with your actual values)
202
- MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
203
- MY_S3_BUCKET = "your-textract-demo-bucket-name" # MUST BE UNIQUE GLOBALLY
204
- MY_S3_INPUT_PREFIX = "textract-inputs" # Folder in the bucket for uploads
205
- MY_S3_OUTPUT_PREFIX = "textract-outputs" # Folder in the bucket for results
206
- MY_LOCAL_OUTPUT_DIR = "./textract_results" # Local folder to save JSON
207
- MY_AWS_REGION = "us-east-1" # e.g., 'us-east-1', 'eu-west-1'
208
-
209
- # --- Create a dummy PDF for testing if you don't have one ---
210
- # Requires 'reportlab' library: pip install reportlab
211
- try:
212
- from reportlab.pdfgen import canvas
213
- from reportlab.lib.pagesizes import letter
214
- if not os.path.exists(MY_LOCAL_PDF):
215
- print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
216
- c = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
217
- c.drawString(100, 750, "This is a test document for AWS Textract.")
218
- c.drawString(100, 700, "It includes some text and a placeholder for a signature.")
219
- c.drawString(100, 650, "Signed:")
220
- # Draw a simple line/scribble for signature placeholder
221
- c.line(150, 630, 250, 645)
222
- c.line(250, 645, 300, 620)
223
- c.save()
224
- print("Dummy PDF created.")
225
- except ImportError:
226
- if not os.path.exists(MY_LOCAL_PDF):
227
- print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
228
- exit() # Exit if no PDF available for the example
229
- except Exception as e:
230
- print(f"Error creating dummy PDF: {e}")
231
- exit()
232
 
233
 
234
- # --- Run the analysis ---
235
- try:
236
- output_json_path = analyze_pdf_with_textract(
237
- local_pdf_path=MY_LOCAL_PDF,
238
- s3_bucket_name=MY_S3_BUCKET,
239
- s3_input_prefix=MY_S3_INPUT_PREFIX,
240
- s3_output_prefix=MY_S3_OUTPUT_PREFIX,
241
- local_output_dir=MY_LOCAL_OUTPUT_DIR,
242
- aws_region=MY_AWS_REGION
243
- )
244
 
245
- if output_json_path:
246
- print(f"\n--- Analysis Complete ---")
247
- print(f"Textract output JSON saved to: {output_json_path}")
248
 
249
- # Optional: Load and print some info from the JSON
250
- with open(output_json_path, 'r') as f:
251
- results = json.load(f)
252
- print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
253
- # Find signature blocks (Note: This is basic, real parsing might be more complex)
254
- signature_blocks = [block for block in results.get('Blocks', []) if block.get('BlockType') == 'SIGNATURE']
255
- print(f"Found {len(signature_blocks)} potential signature block(s).")
256
- if signature_blocks:
257
- print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")
258
 
259
 
260
- except FileNotFoundError as e:
261
- print(f"\nError: Input file not found. {e}")
262
- except Exception as e:
263
- print(f"\nAn error occurred during the process: {e}")
264
 
265
- import boto3
266
- import time
267
- import os
268
 
269
- def download_textract_output(job_id, output_bucket, output_prefix, local_folder):
270
  """
271
  Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
272
 
@@ -290,8 +506,8 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
290
  print("Job failed:", response.get("StatusMessage", "No error message provided."))
291
  return
292
  else:
293
- print(f"Job is still {status}, waiting...")
294
- time.sleep(10) # Wait before checking again
295
 
296
  # Find output ZIP file in S3
297
  output_file_key = f"{output_prefix}/{job_id}.zip"
@@ -303,6 +519,3 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
303
  print(f"Output file downloaded to: {local_file_path}")
304
  except Exception as e:
305
  print(f"Error downloading file: {e}")
306
-
307
- # Example usage:
308
- # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")
 
1
  import boto3
2
  import time
3
  import os
4
+ import pandas as pd
5
  import json
6
  import logging
7
+ import datetime
8
+ from typing import List
9
+ from io import StringIO
10
  from urllib.parse import urlparse
11
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
 
13
+ # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
14
+ # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
15
+ # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
16
+ # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
17
+ # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
18
+ # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
19
+ from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
20
+ from tools.aws_textract import json_to_ocrresult
21
 
22
+
23
+ def analyse_document_with_textract_api(
24
  local_pdf_path: str,
 
25
  s3_input_prefix: str,
26
  s3_output_prefix: str,
27
+ job_df:pd.DataFrame,
28
+ s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
29
+ local_output_dir: str = OUTPUT_FOLDER,
30
+ analyse_signatures:List[str] = [],
31
+ successful_job_number:int=0,
32
+ general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
33
+ aws_region: str = AWS_REGION # Optional: specify region if not default
34
  ):
35
  """
36
  Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
 
41
  s3_bucket_name (str): Name of the S3 bucket to use.
42
  s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
43
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
44
+ job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
45
+ general_s3_bucket_name (str, optional): S3 bucket to which the job log CSV is uploaded.
46
+ local_output_dir (str, optional): Local directory to save the downloaded JSON results.
47
+ analyse_signatures (List[str], optional): Whether to extract signatures (enabled when the list contains 'Extract signatures'). Defaults to no signature extraction.
48
+ successful_job_number (int): The number of successful jobs that have been submitted in this session.
49
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
 
 
50
 
51
  Returns:
52
  str: Path to the downloaded local JSON output file, or None if failed.
 
57
  Exception: For other AWS errors or job failures.
58
  """
59
 
60
+ # This is a variable that is written to logs to indicate that a Textract API call was made
61
+ is_a_textract_api_call = True
62
+
63
+ # Keep only latest pdf path if it's a list
64
+ if isinstance(local_pdf_path, list):
65
+ local_pdf_path = local_pdf_path[-1]
66
+
67
  if not os.path.exists(local_pdf_path):
68
+ raise FileNotFoundError(f"Input document not found {local_pdf_path}")
69
 
70
  if not os.path.exists(local_output_dir):
71
  os.makedirs(local_output_dir)
72
+ log_message = f"Created local output directory: {local_output_dir}"
73
+ print(log_message)
74
+ #logging.info(log_message)
75
 
76
  # Initialize boto3 clients
77
  session = boto3.Session(region_name=aws_region)
 
82
  pdf_filename = os.path.basename(local_pdf_path)
83
  s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
84
 
85
+ log_message = f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
86
+ print(log_message)
87
+ #logging.info(log_message)
88
  try:
89
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
90
+ log_message = "Upload successful."
91
+ print(log_message)
92
+ #logging.info(log_message)
93
  except Exception as e:
94
+ log_message = f"Failed to upload PDF to S3: {e}"
95
+ print(log_message)
96
+ #logging.error(log_message)
97
  raise
98
 
99
+ # If job_df is not empty
100
+ if not job_df.empty:
101
+ if "file_name" in job_df.columns:
102
+ matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
103
+
104
+ if len(matching_job_id_file_names) > 0:
105
+ raise Exception("Existing Textract outputs found. No need to re-analyse. Please download existing results from the list")
106
+
107
  # --- 2. Start Textract Document Analysis ---
108
+ message = "Starting Textract document analysis job..."
109
+ print(message)
110
+ #logging.info("Starting Textract document analysis job...")
111
+
112
  try:
113
+ if "Extract signatures" in analyse_signatures:
114
+ response = textract_client.start_document_analysis(
115
+ DocumentLocation={
116
+ 'S3Object': {
117
+ 'Bucket': s3_bucket_name,
118
+ 'Name': s3_input_key
119
+ }
120
+ },
121
+ FeatureTypes=['SIGNATURES'], # Analyse for signatures only
122
+ OutputConfig={
123
+ 'S3Bucket': s3_bucket_name,
124
+ 'S3Prefix': s3_output_prefix
125
  }
126
+ # Optional: Add NotificationChannel for SNS topic notifications
127
+ # NotificationChannel={
128
+ # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
129
+ # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
130
+ # }
131
+ )
132
+ job_type="document_analysis"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ else:
135
+ response = textract_client.start_document_text_detection(
136
+ DocumentLocation={
137
+ 'S3Object': {
138
+ 'Bucket': s3_bucket_name,
139
+ 'Name': s3_input_key
140
+ }
141
+ },
142
+ OutputConfig={
143
+ 'S3Bucket': s3_bucket_name,
144
+ 'S3Prefix': s3_output_prefix
145
+ }
146
+ # Optional: Add NotificationChannel for SNS topic notifications
147
+ # NotificationChannel={
148
+ # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
149
+ # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
150
+ # }
151
+ )
152
+ job_type="document_text_detection"
 
 
 
 
 
153
 
154
+ job_id = response['JobId']
155
+ print(f"Textract job started with JobId: {job_id}")
156
+ #logging.info(f"Textract job started with JobId: {job_id}")
157
+
158
+ # Record the job details in a log CSV so that outputs can be found and downloaded later
159
+ # Build the log entry for this job
160
+ log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
161
+ job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
162
+
163
+ csv_buffer = StringIO()
164
+ log_df = pd.DataFrame([{
165
+ 'job_id': job_id,
166
+ 'file_name': pdf_filename,
167
+ 'job_type': job_type,
168
+ 'signature_extraction':analyse_signatures,
169
+ 's3_location': job_location_full,
170
+ 'job_date_time': datetime.datetime.now()
171
+ }])
172
+
173
+ # File path
174
+ log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
175
+
176
+ # Check if file exists
177
+ file_exists = os.path.exists(log_file_path)
178
+
179
+ # Append to CSV if it exists, otherwise write with header
180
+ log_df.to_csv(log_file_path, mode='a', index=False, header=not file_exists)
181
+
182
+ #log_df.to_csv(csv_buffer)
183
 
184
+ # Upload the file
185
+ s3_client.upload_file(log_file_path, general_s3_bucket_name, log_csv_key_location)
186
 
187
+ # Upload to S3 (overwrite existing file)
188
+ #s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
189
+ print(f"Job ID written to {log_csv_key_location}")
190
+ #logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")
 
191
 
192
+ except Exception as e:
193
+ error = f"Failed to start Textract job: {e}"
194
+ print(error)
195
+ #logging.error(error)
196
+ raise
197
 
198
+ successful_job_number += 1
199
+
200
+ return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
201
+
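
A hypothetical call to the function above (paths and prefixes are illustrative only):

message, job_id, job_type, n_jobs, logged = analyse_document_with_textract_api(
    local_pdf_path="input/document.pdf",
    s3_input_prefix="textract-inputs",
    s3_output_prefix="textract-outputs",
    job_df=pd.DataFrame(),
    analyse_signatures=["Extract signatures"],
)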
202
+ def return_job_status(job_id:str,
203
+ response:dict,
204
+ attempts:int,
205
+ poll_interval_seconds: int = 5,
206
+ max_polling_attempts: int = 1 # Number of times to poll the job status before returning
207
+ ):
208
+ job_status = response['JobStatus']
209
+ logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
210
+
211
+ if job_status == 'IN_PROGRESS':
212
+ time.sleep(poll_interval_seconds)
213
+ elif job_status == 'SUCCEEDED':
214
+ logging.info("Textract job succeeded.")
215
+ elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
216
+ status_message = response.get('StatusMessage', 'No status message provided.')
217
+ warnings = response.get('Warnings', [])
218
+ logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
219
+ if warnings:
220
+ logging.warning(f"Warnings: {warnings}")
221
+ # Decide if PARTIAL_SUCCESS should proceed or raise error
222
+ # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
223
+ raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
224
+ else:
225
+ # Should not happen based on documentation, but handle defensively
226
+ raise Exception(f"Unexpected Textract job status: {job_status}")
227
+
228
+ return job_status
229
+
230
+ def download_textract_job_files(s3_client,
231
+ s3_bucket_name:str,
232
+ s3_output_key_prefix:str,
233
+ pdf_filename:str,
234
+ job_id:str,
235
+ local_output_dir:str):
236
+ list_response = s3_client.list_objects_v2(
237
+ Bucket=s3_bucket_name,
238
+ Prefix=s3_output_key_prefix
239
+ )
240
+
241
+ output_files = list_response.get('Contents', [])
242
+ if not output_files:
243
+ # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
244
+ #logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
245
+ #time.sleep(5)
246
  list_response = s3_client.list_objects_v2(
247
  Bucket=s3_bucket_name,
248
  Prefix=s3_output_key_prefix
249
  )
 
250
  output_files = list_response.get('Contents', [])
251
 
252
+ if not output_files:
253
+ logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
254
+ # You could alternatively try getting results via get_document_analysis pagination here
255
+ # but sticking to the request to download from S3 output path.
256
+ raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
257
+
258
+ # Usually, we only need the first/main JSON output file(s)
259
+ # For simplicity, download the first one found. A more complex scenario might merge multiple files.
260
+ # Filter out potential directory markers if any key ends with '/'
261
+ json_files_to_download = [
262
+ f for f in output_files
263
+ if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/') and 'access_check' not in f['Key']
264
+ ]
265
+
266
+ #print("json_files_to_download:", json_files_to_download)
267
+
268
+ if not json_files_to_download:
269
+ error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
270
+ print(error)
271
+ #logging.error(error)
272
+ raise FileNotFoundError(error)
273
+
274
+ combined_blocks = []
275
+
276
+ for f in sorted(json_files_to_download, key=lambda x: x['Key']): # Optional: sort to ensure consistent order
277
+ obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f['Key'])
278
+ data = json.loads(obj['Body'].read())
279
+
280
+ # Assuming Textract-style output with a "Blocks" key
281
+ if "Blocks" in data:
282
+ combined_blocks.extend(data["Blocks"])
283
+ else:
284
+ logging.warning(f"No 'Blocks' key in file: {f['Key']}")
285
+
286
+ # Build final combined JSON structure
287
+ combined_output = {
288
+ "DocumentMetadata": {
289
+ "Pages": len(set(block.get('Page', 1) for block in combined_blocks))
290
+ },
291
+ "Blocks": combined_blocks,
292
+ "JobStatus": "SUCCEEDED"
293
+ }
294
+
295
+ output_filename_base = os.path.basename(pdf_filename)
296
+ output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
297
+ local_output_filename = f"{output_filename_base_no_ext}_textract.json"
298
+ local_output_path = os.path.join(local_output_dir, local_output_filename)
299
+
300
+ with open(local_output_path, 'w') as f:
301
+ json.dump(combined_output, f)
302
+
303
+ print(f"Combined Textract output written to {local_output_path}")
304
+
305
+ # logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
306
+ # s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
307
+ # logging.info("Download successful.")
308
+ downloaded_file_path = local_output_path
309
+
310
+ # Log if multiple files were found, as user might need to handle them
311
+ #if len(json_files_to_download) > 1:
312
+ # logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
313
 
314
  return downloaded_file_path
315
 
316
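The comments above note that results could instead be fetched straight from the Textract API rather than the S3 output location. A rough sketch of that alternative, using the NextToken/MaxResults pagination that GetDocumentAnalysis supports (the function name is illustrative):

    def get_blocks_via_api(textract_client, job_id: str) -> list:
        # Page through GetDocumentAnalysis results instead of reading the S3 output files
        blocks, next_token = [], None
        while True:
            kwargs = {"JobId": job_id, "MaxResults": 1000}
            if next_token:
                kwargs["NextToken"] = next_token
            response = textract_client.get_document_analysis(**kwargs)
            blocks.extend(response.get("Blocks", []))
            next_token = response.get("NextToken")
            if not next_token:
                return blocks
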
+def check_for_provided_job_id(job_id:str):
+    if not job_id:
+        raise Exception("Please provide a job ID.")
+    return
+
+def poll_bulk_textract_analysis_progress_and_download(
+    job_id:str,
+    job_type_dropdown:str,
+    s3_output_prefix: str,
+    pdf_filename:str,
+    job_df:pd.DataFrame,
+    s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
+    local_output_dir: str = OUTPUT_FOLDER,
+    load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
+    load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
+    aws_region: str = AWS_REGION, # Optional: specify region if not default
+    poll_interval_seconds: int = 1,
+    max_polling_attempts: int = 1 # one status check per call; repeated calls drive the polling
+    ):
+
+    if job_id:
+        # Initialize boto3 clients
+        session = boto3.Session(region_name=aws_region)
+        s3_client = session.client('s3')
+        textract_client = session.client('textract')
+
+        # --- 3. Poll for Job Completion ---
+        job_status = 'IN_PROGRESS'
+        attempts = 0
+
+        message = "Polling Textract for job completion status..."
+        print(message)
+        #logging.info(message)
+
+        # Update Textract document history df
+        try:
+            job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                                  load_s3_jobs_loc=load_s3_jobs_loc,
+                                                  load_local_jobs_loc=load_local_jobs_loc)
+        except Exception as e:
+            #logging.error(f"Failed to update job details dataframe: {e}")
+            print(f"Failed to update job details dataframe: {e}")
+            #raise
+
+        while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
+            attempts += 1
+            try:
+                if job_type_dropdown=="document_analysis":
+                    response = textract_client.get_document_analysis(JobId=job_id)
+                    job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                elif job_type_dropdown=="document_text_detection":
+                    response = textract_client.get_document_text_detection(JobId=job_id)
+                    job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                else:
+                    error = f"Unknown job type '{job_type_dropdown}', cannot poll job."
+                    print(error)
+                    #logging.error(error)
+                    raise Exception(error)
+
+            except textract_client.exceptions.InvalidJobIdException:
+                error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed."
+                print(error_message)
+                logging.error(error_message)
+                raise
+            except Exception as e:
+                error_message = f"Error while polling Textract status for job {job_id}: {e}"
+                print(error_message)
+                logging.error(error_message)
+                raise
+
+        downloaded_file_path = None
+        if job_status == 'SUCCEEDED':
+            # 3b - Replace the PDF file name with the one recorded for this job ID in the job dataframe, if present
+            if not job_df.empty:
+                if "file_name" in job_df.columns:
+                    matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
+
+                    if pdf_filename and not matching_job_id_file_names.empty:
+                        if pdf_filename == matching_job_id_file_names.iloc[0]:
+                            raise Exception("Existing Textract outputs found. No need to re-download.")
+
+                    if not matching_job_id_file_names.empty:
+                        pdf_filename = matching_job_id_file_names.iloc[0]
+                    else:
+                        pdf_filename = "unknown_file"
+
+            # --- 4. Download Output JSON from S3 ---
+            # Textract typically creates output under s3_output_prefix/job_id/
+            # There might be multiple JSON files if pagination occurred during writing.
+            # Usually, for smaller docs, there's one file, often named '1'.
+            # For robust handling, list objects and find the JSON(s).
+
+            s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
+            logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
+
+            try:
+                downloaded_file_path = download_textract_job_files(s3_client,
+                                                                   s3_bucket_name,
+                                                                   s3_output_key_prefix,
+                                                                   pdf_filename,
+                                                                   job_id,
+                                                                   local_output_dir)
+            except Exception as e:
+                #logging.error(f"Failed to download or process Textract output from S3: {e}")
+                print(f"Failed to download or process Textract output from S3: {e}")
+                raise
+
+    else:
+        raise Exception("No Job ID provided.")
+
+    return downloaded_file_path, job_status, job_df
+
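Since max_polling_attempts defaults to 1, each call to this function performs a single status check; repeated calls (e.g. from a button click in the app) drive the polling. A sketch of a standalone driver, with an illustrative job ID and S3 prefix:

    import pandas as pd

    downloaded_path, status, job_df = poll_bulk_textract_analysis_progress_and_download(
        job_id="0123456789abcdef",              # hypothetical job ID
        job_type_dropdown="document_analysis",
        s3_output_prefix="textract/output",     # hypothetical prefix
        pdf_filename="example.pdf",
        job_df=pd.DataFrame(),
    )
    if status == "SUCCEEDED":
        print(f"Combined Textract JSON saved to {downloaded_path}")
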
+def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                 load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
+                                 load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
+                                 document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
+                                 aws_region:str=AWS_REGION):
+
+    job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
+
+    # Initialize boto3 S3 client
+    session = boto3.Session(region_name=aws_region)
+    s3_client = session.client('s3')
+
+    local_output_path = f'{load_local_jobs_loc}/textract_job_log_files.csv'
+
+    if load_s3_jobs == 'True':
+
+        s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
+
+        try:
+            s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
+            print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
+            s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
+            print("Download successful.")
+        except ClientError as e:
+            if e.response['Error']['Code'] == '404':
+                print("Log file does not exist in S3.")
+            else:
+                print(f"Unexpected error occurred: {e}")
+        except (NoCredentialsError, PartialCredentialsError, TokenRetrievalError) as e:
+            print(f"AWS credential issue encountered: {e}")
+            print("Skipping S3 log file download.")
+
+    # If the log file exists locally, load it in
+    if os.path.exists(local_output_path):
+        print("Found log file in local path")
+        job_df = pd.read_csv(local_output_path)
+
+        if "job_date_time" in job_df.columns:
+            job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
+            # Keep only jobs from the last 7 days, since Textract job results expire after that
+            cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
+            job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time, :]
+
+    return job_df
+
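A sketch of loading the job log locally only (skipping the S3 download); the folder is illustrative, and the returned frame is already filtered to the last seven days:

    job_df = load_in_textract_job_details(load_s3_jobs="False",
                                          load_local_jobs_loc="output/textract_jobs")  # hypothetical folder
    if not job_df.empty:
        print(job_df[["job_id", "file_name", "job_date_time"]].head())
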
+def download_textract_output(job_id:str,
+                             output_bucket:str,
+                             output_prefix:str,
+                             local_folder:str):
     """
     Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
 
         print("Job failed:", response.get("StatusMessage", "No error message provided."))
         return
     else:
+        print(f"Job is still {status}.")
+        #time.sleep(10) # Wait before checking again
 
     # Find output ZIP file in S3
     output_file_key = f"{output_prefix}/{job_id}.zip"
 
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
         print(f"Error downloading file: {e}")
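A sketch of calling the ZIP-based downloader above with illustrative values:

    download_textract_output(job_id="0123456789abcdef",      # hypothetical job ID
                             output_bucket="my-textract-bucket",
                             output_prefix="textract-output",
                             local_folder="output")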