Commit ed5f8c7
Parent(s): 4276db1

Implemented Textract document API calls and associated output tracking/download. Fixes to config and cost code implementation. General minor bug fixes.
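For context, the whole-document calls added in this commit follow Textract's asynchronous API pattern: the document sits in S3, a job is started, and results are fetched later by job ID. Below is a minimal sketch of the submission side using boto3; the function name and parameters are illustrative assumptions, while the real implementations (analyse_document_with_textract_api and friends) live in tools/textract_batch_call.py and are only partially visible in this diff.

    import boto3

    def start_whole_document_job(bucket: str, input_key: str, output_prefix: str,
                                 detect_signatures: bool = True) -> str:
        """Submit a document already uploaded to S3 and return the Textract job ID."""
        textract = boto3.client("textract")
        if detect_signatures:
            # Asynchronous analysis job with signature detection (the dearer option)
            response = textract.start_document_analysis(
                DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
                FeatureTypes=["SIGNATURES"],
                OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix},
            )
        else:
            # Plain asynchronous text detection (no signature detection)
            response = textract.start_document_text_detection(
                DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
                OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix},
            )
        return response["JobId"]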
Files changed:
- Dockerfile +2 -2
- app.py +138 -80
- requirements.txt +1 -1
- tools/aws_functions.py +86 -114
- tools/aws_textract.py +57 -4
- tools/config.py +52 -24
- tools/file_conversion.py +44 -25
- tools/file_redaction.py +58 -20
- tools/helper_functions.py +64 -13
- tools/redaction_review.py +13 -1
- tools/textract_batch_call.py +410 -197
Dockerfile
CHANGED
@@ -1,5 +1,5 @@
 # Stage 1: Build dependencies and download models
-FROM public.ecr.aws/docker/library/python:3.11.
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
 
 # Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
@@ -27,7 +27,7 @@ COPY lambda_entrypoint.py .
 COPY entrypoint.sh .
 
 # Stage 2: Final runtime image
-FROM public.ecr.aws/docker/library/python:3.11.
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
 
 # Define a build argument with a default value
 ARG APP_MODE=gradio
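The Dockerfile change is small but worth noting: both build stages now use the pinned patch release python:3.11.11-slim-bookworm (the previous tag is truncated in this view), so rebuilds no longer drift as new 3.11 images are published. The APP_MODE build argument (defaulting to gradio) is unchanged and presumably still switches between the Gradio server and the Lambda entrypoint at build time, along the lines of docker build --build-arg APP_MODE=lambda .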
app.py
CHANGED
@@ -1,19 +1,21 @@
 import os
+import logging
 import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
-from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
+from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
+from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
 
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
@@ -58,14 +60,16 @@
 
 # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
 pdf_doc_state = gr.State([])
-all_image_annotations_state = gr.State([])
+all_image_annotations_state = gr.State([])
 
 
 all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
 review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
 
 session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
+host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
 s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
+session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
 output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
 input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
 
@@ -133,6 +137,7 @@
 
 clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
 prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
+prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
 prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)
 
 ## Settings page variables
@@ -149,18 +154,29 @@
 s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
 
+s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
+s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
+successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
+
+load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
+
 s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
 default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
 enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
+default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
 
 # Base tables that are not modified subsequent to load
 recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
 all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
+all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
 cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
 
 # Duplicate page detection
 in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
-duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
+duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
 
 # Tracking variables for current page (not visible)
 current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -168,7 +184,7 @@
 
 # Placeholders for elements that may be made visible later below depending on environment variables
 cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
-cost_code_choice_drop = gr.Dropdown(value=
+cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
 
 textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
 total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
@@ -177,6 +193,22 @@
 
 only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)
 
+# Textract API call placeholders in case option not selected in config
+
+job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
+send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
+
+job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
+check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
+job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
+job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
+textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
+selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
+is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
+job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+
+textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
+
 ###
 # UI DESIGN
 ###
@@ -199,32 +231,21 @@
 with gr.Accordion("Redact document", open = True):
 in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
 
-text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
-
-with gr.Row(equal_height=True):
-pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
+text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
-handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
+handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
 
-
-
-with gr.Row(equal_height=True):
-job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=True)
-send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=True)
-with gr.Row(equal_height=True):
-check_state_of_textract_api__call_btn = gr.Button("Check state of Textract job", variant="secondary", visible=True)
-job_current_status = gr.Textbox(value="", label="job_current_status", visible=True)
-with gr.Row(equal_height=True):
-textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
+with gr.Row(equal_height=True):
+pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 if SHOW_COSTS == "True":
 with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
-
-
-
-
-
+with gr.Row(equal_height=True):
+textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
+total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
+estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
+estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
 
 if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
 with gr.Accordion("Apply cost code", open = True, visible=True):
@@ -232,19 +253,32 @@
 cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
 with gr.Column():
 reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
-cost_code_choice_drop = gr.Dropdown(value=
+cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
+
+if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
+with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
+with gr.Row(equal_height=True):
+gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
+with gr.Row(equal_height=True):
+send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
+with gr.Row(equal_height=False):
+with gr.Column(scale=2):
+textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
+with gr.Column(scale=1):
+job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
+check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
+with gr.Row():
+job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
 
 gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
 document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
 
 with gr.Row():
-
+redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
 output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
 latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
 
-with gr.Row():
-convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
-
 # Feedback elements are invisible until revealed by redaction action
 pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
 pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
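The new accordion above only submits the job and reports its status; fetching results is a separate step. Below is a hedged sketch of what the "check status and download" path might do, assuming boto3 and the OutputConfig-style S3 layout Textract uses (numbered JSON parts under <prefix>/<job_id>/); the real logic is in poll_bulk_textract_analysis_progress_and_download, which this diff does not show in full.

    import boto3

    def check_job_and_download(job_id: str, bucket: str, output_prefix: str,
                               local_dir: str, job_type: str = "document_analysis") -> str:
        """Poll a Textract job once; if it has finished, download its output from S3."""
        textract = boto3.client("textract")
        if job_type == "document_analysis":
            status = textract.get_document_analysis(JobId=job_id, MaxResults=1)["JobStatus"]
        else:
            status = textract.get_document_text_detection(JobId=job_id, MaxResults=1)["JobStatus"]
        if status == "SUCCEEDED":
            s3 = boto3.client("s3")
            listing = s3.list_objects_v2(Bucket=bucket, Prefix=f"{output_prefix}/{job_id}/")
            for obj in listing.get("Contents", []):
                name = obj["Key"].rsplit("/", 1)[-1]
                if name:  # skip the folder marker object, if any
                    s3.download_file(bucket, obj["Key"], f"{local_dir}/{name}")
        return status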
@@ -263,21 +297,16 @@
 annotate_zoom_in = gr.Button("Zoom in", visible=False)
 annotate_zoom_out = gr.Button("Zoom out", visible=False)
 with gr.Row():
-clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
 
-with gr.Row(
+with gr.Row():
 with gr.Column(scale=2):
 with gr.Row(equal_height=True):
 annotation_last_page_button = gr.Button("Previous page", scale = 4)
-annotate_current_page = gr.Number(value=
-annotate_max_pages = gr.Number(value=
+annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
+annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
 annotation_next_page_button = gr.Button("Next page", scale = 4)
-with gr.Column(scale=1):
-annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
-
 
-with gr.Row():
-with gr.Column(scale=2):
 zoom_str = str(annotator_zoom_number) + '%'
 
 annotator = image_annotator(
|
|
297 |
handles_cursor=True,
|
298 |
interactive=False
|
299 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
with gr.Column(scale=1):
|
|
|
301 |
update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
|
302 |
with gr.Accordion("Search suggested redactions", open=True):
|
303 |
with gr.Row(equal_height=True):
|
@@ -318,17 +355,7 @@
 
 with gr.Accordion("Search all extracted text", open=True):
 all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
-reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
-
-with gr.Row():
-with gr.Column(scale=2):
-with gr.Row(equal_height=True):
-annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
-annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
-annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
-annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
-with gr.Column(scale=1):
-blank_markdown_bot = gr.Markdown(value="", label="")
+reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
 with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
 convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -432,7 +459,9 @@
 all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
 all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)
 
+###
 ### UI INTERACTION ###
+###
 
 ###
 # PDF/IMAGE REDACTION
@@ -440,7 +469,7 @@
 # Recalculate estimated costs based on changes to inputs
 if SHOW_COSTS == 'True':
 # Calculate costs
-total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox,
+total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
 text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
 pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
 handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
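The wiring above recalculates the estimate whenever the page count, extraction method, PII method or signature setting changes. The actual arithmetic lives in calculate_aws_costs in tools/helper_functions.py, which this diff does not show; a hypothetical sketch based on the prices quoted in the UI labels ($1.50 per 1,000 pages for text detection, $3.50 with signature detection, roughly $0.01 per 10,000 characters for Comprehend) might look like:

    def estimate_aws_cost(pages: int, use_textract: bool, detect_signatures: bool,
                          use_comprehend: bool, chars_per_page: int = 2000) -> float:
        """Rough per-document cost estimate; chars_per_page is an assumed average."""
        cost = 0.0
        if use_textract:
            # $1.50 per 1,000 pages, or $3.50 per 1,000 pages with signature detection
            cost += pages * (3.50 if detect_signatures else 1.50) / 1000
        if use_comprehend:
            # Comprehend PII detection: about $0.01 per 10,000 characters
            cost += pages * chars_per_page * 0.01 / 10000
        return round(cost, 2)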
@@ -460,31 +489,42 @@
 cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
 reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
 
+cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
+
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text,
+success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
 
 # Run redaction function
-document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state,
-success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
-success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text,
-outputs=[
+document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
+success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
+success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
+outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
 success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
 # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
-current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text,
-outputs=[
+current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
+outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
 success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
 # If a file has been completed, the function will continue onto the next document
-latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text,
-outputs=[
+latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
+outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
 success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
 success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
 # If the line level ocr results are changed by load in by user or by a new redaction task, replace the ocr results displayed in the table
 all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
 ###
 # REVIEW PDF REDACTIONS
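The two .change() hooks above implement the app's batching loop: choose_and_run_redactor returns an updated current_loop_page_number, and because that Number is also wired to re-invoke the same function, redaction restarts automatically until the value stops advancing. A stripped-down illustration of the pattern (a toy example, not the app's code):

    import gradio as gr

    def process_batch(page: int):
        # Pretend to redact ten pages at a time; stop advancing at page 100
        next_page = min(page + 10, 100)
        return next_page, f"Processed up to page {next_page}"

    with gr.Blocks() as demo:
        current_page = gr.Number(value=0, visible=False)
        status = gr.Textbox(label="Status")
        run_btn = gr.Button("Run")
        # The button kicks off the first batch; each change to current_page re-triggers
        # the worker, and the loop ends once the returned value no longer changes.
        run_btn.click(process_batch, inputs=[current_page], outputs=[current_page, status])
        current_page.change(process_batch, inputs=[current_page], outputs=[current_page, status])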
@@ -493,7 +533,7 @@
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text,
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
 # Page number controls
@@ -501,11 +541,11 @@
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
 success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
 
-annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page,
-annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page,
 
-annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[
-annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[
 
 annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
 
@@ -552,12 +592,12 @@
 
 # Convert review file to xfdf Adobe format
 convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text,
 success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
 
 # Convert xfdf Adobe file back to review_file.csv
 convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text,
 success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
 
 ###
@@ -601,7 +641,15 @@
 ###
 
 # Get connection details on app load
-
 
 # If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
 if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
@@ -615,20 +663,23 @@
 else: print("Could not load in default allow list")
 
 # If relevant environment variable is set, load in the default cost code file from S3 or locally
-if GET_COST_CODES == "True" and COST_CODES_PATH:
 if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
 app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
-success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
 print("Successfully loaded cost codes from S3")
 elif os.path.exists(COST_CODES_PATH):
 print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
-app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
 else: print("Could not load in cost code data")
 
 # Log usernames and times of access to file (to know who is using the app when running on AWS)
 access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
-access_callback.setup([session_hash_textbox], ACCESS_LOGS_FOLDER)
-
 success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 # User submitted feedback for pdf redactions
@@ -647,16 +698,23 @@
 usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
 
 if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
-usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
-
 success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 else:
-usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
-
 success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
-
 
 if RUN_DIRECT_MODE == "0":
 
 if os.environ['COGNITO_AUTH'] == "1":
@@ -667,7 +725,7 @@
 else:
 from tools.cli_redact import main
 
-main(first_loop_state, latest_file_completed=0,
 log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
 current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
 
|
514 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
515 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
516 |
|
517 |
# If the line level ocr results are changed by load in by user or by a new redaction task, replace the ocr results displayed in the table
|
518 |
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
519 |
+
|
520 |
+
# Send whole document to Textract for text extraction
|
521 |
+
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
|
522 |
+
|
523 |
+
check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
524 |
+
success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
|
525 |
+
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
526 |
+
|
527 |
+
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
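
For orientation, a minimal sketch of the asynchronous Textract call that analyse_document_with_textract_api and the polling helpers wrap. Bucket, key, and region below are placeholders, and only the text-detection variant is shown; the document-analysis variant (start_document_analysis) follows the same start/poll pattern:

    import time
    import boto3

    textract = boto3.client("textract", region_name="eu-west-2")

    # Start an asynchronous job on a PDF already uploaded to S3
    start = textract.start_document_text_detection(
        DocumentLocation={"S3Object": {"Bucket": "my-bucket", "Name": "input/doc.pdf"}}
    )
    job_id = start["JobId"]

    # Poll until the job finishes; results are then paged through with NextToken
    while True:
        status = textract.get_document_text_detection(JobId=job_id)["JobStatus"]
        if status in ("SUCCEEDED", "FAILED"):
            break
        time.sleep(5)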

###
# REVIEW PDF REDACTIONS

...

# Upload previous files for modifying redactions
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# Page number controls

...

success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])

+ annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
+ annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])

+ annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
+ annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])

annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
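
The chains above all rely on the same Gradio pattern: an event listener returns a dependency object, and each .success() step runs only if the previous function finished without raising, which is what keeps file preparation, annotator refresh and redaction in order. A stripped-down sketch of the pattern (component and function names here are illustrative, not the app's own):

    import gradio as gr

    def validate(text):
        if not text:
            raise gr.Error("No input given")
        return text

    def process(text):
        return text.upper()

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Input")
        out = gr.Textbox(label="Output")
        btn = gr.Button("Run")
        # process runs only if validate completed without an error
        btn.click(validate, inputs=inp, outputs=inp).success(process, inputs=inp, outputs=out)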

...

# Convert review file to xfdf Adobe format
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])

# Convert xfdf Adobe file back to review_file.csv
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)

###

...

###

# Get connection details on app load
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
+     app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
+     success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
+ else:
+     app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder])
+
+ # If relevant environment variable is set, load in the Textract job details

# If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:

...

else: print("Could not load in default allow list")

# If relevant environment variable is set, load in the default cost code file from S3 or locally
+ if GET_COST_CODES == "True" and COST_CODES_PATH:
    if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
        app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
+       success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
        print("Successfully loaded cost codes from S3")
    elif os.path.exists(COST_CODES_PATH):
        print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
+       app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
    else: print("Could not load in cost code data")

+ ### LOGGING
+
# Log usernames and times of access to file (to know who is using the app when running on AWS)
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
+
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# User submitted feedback for pdf redactions

...

usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)

if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
+     usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
+
+     latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
+     success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+     successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
      success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
else:
+     usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
+
+     latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
      success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

+     successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
+     success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
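
CSVLogger_custom follows the shape of Gradio's stock CSVLogger: setup() declares which components are logged and where, and flag() appends one row per event. A minimal sketch using the built-in class (the folder name is a placeholder):

    import gradio as gr

    logger = gr.CSVLogger()

    with gr.Blocks() as demo:
        name = gr.Textbox()
        logger.setup([name], "logs/")  # declare logged components and target folder
        # append a row each time the textbox changes, bypassing preprocessing
        name.change(lambda *args: logger.flag(list(args)), [name], None, preprocess=False)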

+ if __name__ == "__main__":
    if RUN_DIRECT_MODE == "0":

        if os.environ['COGNITO_AUTH'] == "1":

...

    else:
        from tools.cli_redact import main

+       main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
             log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
             current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
requirements.txt
CHANGED
@@ -13,7 +13,7 @@ spacy==3.8.4
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
#en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
gradio==5.23.3
-boto3==1.37.
+boto3==1.37.29
pyarrow==19.0.1
openpyxl==3.1.5
Faker==36.1.1
tools/aws_functions.py
CHANGED
@@ -30,129 +30,90 @@ if RUN_AWS_FUNCTIONS == "1":

        assumed_role_arn, assumed_role_name = get_assumed_role_info()

        print("Successfully assumed ARN role")
-        print("Assumed Role ARN:", assumed_role_arn)
-        print("Assumed Role Name:", assumed_role_name)
+        #print("Assumed Role ARN:", assumed_role_arn)
+        #print("Assumed Role Name:", assumed_role_name)

    except Exception as e:
        print("Could not get assumed role from STS:", e)

# Download direct from S3 - requires login credentials
-def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str):
-    s3 = boto3.client('s3', region_name=AWS_REGION)
-    s3.download_file(bucket_name, key, local_file_path_and_name)
-    print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
+def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
+
+    if RUN_AWS_FUNCTIONS == "1":
+        s3 = boto3.client('s3', region_name=AWS_REGION)
+        s3.download_file(bucket_name, key, local_file_path_and_name)
+        print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")

-def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
+def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
    """
    Download all files from an S3 folder to a local folder.
    """
-    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
-    # Extract object key and construct local file path
-    object_key = obj['Key']
-    local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
...
+    if RUN_AWS_FUNCTIONS == "1":
+        if bucket_name and s3_folder and local_folder:
+
+            s3 = boto3.client('s3', region_name=AWS_REGION)
+
+            # List objects in the specified S3 folder
+            response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+            # Download each object
+            for obj in response.get('Contents', []):
+                # Extract object key and construct local file path
+                object_key = obj['Key']
+                local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
+
+                # Create directories if necessary
+                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+                # Download the object
+                try:
+                    s3.download_file(bucket_name, object_key, local_file_path)
+                    print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+                except Exception as e:
+                    print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+        else: print("One or more required variables are empty, could not download from S3")

-def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
+def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str], RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
    """
    Download specific files from an S3 folder to a local folder.
    """
-    s3 = boto3.client('s3', region_name=AWS_REGION)
-
-    print("Trying to download file: ", filenames)
-
-    if filenames == '*':
-        # List all objects in the S3 folder
-        print("Trying to download all files in AWS folder: ", s3_folder)
-        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
-
-        print("Found files in AWS folder: ", response.get('Contents', []))
...
-        object_key = os.path.join(s3_folder, filename)
-        local_file_path = os.path.join(local_folder, filename)
...
-    try:
-        s3.download_file(bucket_name, object_key, local_file_path)
-        print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
-    except Exception as e:
-        print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+    if RUN_AWS_FUNCTIONS == "1":
+        if bucket_name and s3_folder and local_folder and filenames:
+
+            s3 = boto3.client('s3', region_name=AWS_REGION)
+
+            print("Trying to download file: ", filenames)
+
+            if filenames == '*':
+                # List all objects in the S3 folder
+                print("Trying to download all files in AWS folder: ", s3_folder)
+                response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+                print("Found files in AWS folder: ", response.get('Contents', []))
+
+                filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
+
+                print("Found filenames in AWS folder: ", filenames)
+
+            for filename in filenames:
+                object_key = os.path.join(s3_folder, filename)
+                local_file_path = os.path.join(local_folder, filename)
+
+                # Create directories if necessary
+                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+                # Download the object
+                try:
+                    s3.download_file(bucket_name, object_key, local_file_path)
+                    print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+                except Exception as e:
+                    print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+        else: print("One or more required variables are empty, could not download from S3")

-def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
-
-    temp_dir = tempfile.mkdtemp()
-    local_address_stub = temp_dir + '/doc-redaction/'
-    files = []
-
-    if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
-        out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
-        return files, out_message
-
-    if aws_password:
-        if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:
...
-            print(f"Folder {local_folder_path} is not empty")
-
-            #files = os.listdir(local_folder_stub)
-            #print(files)
-
-            files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
-
-            out_message = "Data successfully loaded from AWS"
-            print(out_message)
-
-        else:
-            out_message = "Data not loaded from AWS"
-            print(out_message)
-    else:
-        out_message = "No password provided. Please ask the data team for access if you need this."
-        print(out_message)
-
-    return files, out_message
-
-def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
+def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
    """
    Uploads a file from local machine to Amazon S3.

@@ -165,33 +126,44 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
    - Message as variable/printed to console
    """
    final_out_message = []
+    final_out_message_str = ""

-    local_file_paths = [local_file_paths]
-
-    #print(s3_client)
-    try:
-        # Get file name off file path
-        file_name = os.path.basename(file)
-
-        print(out_message)
-
-    except Exception as e:
-        out_message = f"Error uploading file(s): {e}"
-        print(out_message)
...
+    if RUN_AWS_FUNCTIONS == "1":
+        try:
+            if s3_bucket and s3_key and local_file_paths:
+
+                s3_client = boto3.client('s3', region_name=AWS_REGION)
+
+                if isinstance(local_file_paths, str):
+                    local_file_paths = [local_file_paths]
+
+                for file in local_file_paths:
+                    if s3_client:
+                        #print(s3_client)
+                        try:
+                            # Get file name off file path
+                            file_name = os.path.basename(file)
+
+                            s3_key_full = s3_key + file_name
+                            print("S3 key: ", s3_key_full)
+
+                            s3_client.upload_file(file, s3_bucket, s3_key_full)
+                            out_message = "File " + file_name + " uploaded successfully!"
+                            print(out_message)
+
+                        except Exception as e:
+                            out_message = f"Error uploading file(s): {e}"
+                            print(out_message)
+
+                        final_out_message.append(out_message)
+                        final_out_message_str = '\n'.join(final_out_message)
+
+                    else: final_out_message_str = "Could not connect to AWS."
+            else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
+        except Exception as e:
+            final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+            print(final_out_message_str)
+    else:
+        final_out_message_str = "App not set to run AWS functions"

    return final_out_message_str
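
A usage sketch for the guarded helpers above; bucket and key names are placeholders, and with RUN_AWS_FUNCTIONS left at "0" both calls return without touching AWS:

    from tools.aws_functions import download_file_from_s3, upload_file_to_s3

    # Pull a source document down, then push a redacted output back up
    download_file_from_s3("my-redaction-bucket", "input/report.pdf", "input/report.pdf")
    message = upload_file_to_s3(["output/report_redacted.pdf"], "output/", "my-redaction-bucket")
    print(message)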

tools/aws_textract.py
CHANGED
@@ -6,6 +6,7 @@ import json
from collections import defaultdict
import pikepdf
import time
+import pandas as pd
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION

@@ -275,7 +276,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children

-def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
+def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
    """
    Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
    """

@@ -307,7 +308,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
    print("Need to convert Textract JSON to app format.")
    try:
-        textract_data = restructure_textract_output(textract_data)
+        textract_data = restructure_textract_output(textract_data, page_sizes_df)
        return textract_data, False, log_files_output_paths # Successfully converted
    except Exception as e:

@@ -318,7 +319,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
    print("textract data:", textract_data)
    return {}, True, log_files_output_paths # Return empty data if JSON is not recognized

-def restructure_textract_output(textract_output: dict):
+def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFrame):
    """
    Reorganise Textract output from the bulk Textract analysis option on AWS
    into a format that works in this redaction app, reducing size.

@@ -328,10 +329,62 @@ def restructure_textract_output(textract_output: dict):
    # Extract total pages from DocumentMetadata
    document_metadata = textract_output.get("DocumentMetadata", {})

+    # For efficient lookup, set 'page' as index if it's not already
+    if 'page' in page_sizes_df.columns:
+        page_sizes_df = page_sizes_df.set_index('page')
+
    for block in textract_output.get("Blocks", []):
        page_no = block.get("Page", 1) # Default to 1 if missing

-        #
+        # --- Geometry Conversion Logic ---
+        try:
+            page_info = page_sizes_df.loc[page_no]
+            cb_width = page_info['cropbox_width']
+            cb_height = page_info['cropbox_height']
+            mb_width = page_info['mediabox_width']
+            mb_height = page_info['mediabox_height']
+            cb_x_offset = page_info['cropbox_x_offset']
+            cb_y_offset_top = page_info['cropbox_y_offset_from_top']
+
+            # Check if conversion is needed (and avoid division by zero)
+            needs_conversion = (
+                abs(cb_width - mb_width) > 1e-6 or \
+                abs(cb_height - mb_height) > 1e-6
+            ) and mb_width > 1e-6 and mb_height > 1e-6 # Avoid division by zero
+
+            if needs_conversion and 'Geometry' in block:
+                geometry = block['Geometry'] # Work directly on the block's geometry
+
+                # --- Convert BoundingBox ---
+                if 'BoundingBox' in geometry:
+                    bbox = geometry['BoundingBox']
+                    old_left = bbox['Left']
+                    old_top = bbox['Top']
+                    old_width = bbox['Width']
+                    old_height = bbox['Height']
+
+                    # Calculate absolute coordinates within CropBox
+                    abs_cb_x = old_left * cb_width
+                    abs_cb_y = old_top * cb_height
+                    abs_cb_width = old_width * cb_width
+                    abs_cb_height = old_height * cb_height
+
+                    # Calculate absolute coordinates relative to MediaBox top-left
+                    abs_mb_x = cb_x_offset + abs_cb_x
+                    abs_mb_y = cb_y_offset_top + abs_cb_y
+
+                    # Convert back to normalized coordinates relative to MediaBox
+                    bbox['Left'] = abs_mb_x / mb_width
+                    bbox['Top'] = abs_mb_y / mb_height
+                    bbox['Width'] = abs_cb_width / mb_width
+                    bbox['Height'] = abs_cb_height / mb_height
+        except KeyError:
+            print(f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block.")
+            # Decide how to handle missing page info: skip conversion, raise error, etc.
+        except ZeroDivisionError:
+            print(f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block.")

+        # Initialise page structure if not already present
        if page_no not in pages_dict:
            pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}

tools/config.py
CHANGED
@@ -1,12 +1,13 @@
import os
import tempfile
import socket
+import logging
from datetime import datetime
from dotenv import load_dotenv
from tldextract import TLDExtract

today_rev = datetime.now().strftime("%Y%m%d")
-
+HOST_NAME = socket.gethostname()

# Set or retrieve configuration variables for the redaction app

@@ -28,28 +29,40 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
    return value

-# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/
-APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
+# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
+APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env')
+
+if APP_CONFIG_PATH:
+    if os.path.exists(APP_CONFIG_PATH):
+        print(f"Loading app variables from config file {APP_CONFIG_PATH}")
+        load_dotenv(APP_CONFIG_PATH)
+    else:
+        print("App config file not found at location:", APP_CONFIG_PATH)
+
+# Report logging to console?
+LOGGING = get_or_create_env_var('LOGGING', 'False')
+
+if LOGGING == 'True':
+    # Configure logging
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

###
# AWS CONFIG
###

-# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '
-AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
-
-if
...
+# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', 'config/aws_config.env')
+
+if AWS_CONFIG_PATH:
+    if os.path.exists(AWS_CONFIG_PATH):
+        print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
+        load_dotenv(AWS_CONFIG_PATH)
+    else:
+        print("AWS config file not found at location:", AWS_CONFIG_PATH)

RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")

-AWS_REGION = get_or_create_env_var('AWS_REGION', '
+AWS_REGION = get_or_create_env_var('AWS_REGION', '')

AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')

@@ -65,14 +78,28 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')

DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')

+SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
+
+TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
+
+TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
+
+TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
+
+LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
+
+TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
+
+TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
+
# Custom headers e.g. if routing traffic through Cloudfront
# Retrieving or setting CUSTOM_HEADER
CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
-if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
+#if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')

# Retrieving or setting CUSTOM_HEADER_VALUE
CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
+#if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')

###
# Images config

@@ -84,8 +111,7 @@ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to No
###
# File I/O config
###
-
-SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
+SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders

OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'

@@ -99,12 +125,14 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"

...
+# By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
+# Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
+
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + HOST_NAME + '/')
+ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + today_rev + '/' + HOST_NAME + '/')
+USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + today_rev + '/' + HOST_NAME + '/')

+# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')

###
@@ -114,7 +142,6 @@ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")

POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")

-SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented

# Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')

@@ -153,15 +180,16 @@ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_

S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET

-SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', '
+SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')

GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')

+DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
+
COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code

S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET

ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?

-if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
-if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
+if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
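
As an illustration of what the load_dotenv calls above pick up, a hypothetical config/aws_config.env might contain (every value below is a placeholder, not a real default):

    AWS_REGION=eu-west-2
    RUN_AWS_FUNCTIONS=1
    DOCUMENT_REDACTION_BUCKET=my-redaction-bucket
    TEXTRACT_BULK_ANALYSIS_BUCKET=my-textract-bucket
    SHOW_BULK_TEXTRACT_CALL_OPTIONS=True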
|
|
tools/file_conversion.py
CHANGED
@@ -181,7 +181,7 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
|
|
181 |
widths = [result[2] for result in results]
|
182 |
heights = [result[3] for result in results]
|
183 |
|
184 |
-
print("PDF has been converted to images.")
|
185 |
return images, widths, heights, results
|
186 |
|
187 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
@@ -208,7 +208,7 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
|
|
208 |
|
209 |
# Check if the file is a PDF
|
210 |
elif file_extension == '.pdf':
|
211 |
-
print(f"{file_path} is a PDF file. Converting to image set")
|
212 |
|
213 |
# Run your function for processing PDF files here
|
214 |
img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
|
@@ -417,12 +417,29 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
417 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
418 |
original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
|
419 |
|
420 |
-
# Create a page_sizes_object.
|
421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
if image_sizes_width and image_sizes_height:
|
423 |
-
out_page_image_sizes
|
424 |
-
|
425 |
-
out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":pd.NA, "image_height":pd.NA, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
|
426 |
|
427 |
page_sizes.append(out_page_image_sizes)
|
428 |
|
@@ -434,7 +451,7 @@ def prepare_image_or_pdf(
|
|
434 |
latest_file_completed: int = 0,
|
435 |
out_message: List[str] = [],
|
436 |
first_loop_state: bool = False,
|
437 |
-
number_of_pages:int =
|
438 |
all_annotations_object:List = [],
|
439 |
prepare_for_review:bool = False,
|
440 |
in_fully_redacted_list:List[int]=[],
|
@@ -481,6 +498,9 @@ def prepare_image_or_pdf(
|
|
481 |
all_img_details = []
|
482 |
review_file_csv = pd.DataFrame()
|
483 |
all_line_level_ocr_results_df = pd.DataFrame()
|
|
|
|
|
|
|
484 |
|
485 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
486 |
if not in_fully_redacted_list.empty:
|
@@ -494,7 +514,7 @@ def prepare_image_or_pdf(
|
|
494 |
else:
|
495 |
print("Now redacting file", str(latest_file_completed))
|
496 |
|
497 |
-
# If out message or converted_file_paths are blank, change to a list so it can be appended to
|
498 |
if isinstance(out_message, str): out_message = [out_message]
|
499 |
|
500 |
if not file_paths: file_paths = []
|
@@ -521,15 +541,9 @@ def prepare_image_or_pdf(
|
|
521 |
file_paths_list = [file_paths]
|
522 |
file_paths_loop = file_paths_list
|
523 |
else:
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
else:
|
528 |
-
file_paths_list = file_paths
|
529 |
-
file_paths_loop = file_paths
|
530 |
-
# Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
|
531 |
-
file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
|
532 |
-
|
533 |
# Loop through files to load in
|
534 |
for file in file_paths_loop:
|
535 |
converted_file_path = []
|
@@ -592,7 +606,6 @@ def prepare_image_or_pdf(
|
|
592 |
|
593 |
image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
|
594 |
|
595 |
-
|
596 |
# Create a page_sizes_object
|
597 |
page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
|
598 |
|
@@ -612,7 +625,8 @@ def prepare_image_or_pdf(
|
|
612 |
json_from_csv = False
|
613 |
|
614 |
# NEW IF STATEMENT
|
615 |
-
# If the file name ends with
|
|
|
616 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
617 |
|
618 |
if (file_extension in ['.json']) & (prepare_for_review == True):
|
@@ -624,9 +638,14 @@ def prepare_image_or_pdf(
|
|
624 |
all_annotations_object = json.loads(file_path) # Use loads for string content
|
625 |
|
626 |
# Assume it's a textract json
|
627 |
-
elif (file_extension
|
|
|
628 |
# Copy it to the output folder so it can be used later.
|
629 |
-
|
|
|
|
|
|
|
|
|
630 |
|
631 |
# Use shutil to copy the file directly
|
632 |
shutil.copy2(file_path, out_textract_path) # Preserves metadata
|
@@ -748,11 +767,11 @@ def prepare_image_or_pdf(
|
|
748 |
print(out_time)
|
749 |
|
750 |
out_message.append(out_time)
|
751 |
-
|
752 |
|
753 |
-
number_of_pages = len(image_file_paths)
|
754 |
|
755 |
-
return
|
756 |
|
757 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
|
758 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
|
|
181 |
widths = [result[2] for result in results]
|
182 |
heights = [result[3] for result in results]
|
183 |
|
184 |
+
#print("PDF has been converted to images.")
|
185 |
return images, widths, heights, results
|
186 |
|
187 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
|
|
208 |
|
209 |
# Check if the file is a PDF
|
210 |
elif file_extension == '.pdf':
|
211 |
+
# print(f"{file_path} is a PDF file. Converting to image set")
|
212 |
|
213 |
# Run your function for processing PDF files here
|
214 |
img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
|
|
|
417       pymupdf_page = pymupdf_doc.load_page(page_no)
418       original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
420 +     # Create a page_sizes_object. If images have been created, then image width and height come from this value. Otherwise, they are set to the cropbox size
421 +     out_page_image_sizes = {
422 +         "page": reported_page_no,
423 +         "mediabox_width": pymupdf_page.mediabox.width,
424 +         "mediabox_height": pymupdf_page.mediabox.height,
425 +         "cropbox_width": pymupdf_page.cropbox.width,
426 +         "cropbox_height": pymupdf_page.cropbox.height,
427 +         "original_cropbox": original_cropboxes[-1],
428 +         "image_path": image_file_paths[page_no]}
429 +
430 +     # cropbox_x_offset: Distance from MediaBox left edge to CropBox left edge
431 +     # This is simply the difference in their x0 coordinates.
432 +     out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
433 +
434 +     # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
435 +     # MediaBox top y = mediabox.y1
436 +     # CropBox top y = cropbox.y1
437 +     # The difference is mediabox.y1 - cropbox.y1
438 +     out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
439 +
440       if image_sizes_width and image_sizes_height:
441 +         out_page_image_sizes["image_width"] = image_sizes_width[page_no]
442 +         out_page_image_sizes["image_height"] = image_sizes_height[page_no]
443
444       page_sizes.append(out_page_image_sizes)
445
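As a quick aside on those two offsets: a point measured from the CropBox's top-left corner can be shifted back into MediaBox-based coordinates by adding the offsets on. A minimal standalone sketch, assuming only PyMuPDF and a placeholder example.pdf:

import pymupdf  # PyMuPDF; older releases are imported as `fitz`

doc = pymupdf.open("example.pdf")  # placeholder input file
page = doc[0]

# Same offset definitions as in create_page_size_objects above
x_offset = page.cropbox.x0 - page.mediabox.x0
y_offset_from_top = page.mediabox.y1 - page.cropbox.y1

# Shift a CropBox-relative point (50, 20) into MediaBox-based coordinates
media_x = 50 + x_offset
media_y = 20 + y_offset_from_top
print(media_x, media_y)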
451       latest_file_completed: int = 0,
452       out_message: List[str] = [],
453       first_loop_state: bool = False,
454 +     number_of_pages:int = 0,
455       all_annotations_object:List = [],
456       prepare_for_review:bool = False,
457       in_fully_redacted_list:List[int]=[],

498       all_img_details = []
499       review_file_csv = pd.DataFrame()
500       all_line_level_ocr_results_df = pd.DataFrame()
501 +     out_textract_path = ""
502 +     combined_out_message = ""
503 +     final_out_message = ""
504
505       if isinstance(in_fully_redacted_list, pd.DataFrame):
506           if not in_fully_redacted_list.empty:

514       else:
515           print("Now redacting file", str(latest_file_completed))
516
517 +     # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
518       if isinstance(out_message, str): out_message = [out_message]
519
520       if not file_paths: file_paths = []

541           file_paths_list = [file_paths]
542           file_paths_loop = file_paths_list
543       else:
544 +         file_paths_list = file_paths
545 +         file_paths_loop = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
546 +
547       # Loop through files to load in
548       for file in file_paths_loop:
549           converted_file_path = []
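The sort key above relies on False sorting before True, so PDFs come first, then JSONs, then everything else. A quick illustration of the same expression in isolation (the file names are made up):

import os

files = ["review.csv", "doc_textract.json", "report.pdf"]
ordered = sorted(files, key=lambda x: (os.path.splitext(x)[1] != '.pdf',
                                       os.path.splitext(x)[1] != '.json'))
print(ordered)  # ['report.pdf', 'doc_textract.json', 'review.csv']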
606
607       image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
608
609       # Create a page_sizes_object
610       page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
611

625       json_from_csv = False
626
627       # NEW IF STATEMENT
628 +     # If the file name ends with .json, check if we are loading for review. If yes, assume it is an annotations object and overwrite the current annotations object. If not, assume this is a Textract object and load it in as Textract output
629 +
630       if (file_extension in ['.json']) | (json_from_csv == True):
631
632       if (file_extension in ['.json']) & (prepare_for_review == True):

638           all_annotations_object = json.loads(file_path) # Use loads for string content
639
640       # Assume it's a textract json
641 +     elif (file_extension in ['.json']) and (prepare_for_review != True):
642 +         print("Saving Textract output")
643           # Copy it to the output folder so it can be used later.
644 +         output_textract_json_file_name = file_path_without_ext
645 +         if not file_path.endswith("_textract.json"): output_textract_json_file_name = file_path_without_ext + "_textract.json"
646 +         else: output_textract_json_file_name = file_path_without_ext + ".json"
647 +
648 +         out_textract_path = os.path.join(output_folder, output_textract_json_file_name)
649
650           # Use shutil to copy the file directly
651           shutil.copy2(file_path, out_textract_path) # Preserves metadata
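In other words, an uploaded my_doc.json is copied to the output folder as my_doc_textract.json, while a file already named my_doc_textract.json keeps its name. A hedged sketch of the same naming rule on its own (file names are illustrative):

import os

def textract_output_name(file_path: str) -> str:
    base = os.path.splitext(os.path.basename(file_path))[0]
    # Append the suffix only if the file is not already marked as Textract output
    if not file_path.endswith("_textract.json"):
        return base + "_textract.json"
    return base + ".json"

print(textract_output_name("my_doc.json"))           # my_doc_textract.json
print(textract_output_name("my_doc_textract.json"))  # my_doc_textract.json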
767       print(out_time)
768
769       out_message.append(out_time)
770 +     combined_out_message = '\n'.join(out_message)
771
772 +     number_of_pages = len(page_sizes) #len(image_file_paths)
773
774 +     return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
775
776   def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
777       file_path_without_ext = get_file_name_without_type(in_file_path)
tools/file_redaction.py
CHANGED
@@ -205,7 +205,7 @@ def choose_and_run_redactor(file_paths:List[str],
205       latest_file_completed = int(latest_file_completed)
206
207       if isinstance(file_paths,str): number_of_files = 1
208 -     else: number_of_files = len(
209
210       # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211       if latest_file_completed >= number_of_files:
@@ -764,28 +764,66 @@ def move_page_info(file_path: str) -> str:
764
765       return new_file_path
766
767 -     def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image):
768       '''
769       Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770       '''
771
772       img_annotation_box = {}
773
774       if image:
775           pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
776       else:
777 -
778 -
779 -
780 -
781 -
782 -
783 -
784 -
785 -
786 -
787 -
788 -
789       img_annotation_box["color"] = (0,0,0)
790       try:
791           img_annotation_box["label"] = str(annot.entity_type)
@@ -795,12 +833,11 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
795       if hasattr(annot, 'text') and annot.text:
796           img_annotation_box["text"] = str(annot.text)
797       else:
798 -         img_annotation_box["text"] = ""
799 -
800 -     rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
801
802       return img_annotation_box, rect
803
804   def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
805       '''
806       Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -951,8 +988,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
951       rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
952
953       # Else should be CustomImageRecognizerResult
954 -     elif isinstance(annot, CustomImageRecognizerResult):
955 -
956
957       # Else it should be a pikepdf annotation object
958       else:
@@ -1211,7 +1249,7 @@ def redact_image_pdf(file_path:str,
1211      # If running Textract, check if file already exists. If it does, load in existing data
1212      if text_extraction_method == textract_option:
1213          textract_json_file_path = output_folder + file_name + "_textract.json"
1214 -        textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
1215          original_textract_data = textract_data.copy()
1216
1217      ###
205       latest_file_completed = int(latest_file_completed)
206
207       if isinstance(file_paths,str): number_of_files = 1
208 +     else: number_of_files = len(file_paths_list)
209
210       # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211       if latest_file_completed >= number_of_files:

764
765       return new_file_path
766
767 +     def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image, page_sizes_df:pd.DataFrame):
768       '''
769       Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770       '''
771
772       img_annotation_box = {}
773
774 +     # For efficient lookup, set 'page' as index if it's not already
775 +     if 'page' in page_sizes_df.columns:
776 +         page_sizes_df = page_sizes_df.set_index('page')
777 +     # PyMuPDF page numbers are 0-based, DataFrame index assumed 1-based
778 +     page_num_one_based = page.number + 1
779 +
780 +     pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = 0, 0, 0, 0 # Initialize defaults
781 +
782 +
783       if image:
784           pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
785 +
786       else:
787 +         # --- Calculate coordinates when no image is present ---
788 +         # Assumes annot coords are normalized relative to MediaBox (top-left origin)
789 +         try:
790 +             # 1. Get MediaBox dimensions from the DataFrame
791 +             page_info = page_sizes_df.loc[page_num_one_based]
792 +             mb_width = page_info['mediabox_width']
793 +             mb_height = page_info['mediabox_height']
794 +             x_offset = page_info['cropbox_x_offset']
795 +             y_offset = page_info['cropbox_y_offset_from_top']
796 +
797 +
798 +             # Check for invalid dimensions
799 +             if mb_width <= 0 or mb_height <= 0:
800 +                 print(f"Warning: Invalid MediaBox dimensions ({mb_width}x{mb_height}) for page {page_num_one_based}. Setting coords to 0.")
801 +             else:
802 +                 pymupdf_x1 = annot.left - x_offset
803 +                 pymupdf_x2 = annot.left + annot.width - x_offset
804 +                 pymupdf_y1 = annot.top - y_offset
805 +                 pymupdf_y2 = annot.top + annot.height - y_offset
806 +
807 +         except KeyError:
808 +             print(f"Warning: Page number {page_num_one_based} not found in page_sizes_df. Cannot get MediaBox dimensions. Setting coords to 0.")
809 +         except AttributeError as e:
810 +             print(f"Error accessing attributes ('left', 'top', etc.) on 'annot' object for page {page_num_one_based}: {e}")
811 +         except Exception as e:
812 +             print(f"Error during coordinate calculation for page {page_num_one_based}: {e}")
813 +
814 +     rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
815 +
816 +     # Now creating image annotation object
817 +     image_x1 = annot.left
818 +     image_x2 = annot.left + annot.width
819 +     image_y1 = annot.top
820 +     image_y2 = annot.top + annot.height
821 +
822 +     # Create image annotation boxes
823 +     img_annotation_box["xmin"] = image_x1
824 +     img_annotation_box["ymin"] = image_y1
825 +     img_annotation_box["xmax"] = image_x2 # annot.left + annot.width
826 +     img_annotation_box["ymax"] = image_y2 # annot.top + annot.height
827       img_annotation_box["color"] = (0,0,0)
828       try:
829           img_annotation_box["label"] = str(annot.entity_type)

833       if hasattr(annot, 'text') and annot.text:
834           img_annotation_box["text"] = str(annot.text)
835       else:
836 +         img_annotation_box["text"] = ""
837
838       return img_annotation_box, rect
839
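As a sanity check on the fallback branch, suppose a page has cropbox_x_offset = 10 and cropbox_y_offset_from_top = 20, and an annotation at left=110, top=220, width=50, height=30 (all values invented for illustration). The PyMuPDF rectangle then works out as:

# Hypothetical values matching the fallback branch above
x_offset, y_offset = 10, 20
left, top, width, height = 110, 220, 50, 30

pymupdf_x1 = left - x_offset          # 100
pymupdf_x2 = left + width - x_offset  # 150
pymupdf_y1 = top - y_offset           # 200
pymupdf_y2 = top + height - y_offset  # 230
print(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2)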
840 +
841   def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
842       '''
843       Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.

988       rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
989
990       # Else should be CustomImageRecognizerResult
991 +     elif isinstance(annot, CustomImageRecognizerResult):
992 +         #print("annot is a CustomImageRecognizerResult")
993 +         img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image, page_sizes_df)
994
995       # Else it should be a pikepdf annotation object
996       else:

1249      # If running Textract, check if file already exists. If it does, load in existing data
1250      if text_extraction_method == textract_option:
1251          textract_json_file_path = output_folder + file_name + "_textract.json"
1252 +        textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1253          original_textract_data = textract_data.copy()
1254
1255      ###
tools/helper_functions.py
CHANGED
@@ -9,7 +9,7 @@ import unicodedata
9    from typing import List
10   from math import ceil
11   from gradio_image_annotation import image_annotator
12 - from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
13
14   # Names for options labels
15   text_ocr_option = "Local model - selectable text"
@@ -31,7 +31,7 @@ def reset_state_vars():
31       show_share_button=False,
32       show_remove_button=False,
33       interactive=False
34 -     ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
35
36   def reset_ocr_results_state():
37       return pd.DataFrame(), pd.DataFrame(), []
@@ -44,23 +44,54 @@ def load_in_default_allow_list(allow_list_file_path):
44       allow_list_file_path = [allow_list_file_path]
45       return allow_list_file_path
46
47 - def load_in_default_cost_codes(cost_codes_path:str):
48       cost_codes_df = pd.read_csv(cost_codes_path)
49 -
50 -
51 -
52 -
53 -
54 -
55
56       return cost_codes_df, cost_codes_df, out_dropdown
57
58 - def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59       if enforce_cost_code_textbox == "True":
60           if not cost_code_choice:
61               raise Exception("Please choose a cost code before continuing")
62       return
63
64   def update_dataframe(df:pd.DataFrame):
65       df_copy = df.copy()
66       return df_copy
@@ -271,7 +302,14 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
271
272       return output_files
273
274 - async def get_connection_params(request: gr.Request,
275
276       #print("Session hash:", request.session_hash)
277
@@ -323,6 +361,13 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
323       if session_output_folder == 'True':
324           output_folder = output_folder_textbox + out_session_hash + "/"
325           input_folder = input_folder_textbox + out_session_hash + "/"
326       else:
327           output_folder = output_folder_textbox
328           input_folder = input_folder_textbox
@@ -330,8 +375,7 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
330       if not os.path.exists(output_folder): os.mkdir(output_folder)
331       if not os.path.exists(input_folder): os.mkdir(input_folder)
332
333 -
334 -     return out_session_hash, output_folder, out_session_hash, input_folder
335
336   def clean_unicode_text(text:str):
337       # Step 1: Normalise unicode characters to decompose any special forms
@@ -374,6 +418,8 @@ def calculate_aws_costs(number_of_pages:str,
374       pii_identification_method:str,
375       textract_output_found_checkbox:bool,
376       only_extract_text_radio:bool,
377       textract_page_cost:float=1.5/1000,
378       textract_signature_cost:float=2.0/1000,
379       comprehend_unit_cost:float=0.0001,
@@ -391,6 +437,8 @@ def calculate_aws_costs(number_of_pages:str,
391       - pii_identification_method_drop: The method of personally-identifiable information removal.
392       - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393       - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
394       - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395       - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396       - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
@@ -419,6 +467,9 @@ def calculate_aws_costs(number_of_pages:str,
419
420       calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
422       return calculated_aws_cost
423
424   def calculate_time_taken(number_of_pages:str,
9    from typing import List
10   from math import ceil
11   from gradio_image_annotation import image_annotator
12 + from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
14   # Names for options labels
15   text_ocr_option = "Local model - selectable text"

31       show_share_button=False,
32       show_remove_button=False,
33       interactive=False
34 +     ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
35
36   def reset_ocr_results_state():
37       return pd.DataFrame(), pd.DataFrame(), []

44       allow_list_file_path = [allow_list_file_path]
45       return allow_list_file_path
46
47 + def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
48 +     '''
49 +     Load in the cost codes list from file.
50 +     '''
51       cost_codes_df = pd.read_csv(cost_codes_path)
52 +     dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
53 +
54 +     # Avoid inserting duplicate or empty cost code values
55 +     if default_cost_code and default_cost_code not in dropdown_choices:
56 +         dropdown_choices.insert(0, default_cost_code)
57 +
58 +     # Always have a blank option at the top
59 +     if "" not in dropdown_choices:
60 +         dropdown_choices.insert(0, "")
61 +
62 +     out_dropdown = gr.Dropdown(
63 +         value=default_cost_code if default_cost_code in dropdown_choices else "",
64 +         label="Choose cost code for analysis",
65 +         choices=dropdown_choices,
66 +         allow_custom_value=False
67 +     )
68
69       return cost_codes_df, cost_codes_df, out_dropdown
70
71 + def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost_code_df:pd.DataFrame, verify_cost_codes:bool=True):
72 +     '''
73 +     Check if the enforce cost codes variable is set to true, and then check that a cost code has been chosen. If not, raise an error. Then, check against the values in the cost code dataframe to ensure that the cost code exists.
74 +     '''
75 +
76       if enforce_cost_code_textbox == "True":
77           if not cost_code_choice:
78               raise Exception("Please choose a cost code before continuing")
79 +
80 +         if verify_cost_codes == True:
81 +             if cost_code_df.empty:
82 +                 raise Exception("No cost codes present in dataframe for verification")
83 +             else:
84 +                 valid_cost_codes_list = list(cost_code_df.iloc[:,0].unique())
85 +
86 +                 if not cost_code_choice in valid_cost_codes_list:
87 +                     raise Exception("Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions.")
88       return
89
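A minimal way to exercise these two cost-code helpers together; the dataframe contents and code values below are invented for illustration:

import pandas as pd

# Hypothetical cost code table with the code in the first column
cost_code_df = pd.DataFrame({"code": ["ABC-123", "DEF-456"],
                             "description": ["Team A", "Team B"]})

# Passes silently: enforcement on, code present in the dataframe
enforce_cost_codes("True", "ABC-123", cost_code_df)

# Raises: enforcement on, code not in the allowed list
try:
    enforce_cost_codes("True", "ZZZ-999", cost_code_df)
except Exception as e:
    print(e)  # Selected cost code not found in list. ...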
90 + def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
91 +     cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :
92 +     ]
93 +     return cost_code_df
94 +
95   def update_dataframe(df:pd.DataFrame):
96       df_copy = df.copy()
97       return df_copy
302
303       return output_files
304
305 + async def get_connection_params(request: gr.Request,
306 +                                 output_folder_textbox:str=OUTPUT_FOLDER,
307 +                                 input_folder_textbox:str=INPUT_FOLDER,
308 +                                 session_output_folder:str=SESSION_OUTPUT_FOLDER,
309 +                                 textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
310 +                                 textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
311 +                                 s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312 +                                 local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
314       #print("Session hash:", request.session_hash)
315

361       if session_output_folder == 'True':
362           output_folder = output_folder_textbox + out_session_hash + "/"
363           input_folder = input_folder_textbox + out_session_hash + "/"
364 +
365 +         textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
366 +         textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
367 +
368 +         s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
369 +         local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
370 +
371       else:
372           output_folder = output_folder_textbox
373           input_folder = input_folder_textbox

375       if not os.path.exists(output_folder): os.mkdir(output_folder)
376       if not os.path.exists(input_folder): os.mkdir(input_folder)
377
378 +     return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
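With SESSION_OUTPUT_FOLDER set to 'True', every per-session location simply gains the session hash as a suffix. Roughly as below; note the base prefixes here are placeholders, not the real config values:

out_session_hash = "abc123"  # illustrative session hash

output_folder = "output/" + out_session_hash + "/"                 # output/abc123/
textract_upload_input = "textract_input" + "/" + out_session_hash  # textract_input/abc123
textract_jobs_s3 = "textract_jobs" + "/" + out_session_hash        # textract_jobs/abc123
print(output_folder, textract_upload_input, textract_jobs_s3)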
379
380   def clean_unicode_text(text:str):
381       # Step 1: Normalise unicode characters to decompose any special forms

418       pii_identification_method:str,
419       textract_output_found_checkbox:bool,
420       only_extract_text_radio:bool,
421 +     convert_to_gbp:bool=True,
422 +     usd_gbp_conversion_rate:float=0.76,
423       textract_page_cost:float=1.5/1000,
424       textract_signature_cost:float=2.0/1000,
425       comprehend_unit_cost:float=0.0001,

437       - pii_identification_method_drop: The method of personally-identifiable information removal.
438       - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
439       - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
440 +     - convert_to_gbp (bool, optional): Should suggested costs be converted from USD to GBP.
441 +     - usd_gbp_conversion_rate (float, optional): Conversion rate used for USD to GBP. Last changed 14th April 2025.
442       - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
443       - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
444       - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.

467
468       calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
469
470 +     if convert_to_gbp == True:
471 +         calculated_aws_cost *= usd_gbp_conversion_rate
472 +
473       return calculated_aws_cost
474
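As a worked example of the final conversion step: extracting 100 pages with Textract plus signature detection at the listed prices comes to 100 x (0.0015 + 0.0020) = $0.35, and with convert_to_gbp on, 0.35 x 0.76 is roughly 0.27 GBP. The numbers below just plug in the defaults above:

number_of_pages = 100
textract_page_cost = 1.5 / 1000
textract_signature_cost = 2.0 / 1000
usd_gbp_conversion_rate = 0.76

cost_usd = number_of_pages * (textract_page_cost + textract_signature_cost)  # 0.35
cost_gbp = cost_usd * usd_gbp_conversion_rate                                # ~0.266
print(round(cost_usd, 2), round(cost_gbp, 2))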
475   def calculate_time_taken(number_of_pages:str,
tools/redaction_review.py
CHANGED
@@ -577,7 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
577       output_files.append(orig_pdf_file_path)
578
579       try:
580 -         print("Saving review file.")
581           review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582           out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
@@ -756,6 +756,18 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
756
757       return row_value_page, row_value_df
758
759   def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
760
761       row_value_code = evt.row_value[0] # This is the value for cost code
577       output_files.append(orig_pdf_file_path)
578
579       try:
580 +         #print("Saving review file.")
581           review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582           out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583

756
757       return row_value_page, row_value_df
758
759 + def df_select_callback_textract_api(df: pd.DataFrame, evt: gr.SelectData):
760 +
761 +     #print("evt.data:", evt._data)
762 +
763 +     row_value_job_id = evt.row_value[0] # This is the job ID value
764 +     # row_value_label = evt.row_value[1] # This is the label value
765 +     row_value_job_type = evt.row_value[2] # This is the job type value
766 +
767 +     row_value_df = pd.DataFrame(data={"job_id":[row_value_job_id], "label":[row_value_job_type]})
768 +
769 +     return row_value_job_id, row_value_job_type, row_value_df
770 +
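For context, Gradio invokes a callback like this when a user clicks a row of a Dataframe component, injecting the gr.SelectData event automatically. A hedged sketch of the wiring; the component names and sample data are invented, only the callback itself comes from this commit:

import gradio as gr
import pandas as pd

jobs = pd.DataFrame({"job_id": ["abc"], "file_name": ["doc.pdf"],
                     "job_type": ["document_analysis"]})

with gr.Blocks() as demo:
    job_table = gr.Dataframe(value=jobs, interactive=False)
    job_id_box = gr.Textbox(label="Job ID")
    job_type_box = gr.Textbox(label="Job type")
    selected_df = gr.Dataframe(visible=False)

    # evt: gr.SelectData is passed in by the .select event
    job_table.select(df_select_callback_textract_api,
                     inputs=[job_table],
                     outputs=[job_id_box, job_type_box, selected_df])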
771   def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
772
773       row_value_code = evt.row_value[0] # This is the value for cost code
tools/textract_batch_call.py
CHANGED
@@ -1,22 +1,36 @@
1    import boto3
2    import time
3    import os
4    import json
5    import logging
6    from urllib.parse import urlparse
7
8 -  #
12       local_pdf_path: str,
13 -     s3_bucket_name: str,
14       s3_input_prefix: str,
15       s3_output_prefix: str,
20   ):
21   """
22   Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
@@ -27,10 +41,12 @@ def analyze_pdf_with_textract(
27       s3_bucket_name (str): Name of the S3 bucket to use.
28       s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
29       s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
31       aws_region (str, optional): AWS region name. Defaults to boto3 default region.
32 -     poll_interval_seconds (int): Seconds to wait between polling Textract status.
33 -     max_polling_attempts (int): Maximum number of times to poll Textract status.
34
35   Returns:
36       str: Path to the downloaded local JSON output file, or None if failed.
@@ -41,12 +57,21 @@ def analyze_pdf_with_textract(
41       Exception: For other AWS errors or job failures.
42   """
43
44   if not os.path.exists(local_pdf_path):
45 -     raise FileNotFoundError(f"Input
46
47   if not os.path.exists(local_output_dir):
48       os.makedirs(local_output_dir)
50
51   # Initialize boto3 clients
52   session = boto3.Session(region_name=aws_region)
@@ -57,216 +82,407 @@ def analyze_pdf_with_textract(
57   pdf_filename = os.path.basename(local_pdf_path)
58   s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
59
61   try:
62       s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
64   except Exception as e:
66       raise
67
68   # --- 2. Start Textract Document Analysis ---
70   try:
76   }
84 -     # NotificationChannel={
85 -     #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
86 -     #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
87 -     # }
88 -     )
89 -     job_id = response['JobId']
90 -     logging.info(f"Textract job started with JobId: {job_id}")
91 -
92 - except Exception as e:
93 -     logging.error(f"Failed to start Textract job: {e}")
94 -     raise
95 -
96 - # --- 3. Poll for Job Completion ---
97 - job_status = 'IN_PROGRESS'
98 - attempts = 0
99 - logging.info("Polling Textract for job completion status...")
100
120 -     # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
121 -     raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
122 - else:
123 -     # Should not happen based on documentation, but handle defensively
124 -     raise Exception(f"Unexpected Textract job status: {job_status}")
125
140 -     # For robust handling, list objects and find the JSON(s).
141
147   list_response = s3_client.list_objects_v2(
148       Bucket=s3_bucket_name,
149       Prefix=s3_output_key_prefix
150   )
152   output_files = list_response.get('Contents', [])
153 - if not output_files:
154 -     # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
155 -     logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
156 -     time.sleep(5)
157 -     list_response = s3_client.list_objects_v2(
158 -         Bucket=s3_bucket_name,
159 -         Prefix=s3_output_key_prefix
160 -     )
161 -     output_files = list_response.get('Contents', [])
162 -
163 - if not output_files:
164 -     logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
165 -     # You could alternatively try getting results via get_document_analysis pagination here
166 -     # but sticking to the request to download from S3 output path.
167 -     raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
168 -
169 - # Usually, we only need the first/main JSON output file(s)
170 - # For simplicity, download the first one found. A more complex scenario might merge multiple files.
171 - # Filter out potential directory markers if any key ends with '/'
172 - json_files_to_download = [f for f in output_files if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')]
173 -
174 - if not json_files_to_download:
175 -     logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
176 -     raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
177 -
178 - # Let's download the first JSON found. Often it's the only one or the main one.
179 - s3_output_key = json_files_to_download[0]['Key']
180 - output_filename_base = os.path.basename(pdf_filename).replace('.pdf', '')
181 - local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
182 - local_output_path = os.path.join(local_output_dir, local_output_filename)
183 -
184 - logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
185 - s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
186 - logging.info("Download successful.")
187 - downloaded_file_path = local_output_path
188 -
189 - # Log if multiple files were found, as user might need to handle them
190 - if len(json_files_to_download) > 1:
191 -     logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
192
194 -     logging.error(f"
196
197   return downloaded_file_path
198
200 - if
203 - MY_S3_BUCKET = "your-textract-demo-bucket-name" # MUST BE UNIQUE GLOBALLY
204 - MY_S3_INPUT_PREFIX = "textract-inputs" # Folder in the bucket for uploads
205 - MY_S3_OUTPUT_PREFIX = "textract-outputs" # Folder in the bucket for results
206 - MY_LOCAL_OUTPUT_DIR = "./textract_results" # Local folder to save JSON
207 - MY_AWS_REGION = "us-east-1" # e.g., 'us-east-1', 'eu-west-1'
208 -
209 - # --- Create a dummy PDF for testing if you don't have one ---
210 - # Requires 'reportlab' library: pip install reportlab
211 - try:
212 -     from reportlab.pdfgen import canvas
213 -     from reportlab.lib.pagesizes import letter
214 -     if not os.path.exists(MY_LOCAL_PDF):
215 -         print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
216 -         c = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
217 -         c.drawString(100, 750, "This is a test document for AWS Textract.")
218 -         c.drawString(100, 700, "It includes some text and a placeholder for a signature.")
219 -         c.drawString(100, 650, "Signed:")
220 -         # Draw a simple line/scribble for signature placeholder
221 -         c.line(150, 630, 250, 645)
222 -         c.line(250, 645, 300, 620)
223 -         c.save()
224 -         print("Dummy PDF created.")
225 - except ImportError:
226 -     if not os.path.exists(MY_LOCAL_PDF):
227 -         print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
228 -         exit() # Exit if no PDF available for the example
229 - except Exception as e:
230 -     print(f"Error creating dummy PDF: {e}")
231 -     exit()
232
233
239 -     s3_input_prefix=MY_S3_INPUT_PREFIX,
240 -     s3_output_prefix=MY_S3_OUTPUT_PREFIX,
241 -     local_output_dir=MY_LOCAL_OUTPUT_DIR,
242 -     aws_region=MY_AWS_REGION
243 - )
244
252 -     print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
253 -     # Find signature blocks (Note: This is basic, real parsing might be more complex)
254 -     signature_blocks = [block for block in results.get('Blocks', []) if block.get('BlockType') == 'SIGNATURE']
255 -     print(f"Found {len(signature_blocks)} potential signature block(s).")
256 -     if signature_blocks:
257 -         print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")
258
263 -     print(f"\nAn error occurred during the process: {e}")
264
266 - import time
267 - import os
268
270   """
271   Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
272
@@ -290,8 +506,8 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
290       print("Job failed:", response.get("StatusMessage", "No error message provided."))
291       return
292   else:
293 -     print(f"Job is still {status}
294 -     time.sleep(10) # Wait before checking again
295
296   # Find output ZIP file in S3
297   output_file_key = f"{output_prefix}/{job_id}.zip"
@@ -303,6 +519,3 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
303       print(f"Output file downloaded to: {local_file_path}")
304   except Exception as e:
305       print(f"Error downloading file: {e}")
306 -
307 - # Example usage:
308 - # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")
1    import boto3
2    import time
3    import os
4 +  import pandas as pd
5    import json
6    import logging
7 +  import datetime
8 +  from typing import List
9 +  from io import StringIO
10   from urllib.parse import urlparse
11 + from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
13 + # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
14 + # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
15 + # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
16 + # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
17 + # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
18 + # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
19 + from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
20 + from tools.aws_textract import json_to_ocrresult
21
22 +
23 + def analyse_document_with_textract_api(
24       local_pdf_path: str,
25       s3_input_prefix: str,
26       s3_output_prefix: str,
27 +     job_df:pd.DataFrame,
28 +     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
29 +     local_output_dir: str = OUTPUT_FOLDER,
30 +     analyse_signatures:List[str] = [],
31 +     successful_job_number:int=0,
32 +     general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
33 +     aws_region: str = AWS_REGION # Optional: specify region if not default
34   ):
35   """
36   Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),

41       s3_bucket_name (str): Name of the S3 bucket to use.
42       s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
43       s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
44 +     job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
45 +     s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
46 +     local_output_dir (str, optional): Local directory to save the downloaded JSON results.
47 +     analyse_signatures (List[str], optional): Whether to analyse signatures. Default is no.
48 +     successful_job_number (int): The number of successful jobs that have been submitted in this session.
49       aws_region (str, optional): AWS region name. Defaults to boto3 default region.
50
51   Returns:
52       str: Path to the downloaded local JSON output file, or None if failed.

57       Exception: For other AWS errors or job failures.
58   """
59
60 +     # This is a variable that is written to logs to indicate that a Textract API call was made
61 +     is_a_textract_api_call = True
62 +
63 +     # Keep only the latest pdf path if it's a list
64 +     if isinstance(local_pdf_path, list):
65 +         local_pdf_path = local_pdf_path[-1]
66 +
67   if not os.path.exists(local_pdf_path):
68 +     raise FileNotFoundError(f"Input document not found {local_pdf_path}")
69
70   if not os.path.exists(local_output_dir):
71       os.makedirs(local_output_dir)
72 +     log_message = f"Created local output directory: {local_output_dir}"
73 +     print(log_message)
74 +     #logging.info(log_message)
75
76   # Initialize boto3 clients
77   session = boto3.Session(region_name=aws_region)

82   pdf_filename = os.path.basename(local_pdf_path)
83   s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
84
85 +     log_message = f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
86 +     print(log_message)
87 +     #logging.info(log_message)
88   try:
89       s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
90 +     log_message = "Upload successful."
91 +     print(log_message)
92 +     #logging.info(log_message)
93   except Exception as e:
94 +     log_message = f"Failed to upload PDF to S3: {e}"
95 +     print(log_message)
96 +     #logging.error(log_message)
97       raise
98
99 +     # If job_df is not empty, check whether this file has already been analysed with the same signature option
100 +    if not job_df.empty:
101 +        if "file_name" in job_df.columns:
102 +            matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
103 +
104 +            if len(matching_job_id_file_names) > 0:
105 +                raise Exception("Existing Textract outputs found. No need to re-analyse. Please download existing results from the list")
106 +
107   # --- 2. Start Textract Document Analysis ---
108 +    message = "Starting Textract document analysis job..."
109 +    print(message)
110 +    #logging.info("Starting Textract document analysis job...")
111 +
112   try:
113 +     if "Extract signatures" in analyse_signatures:
114 +         response = textract_client.start_document_analysis(
115 +             DocumentLocation={
116 +                 'S3Object': {
117 +                     'Bucket': s3_bucket_name,
118 +                     'Name': s3_input_key
119 +                 }
120 +             },
121 +             FeatureTypes=['SIGNATURES'], # Analyze for signatures
122 +             OutputConfig={
123 +                 'S3Bucket': s3_bucket_name,
124 +                 'S3Prefix': s3_output_prefix
125               }
126 +             # Optional: Add NotificationChannel for SNS topic notifications
127 +             # NotificationChannel={
128 +             #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
129 +             #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
130 +             # }
131 +         )
132 +         job_type="document_analysis"
133
134 +     else:
135 +         response = textract_client.start_document_text_detection(
136 +             DocumentLocation={
137 +                 'S3Object': {
138 +                     'Bucket': s3_bucket_name,
139 +                     'Name': s3_input_key
140 +                 }
141 +             },
142 +             OutputConfig={
143 +                 'S3Bucket': s3_bucket_name,
144 +                 'S3Prefix': s3_output_prefix
145 +             }
146 +             # Optional: Add NotificationChannel for SNS topic notifications
147 +             # NotificationChannel={
148 +             #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
149 +             #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
150 +             # }
151 +         )
152 +         job_type="document_text_detection"
153
154 +     job_id = response['JobId']
155 +     print(f"Textract job started with JobId: {job_id}")
156 +     #logging.info(f"Textract job started with JobId: {job_id}")
157 +
158 +     # Write job_id to memory
159 +     # Prepare CSV in memory
160 +     log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
161 +     job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
162 +
163 +     csv_buffer = StringIO()
164 +     log_df = pd.DataFrame([{
165 +         'job_id': job_id,
166 +         'file_name': pdf_filename,
167 +         'job_type': job_type,
168 +         'signature_extraction':analyse_signatures,
169 +         's3_location': job_location_full,
170 +         'job_date_time': datetime.datetime.now()
171 +     }])
172 +
173 +     # File path
174 +     log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
175 +
176 +     # Check if file exists
177 +     file_exists = os.path.exists(log_file_path)
178 +
179 +     # Append to CSV if it exists, otherwise write with header
180 +     log_df.to_csv(log_file_path, mode='a', index=False, header=not file_exists)
181 +
182 +     #log_df.to_csv(csv_buffer)
183
184 +     # Upload the file
185 +     s3_client.upload_file(log_file_path, general_s3_bucket_name, log_csv_key_location)
186
187 +     # Upload to S3 (overwrite existing file)
188 +     #s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
189 +     print(f"Job ID written to {log_csv_key_location}")
190 +     #logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")
191
192 + except Exception as e:
193 +     error = f"Failed to start Textract job: {e}"
194 +     print(error)
195 +     #logging.error(error)
196 +     raise
197
198 + successful_job_number += 1
199 +
200 + return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
201
+
def return_job_status(job_id:str,
|
203 |
+
response:dict,
|
204 |
+
attempts:int,
|
205 |
+
poll_interval_seconds: int = 5,
|
206 |
+
max_polling_attempts: int = 1 # ~10 minutes total wait time
|
207 |
+
):
|
208 |
+
job_status = response['JobStatus']
|
209 |
+
logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
|
210 |
+
|
211 |
+
if job_status == 'IN_PROGRESS':
|
212 |
+
time.sleep(poll_interval_seconds)
|
213 |
+
elif job_status == 'SUCCEEDED':
|
214 |
+
logging.info("Textract job succeeded.")
|
215 |
+
elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
|
216 |
+
status_message = response.get('StatusMessage', 'No status message provided.')
|
217 |
+
warnings = response.get('Warnings', [])
|
218 |
+
logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
|
219 |
+
if warnings:
|
220 |
+
logging.warning(f"Warnings: {warnings}")
|
221 |
+
# Decide if PARTIAL_SUCCESS should proceed or raise error
|
222 |
+
# For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
|
223 |
+
raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
|
224 |
+
else:
|
225 |
+
# Should not happen based on documentation, but handle defensively
|
226 |
+
raise Exception(f"Unexpected Textract job status: {job_status}")
|
227 |
+
|
228 |
+
return job_status
|
229 |
+
|
230 + def download_textract_job_files(s3_client:str,
231 +                                 s3_bucket_name:str,
232 +                                 s3_output_key_prefix:str,
233 +                                 pdf_filename:str,
234 +                                 job_id:str,
235 +                                 local_output_dir:str):
236 +     list_response = s3_client.list_objects_v2(
237 +         Bucket=s3_bucket_name,
238 +         Prefix=s3_output_key_prefix
239 +     )
240 +
241 +     output_files = list_response.get('Contents', [])
242 +     if not output_files:
243 +         # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
244 +         #logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
245 +         #time.sleep(5)
246           list_response = s3_client.list_objects_v2(
247               Bucket=s3_bucket_name,
248               Prefix=s3_output_key_prefix
249           )
250           output_files = list_response.get('Contents', [])
251
252 +     if not output_files:
253 +         logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
254 +         # You could alternatively try getting results via get_document_analysis pagination here
255 +         # but sticking to the request to download from S3 output path.
256 +         raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
257 +
258 +     # Usually, we only need the first/main JSON output file(s)
259 +     # For simplicity, download the first one found. A more complex scenario might merge multiple files.
260 +     # Filter out potential directory markers if any key ends with '/'
261 +     json_files_to_download = [
262 +         f for f in output_files
263 +         if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/') and 'access_check' not in f['Key']
264 +     ]
265 +
266 +     #print("json_files_to_download:", json_files_to_download)
267 +
268 +     if not json_files_to_download:
269 +         error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
270 +         print(error)
271 +         #logging.error(error)
272 +         raise FileNotFoundError(error)
273 +
274 +     combined_blocks = []
275 +
276 +     for f in sorted(json_files_to_download, key=lambda x: x['Key']): # Optional: sort to ensure consistent order
277 +         obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f['Key'])
278 +         data = json.loads(obj['Body'].read())
279 +
280 +         # Assuming Textract-style output with a "Blocks" key
281 +         if "Blocks" in data:
282 +             combined_blocks.extend(data["Blocks"])
283 +         else:
284 +             logging.warning(f"No 'Blocks' key in file: {f['Key']}")
285 +
286 +     # Build final combined JSON structure
287 +     combined_output = {
288 +         "DocumentMetadata": {
289 +             "Pages": len(set(block.get('Page', 1) for block in combined_blocks))
290 +         },
291 +         "Blocks": combined_blocks,
292 +         "JobStatus": "SUCCEEDED"
293 +     }
294 +
295 +     output_filename_base = os.path.basename(pdf_filename)
296 +     output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
297 +     local_output_filename = f"{output_filename_base_no_ext}_textract.json"
298 +     local_output_path = os.path.join(local_output_dir, local_output_filename)
299 +
300 +     with open(local_output_path, 'w') as f:
301 +         json.dump(combined_output, f)
302 +
303 +     print(f"Combined Textract output written to {local_output_path}")
304 +
305 +     # logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
306 +     # s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
307 +     # logging.info("Download successful.")
308 +     downloaded_file_path = local_output_path
309 +
310 +     # Log if multiple files were found, as user might need to handle them
311 +     #if len(json_files_to_download) > 1:
312 +     #    logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
313
314   return downloaded_file_path
315
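The merge step above assumes each S3 part file carries a "Blocks" list. An alternative, if you would rather not read the S3 output at all, is to page through the Textract API directly with NextToken; a hedged sketch, with the client setup omitted and job_id assumed valid:

def fetch_all_blocks(textract_client, job_id: str) -> list:
    """Collect Blocks from every page of get_document_analysis results."""
    blocks, next_token = [], None
    while True:
        kwargs = {"JobId": job_id, "MaxResults": 1000}
        if next_token:
            kwargs["NextToken"] = next_token
        response = textract_client.get_document_analysis(**kwargs)
        blocks.extend(response.get("Blocks", []))
        next_token = response.get("NextToken")
        if not next_token:
            return blocks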
316 + def check_for_provided_job_id(job_id:str):
317 +     if not job_id:
318 +         raise Exception("Please provide a job ID.")
319 +     return
320
321 + def poll_bulk_textract_analysis_progress_and_download(
322 +     job_id:str,
323 +     job_type_dropdown:str,
324 +     s3_output_prefix: str,
325 +     pdf_filename:str,
326 +     job_df:pd.DataFrame,
327 +     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
328 +     local_output_dir: str = OUTPUT_FOLDER,
329 +     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
330 +     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
331 +     aws_region: str = AWS_REGION, # Optional: specify region if not default
332 +     poll_interval_seconds: int = 1,
333 +     max_polling_attempts: int = 1 # ~10 minutes total wait time
334 + ):
335
336 +     if job_id:
337 +         # Initialize boto3 clients
338 +         session = boto3.Session(region_name=aws_region)
339 +         s3_client = session.client('s3')
340 +         textract_client = session.client('textract')
341
342 +         # --- 3. Poll for Job Completion ---
343 +         job_status = 'IN_PROGRESS'
344 +         attempts = 0
345
346 +         message = "Polling Textract for job completion status..."
347 +         print(message)
348 +         #logging.info("Polling Textract for job completion status...")
349
350 +         # Update Textract document history df
351 +         try:
352 +             job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
353 +                                                   load_s3_jobs_loc=load_s3_jobs_loc,
354 +                                                   load_local_jobs_loc=load_local_jobs_loc)
355 +         except Exception as e:
356 +             #logging.error(f"Failed to update job details dataframe: {e}")
357 +             print(f"Failed to update job details dataframe: {e}")
358 +             #raise
359 +
360 +         while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
361 +             attempts += 1
362 +             try:
363 +                 if job_type_dropdown=="document_analysis":
364 +                     response = textract_client.get_document_analysis(JobId=job_id)
365 +                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
366 +                 elif job_type_dropdown=="document_text_detection":
367 +                     response = textract_client.get_document_text_detection(JobId=job_id)
368 +                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
369 +                 else:
370 +                     error = "Unknown job type, cannot poll job"
371 +                     print(error)
372 +                     #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
373 +                     raise
374 +
375 +             except textract_client.exceptions.InvalidJobIdException:
376 +                 error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed."
377 +                 print(error_message)
378 +                 logging.error(error_message)
379 +                 raise
380 +             except Exception as e:
381 +                 error_message = f"Error while polling Textract status for job {job_id}: {e}"
382 +                 print(error_message)
383 +                 logging.error(error_message)
384 +                 raise
385 +
386 +         downloaded_file_path = None
387 +         if job_status == 'SUCCEEDED':
388 +             #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
389 +             # 3b - Replace PDF file name if it exists in the job dataframe
390 +
391 +             # If job_df is not empty
392 +             if not job_df.empty:
393 +                 if "file_name" in job_df.columns:
394 +                     matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
395 +
396 +                     if pdf_filename and not matching_job_id_file_names.empty:
397 +                         if pdf_filename == matching_job_id_file_names.iloc[0]:
398 +                             raise Exception("Existing Textract outputs found. No need to re-download.")
399 +
400 +                     if not matching_job_id_file_names.empty:
401 +                         pdf_filename = matching_job_id_file_names.iloc[0]
402 +                     else:
403 +                         pdf_filename = "unknown_file"
404 +
405 +
406 +             # --- 4. Download Output JSON from S3 ---
407 +             # Textract typically creates output under s3_output_prefix/job_id/
408 +             # There might be multiple JSON files if pagination occurred during writing.
409 +             # Usually, for smaller docs, there's one file, often named '1'.
410 +             # For robust handling, list objects and find the JSON(s).
411 +
412 +
413 +             s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
414 +             logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
415 +
416 +             try:
417 +                 downloaded_file_path = download_textract_job_files(s3_client,
418 +                                                                    s3_bucket_name,
419 +                                                                    s3_output_key_prefix,
420 +                                                                    pdf_filename,
421 +                                                                    job_id,
422 +                                                                    local_output_dir)
423 +
424 +             except Exception as e:
425 +                 #logging.error(f"Failed to download or process Textract output from S3: {e}")
426 +                 print(f"Failed to download or process Textract output from S3: {e}")
427 +                 raise
428 +
429 +     else:
430 +         raise Exception("No Job ID provided.")
431 +
432 +     return downloaded_file_path, job_status, job_df
433
434
435
+
def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
|
437 |
+
load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
|
438 |
+
load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
|
439 |
+
document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
|
440 |
+
aws_region:str=AWS_REGION):
|
441 |
+
|
442 |
+
job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
|
443 |
|
444 |
+
# Initialize boto3 clients
|
445 |
+
session = boto3.Session(region_name=aws_region)
|
446 |
+
s3_client = session.client('s3')
|
|
|
447 |
|
448 |
+
local_output_path = f'{load_local_jobs_loc}/textract_job_log_files.csv'
|
|
|
|
|
449 |
|
450 |
+
if load_s3_jobs == 'True':
|
451 |
+
|
452 |
+
s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
|
453 |
+
|
454 |
+
try:
|
455 |
+
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
456 |
+
print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
|
457 |
+
s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
|
458 |
+
print("Download successful.")
|
459 |
+
except ClientError as e:
|
460 |
+
if e.response['Error']['Code'] == '404':
|
461 |
+
print("Log file does not exist in S3.")
|
462 |
+
else:
|
463 |
+
print(f"Unexpected error occurred: {e}")
|
464 |
+
except (NoCredentialsError, PartialCredentialsError, TokenRetrievalError) as e:
|
465 |
+
print(f"AWS credential issue encountered: {e}")
|
466 |
+
print("Skipping S3 log file download.")
|
467 |
+
|
468 |
+
# If the log path exists, load it in
|
469 |
+
if os.path.exists(local_output_path):
|
470 |
+
print("Found log file in local path")
|
471 |
+
job_df = pd.read_csv(local_output_path)
|
472 |
+
|
473 |
+
if "job_date_time" in job_df.columns:
|
474 |
+
job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
|
475 |
+
# Keep only jobs that have been completed in the last 7 days
|
476 |
+
cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
|
477 |
+
job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time,:]
|
478 |
+
|
479 |
+
return job_df
|
480 |
+
|
481 |
+
|
482 |
+
def download_textract_output(job_id:str,
|
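The 7-day cutoff mirrors Textract's own retention: results for an asynchronous job are only retrievable for about a week, so older log rows are no longer useful. A self-contained check of that filter, with invented dates:

import pandas as pd

log = pd.DataFrame({
    "job_id": ["old", "recent"],
    "job_date_time": [pd.Timestamp.now() - pd.Timedelta(days=10),
                      pd.Timestamp.now() - pd.Timedelta(days=2)],
})
cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
print(log.loc[log["job_date_time"] >= cutoff_time, "job_id"].tolist())  # ['recent']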
483 |
+
output_bucket:str,
|
484 |
+
output_prefix:str,
|
485 |
+
local_folder:str):
|
486 |
"""
|
487 |
Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
|
488 |
|
|
|
506 |
print("Job failed:", response.get("StatusMessage", "No error message provided."))
|
507 |
return
|
508 |
else:
|
509 |
+
print(f"Job is still {status}.")
|
510 |
+
#time.sleep(10) # Wait before checking again
|
511 |
|
512 |
# Find output ZIP file in S3
|
513 |
output_file_key = f"{output_prefix}/{job_id}.zip"
|
|
|
519 |
print(f"Output file downloaded to: {local_file_path}")
|
520 |
except Exception as e:
|
521 |
print(f"Error downloading file: {e}")
|
|
|
|
|
|