seanpedrickcase committed
Commit 339a165 • Parent(s): 84c83c0

Redaction tool can now export pdfs with selectable text retained - redacted text is deleted and covered with a black box. Licence change for pymupdf use.
Files changed:
- README.md +1 -1
- app.py +1 -2
- tools/custom_image_analyser_engine.py +6 -6
- tools/file_redaction.py +253 -82
- tools/load_spacy_model_custom_recognisers.py +2 -1
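The central change is that redactions are now applied with PyMuPDF, which deletes the matched text and covers the area with a black box, so the rest of the PDF keeps its selectable text instead of being rasterised. A minimal sketch of that technique, with a made-up file name and rectangle:

import pymupdf
from pymupdf import Rect

doc = pymupdf.open("example.pdf")    # hypothetical input file
page = doc.load_page(0)

pii_rect = Rect(100, 200, 250, 215)  # hypothetical box around some PII

# A redaction annotation deletes the text underneath it...
page.add_redact_annot(pii_rect)
page.apply_redactions()

# ...and a filled shape then covers the area with a black box.
shape = page.new_shape()
shape.draw_rect(pii_rect)
shape.finish(color=(0, 0, 0), fill=(0, 0, 0))
shape.commit()

doc.save("example_redacted.pdf")

In the commit itself (see redact_page_with_pymupdf in tools/file_redaction.py below), the redaction annotation is shrunk to a one-pixel-high strip at the vertical midpoint of each line before apply_redactions is called, so that text on adjacent lines is not deleted, while the black box is still drawn over the full rectangle.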
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: green
 sdk: docker
 app_file: app.py
 pinned: false
-license:
+license: agpl-3.0
 ---

 # Document redaction
app.py CHANGED
@@ -67,8 +67,7 @@ with app:
     doc_file_name_textbox = gr.Textbox(value="", visible=False)
     data_file_name_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-    estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
-
+    estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.

     ###
     # UI DESIGN
tools/custom_image_analyser_engine.py CHANGED
@@ -498,12 +498,12 @@ class CustomImageAnalyzerEngine:
     total_width = 0 # Initialize total width

     for word_text in relevant_text.split(): # Iterate through each word in relevant_text
-        print("Looking for word_text:", word_text)
+        #print("Looking for word_text:", word_text)
         for word in child_words:
             #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
             if word_text in word['text']:
                 found_word = word
-                print("found_word:", found_word)
+                #print("found_word:", found_word)

                 if word_num == 0: # First word
                     left = found_word['bounding_box'][0]
@@ -535,8 +535,8 @@ class CustomImageAnalyzerEngine:
     result_reset_pos.start = 0
     result_reset_pos.end = len(relevant_text)

-    print("result_reset_pos:", result_reset_pos)
-    print("relevant_line_ocr_result:", relevant_line_ocr_result)
+    #print("result_reset_pos:", result_reset_pos)
+    #print("relevant_line_ocr_result:", relevant_line_ocr_result)
     #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

     # Map the analyzer results to bounding boxes for this line
@@ -544,7 +544,7 @@ class CustomImageAnalyzerEngine:
         [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
     )

-    print("line_results:", line_results)
+    #print("line_results:", line_results)

     combined_results.extend(line_results)

@@ -581,7 +581,7 @@ class CustomImageAnalyzerEngine:

     #print("child_info in sub function:", child_info)
     #print("redaction_result_bounding_box:", redaction_result_bounding_box)
-    print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))
+    #print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))

     if bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']):
         # Use the bounding box from ocr_results_with_children
tools/file_redaction.py CHANGED
@@ -11,9 +11,14 @@ import pandas as pd
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
+import pymupdf
+from pymupdf import Rect
+
 import gradio as gr
 from gradio import Progress

+from typing import Tuple
+
 from collections import defaultdict # For efficient grouping

 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
@@ -114,11 +119,17 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

     print("Redacting file " + file_path_without_ext + " as an image-based file")
+
     pdf_images, redaction_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

     # Save file
-
-
+    if is_pdf(file_path) == False:
+        out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+
+    else:
+        out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
+        pdf_images.save(out_image_file_path)

     out_file_paths.append(out_image_file_path)
     if logging_file_paths:
@@ -148,6 +159,8 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     latest_file_completed += 1

 elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+
+    print("file_path:", file_path)

     if is_pdf(file_path) == False:
         return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
@@ -155,27 +168,25 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     # Analyse text-based pdf
     print('Redacting file as text-based PDF')
     pdf_text, decision_process_logs, page_text_outputs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
+
     out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
-    pdf_text.save(out_text_file_path)
+    pdf_text.save(out_text_file_path)
+    out_file_paths.append(out_text_file_path)

     # Convert message
-    convert_message="Converting PDF to image-based PDF to embed redactions."
-    print(convert_message)
+    #convert_message="Converting PDF to image-based PDF to embed redactions."
+    #print(convert_message)

     # Convert document to image-based document to 'embed' redactions
-    img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
-    out_file_paths.extend(img_output_file_path)
-
-    #decision_process_logs_str = str(decision_process_logs)
-    #logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
-    #with open(logs_output_file_name, "w") as f:
-    #    f.write(output_logs_str)
+    #img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
+    #out_file_paths.extend(img_output_file_path)

-
-
-
+    # Write logs to file
+    decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
+    decision_process_logs.to_csv(decision_logs_output_file_name)
+    log_files_output_paths.append(decision_logs_output_file_name)

-    all_text_output_file_name =
+    all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
     page_text_outputs.to_csv(all_text_output_file_name)
     log_files_output_paths.append(all_text_output_file_name)

@@ -214,6 +225,69 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

+def redact_page_with_pymupdf(doc, annotations_on_page, page_no, scale=(1,1)):
+
+    page = doc.load_page(page_no)
+    page_height = max(page.rect.height, page.mediabox[3] - page.mediabox[1])
+
+    #print("page_rect_height:", page.rect.height)
+    #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
+
+    for annot in annotations_on_page:
+        if isinstance(annot, CustomImageRecognizerResult):
+            scale_width = scale[0]
+            scale_height = scale[1]
+
+            print("scale:", scale)
+
+            # Calculate scaled coordinates
+            x1 = annot.left * scale_width
+            new_y1 = (annot.top * scale_height) # Flip Y0 (since it starts from bottom)
+            x2 = (annot.left + annot.width) * scale_width # Calculate x1
+            new_y2 = ((annot.top + annot.height) * scale_height) # Calculate y1 correctly
+
+            rect = Rect(x1, new_y1, x2, new_y2) # Create the PyMuPDF Rect (y1, y0 are flipped)
+
+        else:
+            #print("In the pikepdf conversion function")
+            # Extract the /Rect field
+            rect_field = annot["/Rect"]
+
+            # Convert the extracted /Rect field to a list of floats (since pikepdf uses Decimal objects)
+            rect_coordinates = [float(coord) for coord in rect_field]
+
+            # Convert the Y-coordinates (flip using the page height)
+            x1, y1, x2, y2 = rect_coordinates
+            new_y1 = page_height - y2
+            new_y2 = page_height - y1
+
+            rect = Rect(x1, new_y1, x2, new_y2)

+            # Convert to a PyMuPDF Rect object
+            #rect = Rect(rect_coordinates)
+
+        # Calculate the middle y value and set height to 1 pixel
+        middle_y = (new_y1 + new_y2) / 2
+        rect_single_pixel_height = Rect(x1, middle_y, x2, middle_y + 1) # Height of 1 pixel
+
+        print("rect:", rect)
+        # Add a redaction annotation
+        #page.add_redact_annot(rect)
+
+        # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+        page.add_redact_annot(rect_single_pixel_height)
+
+        # Set up drawing a black box over the whole rect
+        shape = page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=(0, 0, 0), fill=(0, 0, 0)) # Black fill for the rectangle
+        shape.commit()
+
+    page.apply_redactions(images=0, graphics=0)
+    page.clean_contents()
+
+    return doc
+
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -328,6 +402,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     #request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

+    # Also open as pymupdf pdf to apply annotations later on
+    doc = pymupdf.open(file_path)
+
     if not image_paths:
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
@@ -403,6 +480,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     # Need image size to convert textract OCR outputs to the correct sizes
     page_width, page_height = image.size

+
+    # Get the dimensions of the page in points with pymupdf to get relative scale
+    page = doc.load_page(i)
+    mu_page_rect = page.rect
+    #mu_page_width = mu_page_rect.width
+    mu_page_height = max(mu_page_rect.height, page.mediabox[3] - page.mediabox[1])
+    mu_page_width = max(mu_page_rect.width, page.mediabox[2] - page.mediabox[0])
+    #mu_page_height = mu_page_rect.height
+
+    # Calculate scaling factors between PIL image and pymupdf
+    scale_width = mu_page_width / page_width
+    scale_height = mu_page_height / page_height
+
+    scale = (scale_width, scale_height)
+
+
     # Possibility to use different languages
     if language == 'en':
         ocr_lang = 'eng'
@@ -477,14 +570,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_


     # 3. Draw the merged boxes
-
+    if is_pdf(file_path) == False:
+        draw = ImageDraw.Draw(image)
+
+        for box in merged_redaction_bboxes:
+            x0 = box.left
+            y0 = box.top
+            x1 = x0 + box.width
+            y1 = y0 + box.height
+            draw.rectangle([x0, y0, x1, y1], fill=fill)

-
-
-
-
-
-
+
+    ## Apply annotations with pymupdf
+    else:
+        doc = redact_page_with_pymupdf(doc, merged_redaction_bboxes, i, scale)
+
+    #doc.save("image_redact.pdf")

     # Log OCR results

@@ -527,7 +628,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

     all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

-
+    if is_pdf(file_path) == False:
+        images.append(image)
+        doc = images

     # Write OCR results as a log file
     # line_level_ocr_results_out = "\n".join(all_ocr_results)
@@ -537,33 +640,47 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
     logging_file_paths.append(ocr_results_file_path)

-    return
+    return doc, all_decision_process_table, logging_file_paths, request_metadata
+
+def get_text_container_characters(text_container:LTTextContainer):

-def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
     if isinstance(text_container, LTTextContainer):
-        text_to_analyze = text_container.get_text()
-
-        analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
-                                                language=language,
-                                                entities=chosen_redact_entities,
-                                                score_threshold=score_threshold,
-                                                return_decision_process=True,
-                                                allow_list=allow_list)
-
-        #print("\ntext_container:", text_container)
         characters = [char
-
-
-
-
-        return
-        return []
+                      for line in text_container
+                      if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
+                      for char in line]
+
+        return characters
+    return []
+

-def
+def analyze_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
+    '''
+    Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
+    '''
+
+    text_to_analyze = text_container.text
+    #print("text_to_analyze:", text_to_analyze)
+
+    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
+                                            language=language,
+                                            entities=chosen_redact_entities,
+                                            score_threshold=score_threshold,
+                                            return_decision_process=True,
+                                            allow_list=allow_list)
+    return analyzer_results
+
+
+def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
     '''
     Create an OCRResult object based on a list of pdfminer LTChar objects.
     '''

+    line_level_results_out = []
+    line_level_characters_out = []
+    #all_line_level_characters_out = []
+    character_objects_out = [] # New list to store character objects
+
     # Initialize variables
     full_text = ""
     overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
@@ -574,6 +691,8 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
     current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]

     for char in char_objects:
+        character_objects_out.append(char) # Collect character objects
+
         if isinstance(char, LTAnno):
             # Handle space separately by finalizing the word
             full_text += char.get_text() # Adds space or newline
@@ -581,6 +700,23 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
             word_bboxes.append((current_word, current_word_bbox))
             current_word = ""
             current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
+
+            # Check for line break (assuming a new line is indicated by a specific character)
+            if '\n' in char.get_text():
+                #print("char_anno:", char)
+                # Finalize the current line
+                if current_word:
+                    word_bboxes.append((current_word, current_word_bbox))
+                # Create an OCRResult for the current line
+                line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
+                line_level_characters_out.append(character_objects_out)
+                # Reset for the next line
+                character_objects_out = []
+                full_text = ""
+                overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
+                current_word = ""
+                current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
+
             continue

         # Concatenate text for LTChar
@@ -602,13 +738,18 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
     current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
     current_word_bbox[3] = max(current_word_bbox[3], y1) # y1

+
     # Finalize the last word if any
     if current_word:
         word_bboxes.append((current_word, current_word_bbox))

-
+    if full_text:
+        line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
+
+
+    return line_level_results_out, line_level_characters_out # Return both results and character objects

-def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=
+def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
@@ -653,13 +794,13 @@ def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, char
     vertical_diff_bboxes = abs(char_box[1] - current_y)
     horizontal_diff_bboxes = abs(char_box[0] - current_box[2])

-    #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}")
+    #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}, current_text={current_text}, char_text={text}")
     #print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")

     if (
-        vertical_diff_bboxes <= 5
-        and horizontal_diff_bboxes <= combine_pixel_dist
+        vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
     ):
+        #print("box is being extended")
         current_box[2] = char_box[2] # Extend the current box horizontally
         current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
         current_result.end = max(current_result.end, result.end) # Extend the text range
@@ -710,7 +851,7 @@ def create_text_redaction_process_results(analyzer_results, analyzed_bounding_bo
     analyzed_bounding_boxes_df_new['page'] = page_num + 1
     decision_process_table = pd.concat([decision_process_table, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)

-    print('\n\ndecision_process_table:\n\n', decision_process_table)
+    #print('\n\ndecision_process_table:\n\n', decision_process_table)

     return decision_process_table

@@ -741,11 +882,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
+    page_text_outputs_all_pages = pd.DataFrame()
     decision_process_table_all_pages = pd.DataFrame()

-    combine_pixel_dist =
+    combine_pixel_dist = 20 # Horizontal distance between PII bounding boxes under/equal they are combined into one

+    # Open with Pikepdf to get text lines
     pdf = Pdf.open(filename)
+    # Also open pdf with pymupdf to be able to annotate later while retaining text
+    doc = pymupdf.open(filename)
     page_num = 0

     number_of_pages = len(pdf.pages)
@@ -773,30 +918,31 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     #page_width = media_box[2] - media_box[0]
     #page_height = media_box[3] - media_box[1]

-
-    annotations_on_page = []
-    decision_process_table_on_page = pd.DataFrame()
-
     for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

         page_analyzer_results = []
-        page_analyzed_bounding_boxes = []
-
-        text_container_analyzed_bounding_boxes = []
+        page_analyzed_bounding_boxes = []
+
         characters = []
-
+        annotations_on_page = []
+        decision_process_table_on_page = pd.DataFrame()
+        page_text_outputs = pd.DataFrame()

         if analysis_type == "Simple text analysis - PDFs with selectable text":
-            for
+            for text_container in page_layout:
+
+                text_container_analyzer_results = []
+                text_container_analyzed_bounding_boxes = []
+
+                characters = get_text_container_characters(text_container)

-                text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
-
                 # Create dataframe for all the text on the page
-
+                line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)

-
-                line_level_text_results_list = [line_level_text_results]
+                print("line_characters:", line_characters)

+                # Create page_text_outputs (OCR format outputs)
+                if line_level_text_results_list:
                     # Convert to DataFrame and add to ongoing logging table
                     line_level_text_results_df = pd.DataFrame([{
                         'page': page_no + 1,
@@ -809,33 +955,58 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]

                     page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])

-                #
-
+                # Analyse each line of text in turn for PII and add to list
+                for i, text_line in enumerate(line_level_text_results_list):
+                    text_line_analyzer_result = []
+                    text_line_bounding_boxes = []
+
+                    #print("text_line:", text_line.text)
+
+                    text_line_analyzer_result = analyze_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
+
+                    # Merge bounding boxes for the line if multiple found close together
+                    if text_line_analyzer_result:
+                        # Merge bounding boxes if very close together
+                        print("text_line_bounding_boxes:", text_line_bounding_boxes)
+                        print("line_characters:")
+                        #print(line_characters[i])
+                        print("".join(char._text for char in line_characters[i]))
+                        text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyzer_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
+
+                    text_container_analyzer_results.extend(text_line_analyzer_result)
+                    text_container_analyzed_bounding_boxes.extend(text_line_bounding_boxes)
+
+                print("\n FINAL text_container_analyzer_results:", text_container_analyzer_results)


         page_analyzer_results.extend(text_container_analyzer_results)
         page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)

-
-        print("page_analyzer_results:", page_analyzer_results)
-        print("page_analyzed_bounding_boxes:", page_analyzed_bounding_boxes)
-
-        decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
+

+        # Annotate redactions on page
         annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
-
+
+        # Make pymupdf redactions
+        doc = redact_page_with_pymupdf(doc, annotations_on_page, page_no)

         # Make page annotations
-        page.Annots = pdf.make_indirect(annotations_on_page)
-
+        #page.Annots = pdf.make_indirect(annotations_on_page)
+        if annotations_on_page:
+            annotations_all_pages.extend([annotations_on_page])

-        decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
-
-        page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
-        #page_text_outputs.to_csv("text_page_text_outputs.csv")
-
         print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
-
-        #page_num += 1

-
+        # Write logs
+        # Create decision process table
+        decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
+
+        if not decision_process_table_on_page.empty:
+            decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
+
+        if not page_text_outputs.empty:
+            page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
+            #page_text_outputs.to_csv("text_page_text_outputs.csv")
+            page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
+
+    return doc, decision_process_table_all_pages, page_text_outputs_all_pages
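A note on the coordinate handling in redact_page_with_pymupdf: pikepdf /Rect values use PDF's bottom-left origin, while PyMuPDF rectangles are measured from the top-left, so the y-values are flipped against the page height. A worked example with invented numbers:

from pymupdf import Rect

page_height = 842.0                            # A4 height in points (illustrative)
x1, y1, x2, y2 = 100.0, 700.0, 250.0, 715.0    # hypothetical pikepdf /Rect, bottom-left origin

# A strip 700-715pt up from the bottom sits 127-142pt down from the top
new_y1 = page_height - y2                      # 127.0
new_y2 = page_height - y1                      # 142.0

rect = Rect(x1, new_y1, x2, new_y2)            # Rect(100.0, 127.0, 250.0, 142.0)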
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -19,7 +19,8 @@ score_threshold = 0.001
 titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
 titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
 titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
-titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern]
+titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern],
+                                      global_regex_flags=re.DOTALL | re.MULTILINE)

 # %%
 # Custom postcode recogniser
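With global_regex_flags set to re.DOTALL | re.MULTILINE, the titles regex keeps matching across line boundaries in extracted text. A small usage sketch (the pattern and sample text below are shortened inventions, not the commit's full titles list):

import re
from presidio_analyzer import Pattern, PatternRecognizer

titles_pattern = Pattern(name="titles_pattern", regex=r"\bMr\b|\bDr\b", score=1)
titles_recogniser = PatternRecognizer(supported_entity="TITLES",
                                      patterns=[titles_pattern],
                                      global_regex_flags=re.DOTALL | re.MULTILINE)

results = titles_recogniser.analyze("Dear Dr\nSmith", entities=["TITLES"])
print(results)  # one TITLES result covering "Dr"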
|