seanpedrickcase committed
Commit 339a165
1 Parent(s): 84c83c0

Redaction tool can now export PDFs with selectable text retained: redacted text is deleted and covered with a black box. Licence changed from MIT to AGPL-3.0 to reflect the use of pymupdf.
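
In outline, the new approach (implemented in redact_page_with_pymupdf in tools/file_redaction.py below) adds a PyMuPDF redaction annotation over each flagged span, applies the redactions to delete the covered text, and then draws a filled black rectangle over the area, so the rest of the page keeps its selectable text. A minimal sketch of the same idea, assuming a pre-computed list of pymupdf.Rect boxes (the file paths and box coordinates here are illustrative):

import pymupdf
from pymupdf import Rect

doc = pymupdf.open("example.pdf")        # illustrative input path
pii_boxes = [Rect(100, 700, 250, 715)]   # pre-computed PII bounding boxes (illustrative)

page = doc.load_page(0)
for rect in pii_boxes:
    # A redaction annotation deletes the text it covers once applied
    page.add_redact_annot(rect)
page.apply_redactions()                  # remove the flagged text from the page content

for rect in pii_boxes:
    # Cover the now-empty area with a black box; the surrounding text stays selectable
    shape = page.new_shape()
    shape.draw_rect(rect)
    shape.finish(color=(0, 0, 0), fill=(0, 0, 0))
    shape.commit()

doc.save("example_redacted.pdf")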

README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: green
 sdk: docker
 app_file: app.py
 pinned: false
-license: mit
+license: agpl-3.0
 ---

 # Document redaction
app.py CHANGED
@@ -67,8 +67,7 @@ with app:
     doc_file_name_textbox = gr.Textbox(value="", visible=False)
     data_file_name_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-    estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
-
+    estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.

     ###
     # UI DESIGN
tools/custom_image_analyser_engine.py CHANGED
@@ -498,12 +498,12 @@ class CustomImageAnalyzerEngine:
         total_width = 0 # Initialize total width

         for word_text in relevant_text.split(): # Iterate through each word in relevant_text
-            print("Looking for word_text:", word_text)
+            #print("Looking for word_text:", word_text)
            for word in child_words:
                #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
                if word_text in word['text']:
                    found_word = word
-                    print("found_word:", found_word)
+                    #print("found_word:", found_word)

                    if word_num == 0: # First word
                        left = found_word['bounding_box'][0]
@@ -535,8 +535,8 @@ class CustomImageAnalyzerEngine:
         result_reset_pos.start = 0
         result_reset_pos.end = len(relevant_text)

-        print("result_reset_pos:", result_reset_pos)
-        print("relevant_line_ocr_result:", relevant_line_ocr_result)
+        #print("result_reset_pos:", result_reset_pos)
+        #print("relevant_line_ocr_result:", relevant_line_ocr_result)
         #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

         # Map the analyzer results to bounding boxes for this line
@@ -544,7 +544,7 @@ class CustomImageAnalyzerEngine:
             [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
         )

-        print("line_results:", line_results)
+        #print("line_results:", line_results)

         combined_results.extend(line_results)

@@ -581,7 +581,7 @@ class CustomImageAnalyzerEngine:

         #print("child_info in sub function:", child_info)
         #print("redaction_result_bounding_box:", redaction_result_bounding_box)
-        print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))
+        #print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))

         if bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']):
             # Use the bounding box from ocr_results_with_children
tools/file_redaction.py CHANGED
@@ -11,9 +11,14 @@ import pandas as pd
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
+import pymupdf
+from pymupdf import Rect
+
 import gradio as gr
 from gradio import Progress

+from typing import Tuple
+
 from collections import defaultdict # For efficient grouping

 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
@@ -114,11 +119,17 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

         print("Redacting file " + file_path_without_ext + " as an image-based file")
+
         pdf_images, redaction_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

         # Save file
-        out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
-        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+        if is_pdf(file_path) == False:
+            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+
+        else:
+            out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
+            pdf_images.save(out_image_file_path)

         out_file_paths.append(out_image_file_path)
         if logging_file_paths:
@@ -148,6 +159,8 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         latest_file_completed += 1

     elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+
+        print("file_path:", file_path)

         if is_pdf(file_path) == False:
             return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
@@ -155,27 +168,25 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         # Analyse text-based pdf
         print('Redacting file as text-based PDF')
         pdf_text, decision_process_logs, page_text_outputs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
+
         out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
-        pdf_text.save(out_text_file_path)
+        pdf_text.save(out_text_file_path)
+        out_file_paths.append(out_text_file_path)

         # Convert message
-        convert_message="Converting PDF to image-based PDF to embed redactions."
-        print(convert_message)
+        #convert_message="Converting PDF to image-based PDF to embed redactions."
+        #print(convert_message)

         # Convert document to image-based document to 'embed' redactions
-        img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
-        out_file_paths.extend(img_output_file_path)
-
-        #decision_process_logs_str = str(decision_process_logs)
-        #logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
-        #with open(logs_output_file_name, "w") as f:
-        #    f.write(output_logs_str)
+        #img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
+        #out_file_paths.extend(img_output_file_path)

-        logs_output_file_name = img_output_file_path[0] + "_decision_process_output.csv"
-        decision_process_logs.to_csv(logs_output_file_name)
-        log_files_output_paths.append(logs_output_file_name)
+        # Write logs to file
+        decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
+        decision_process_logs.to_csv(decision_logs_output_file_name)
+        log_files_output_paths.append(decision_logs_output_file_name)

-        all_text_output_file_name = img_output_file_path[0] + "_all_text_output.csv"
+        all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
         page_text_outputs.to_csv(all_text_output_file_name)
         log_files_output_paths.append(all_text_output_file_name)

@@ -214,6 +225,69 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

+def redact_page_with_pymupdf(doc, annotations_on_page, page_no, scale=(1,1)):
+
+    page = doc.load_page(page_no)
+    page_height = max(page.rect.height, page.mediabox[3] - page.mediabox[1])
+
+    #print("page_rect_height:", page.rect.height)
+    #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
+
+    for annot in annotations_on_page:
+        if isinstance(annot, CustomImageRecognizerResult):
+            scale_width = scale[0]
+            scale_height = scale[1]
+
+            print("scale:", scale)
+
+            # Calculate scaled coordinates
+            x1 = annot.left * scale_width
+            new_y1 = (annot.top * scale_height) # Flip Y0 (since it starts from bottom)
+            x2 = (annot.left + annot.width) * scale_width # Calculate x1
+            new_y2 = ((annot.top + annot.height) * scale_height) # Calculate y1 correctly
+
+            rect = Rect(x1, new_y1, x2, new_y2) # Create the PyMuPDF Rect (y1, y0 are flipped)
+
+        else:
+            #print("In the pikepdf conversion function")
+            # Extract the /Rect field
+            rect_field = annot["/Rect"]
+
+            # Convert the extracted /Rect field to a list of floats (since pikepdf uses Decimal objects)
+            rect_coordinates = [float(coord) for coord in rect_field]
+
+            # Convert the Y-coordinates (flip using the page height)
+            x1, y1, x2, y2 = rect_coordinates
+            new_y1 = page_height - y2
+            new_y2 = page_height - y1
+
+            rect = Rect(x1, new_y1, x2, new_y2)
+
+            # Convert to a PyMuPDF Rect object
+            #rect = Rect(rect_coordinates)
+
+        # Calculate the middle y value and set height to 1 pixel
+        middle_y = (new_y1 + new_y2) / 2
+        rect_single_pixel_height = Rect(x1, middle_y, x2, middle_y + 1) # Height of 1 pixel
+
+        print("rect:", rect)
+        # Add a redaction annotation
+        #page.add_redact_annot(rect)
+
+        # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+        page.add_redact_annot(rect_single_pixel_height)
+
+        # Set up drawing a black box over the whole rect
+        shape = page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=(0, 0, 0), fill=(0, 0, 0)) # Black fill for the rectangle
+        shape.commit()
+
+    page.apply_redactions(images=0, graphics=0)
+    page.clean_contents()
+
+    return doc
+
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -328,6 +402,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     #request_metadata = {}
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

+    # Also open as pymupdf pdf to apply annotations later on
+    doc = pymupdf.open(file_path)
+
     if not image_paths:
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
@@ -403,6 +480,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         # Need image size to convert textract OCR outputs to the correct sizes
         page_width, page_height = image.size

+
+        # Get the dimensions of the page in points with pymupdf to get relative scale
+        page = doc.load_page(i)
+        mu_page_rect = page.rect
+        #mu_page_width = mu_page_rect.width
+        mu_page_height = max(mu_page_rect.height, page.mediabox[3] - page.mediabox[1])
+        mu_page_width = max(mu_page_rect.width, page.mediabox[2] - page.mediabox[0])
+        #mu_page_height = mu_page_rect.height
+
+        # Calculate scaling factors between PIL image and pymupdf
+        scale_width = mu_page_width / page_width
+        scale_height = mu_page_height / page_height
+
+        scale = (scale_width, scale_height)
+
+
         # Possibility to use different languages
         if language == 'en':
             ocr_lang = 'eng'
@@ -477,14 +570,22 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_


         # 3. Draw the merged boxes
-        draw = ImageDraw.Draw(image)
+        if is_pdf(file_path) == False:
+            draw = ImageDraw.Draw(image)
+
+            for box in merged_redaction_bboxes:
+                x0 = box.left
+                y0 = box.top
+                x1 = x0 + box.width
+                y1 = y0 + box.height
+                draw.rectangle([x0, y0, x1, y1], fill=fill)

-        for box in merged_redaction_bboxes:
-            x0 = box.left
-            y0 = box.top
-            x1 = x0 + box.width
-            y1 = y0 + box.height
-            draw.rectangle([x0, y0, x1, y1], fill=fill)
+
+        ## Apply annotations with pymupdf
+        else:
+            doc = redact_page_with_pymupdf(doc, merged_redaction_bboxes, i, scale)
+
+        #doc.save("image_redact.pdf")

         # Log OCR results

@@ -527,7 +628,9 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

         all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

-        images.append(image)
+        if is_pdf(file_path) == False:
+            images.append(image)
+            doc = images

     # Write OCR results as a log file
     # line_level_ocr_results_out = "\n".join(all_ocr_results)
@@ -537,33 +640,47 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
     all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
     logging_file_paths.append(ocr_results_file_path)

-    return images, all_decision_process_table, logging_file_paths, request_metadata
+    return doc, all_decision_process_table, logging_file_paths, request_metadata
+
+def get_text_container_characters(text_container:LTTextContainer):

-def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
     if isinstance(text_container, LTTextContainer):
-        text_to_analyze = text_container.get_text()
-
-        analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
-                                                language=language,
-                                                entities=chosen_redact_entities,
-                                                score_threshold=score_threshold,
-                                                return_decision_process=True,
-                                                allow_list=allow_list)
-
-        #print("\ntext_container:", text_container)
         characters = [char
-                      for line in text_container
-                      if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
-                      for char in line]
-
-        return analyzer_results, characters
-    return [], []
+                      for line in text_container
+                      if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
+                      for char in line]
+
+        return characters
+    return []
+

-def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCRResult:
+def analyze_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
+    '''
+    Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
+    '''
+
+    text_to_analyze = text_container.text
+    #print("text_to_analyze:", text_to_analyze)
+
+    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
+                                            language=language,
+                                            entities=chosen_redact_entities,
+                                            score_threshold=score_threshold,
+                                            return_decision_process=True,
+                                            allow_list=allow_list)
+    return analyzer_results
+
+
+def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
     '''
     Create an OCRResult object based on a list of pdfminer LTChar objects.
     '''

+    line_level_results_out = []
+    line_level_characters_out = []
+    #all_line_level_characters_out = []
+    character_objects_out = [] # New list to store character objects
+
     # Initialize variables
     full_text = ""
     overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
@@ -574,6 +691,8 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
     current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]

     for char in char_objects:
+        character_objects_out.append(char) # Collect character objects
+
         if isinstance(char, LTAnno):
             # Handle space separately by finalizing the word
             full_text += char.get_text() # Adds space or newline
@@ -581,6 +700,23 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
                 word_bboxes.append((current_word, current_word_bbox))
                 current_word = ""
                 current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
+
+            # Check for line break (assuming a new line is indicated by a specific character)
+            if '\n' in char.get_text():
+                #print("char_anno:", char)
+                # Finalize the current line
+                if current_word:
+                    word_bboxes.append((current_word, current_word_bbox))
+                # Create an OCRResult for the current line
+                line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
+                line_level_characters_out.append(character_objects_out)
+                # Reset for the next line
+                character_objects_out = []
+                full_text = ""
+                overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
+                current_word = ""
+                current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
+
             continue

         # Concatenate text for LTChar
@@ -602,13 +738,18 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCR
         current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
         current_word_bbox[3] = max(current_word_bbox[3], y1) # y1

+
     # Finalize the last word if any
     if current_word:
         word_bboxes.append((current_word, current_word_bbox))

-    return OCRResult(full_text, overall_bbox[0], overall_bbox[1], overall_bbox[2], overall_bbox[3])
+    if full_text:
+        line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
+
+
+    return line_level_results_out, line_level_characters_out # Return both results and character objects

-def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=2):
+def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
@@ -653,13 +794,13 @@ def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, char
                 vertical_diff_bboxes = abs(char_box[1] - current_y)
                 horizontal_diff_bboxes = abs(char_box[0] - current_box[2])

-                #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}")
+                #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}, current_text={current_text}, char_text={text}")
                 #print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")

                 if (
-                    vertical_diff_bboxes <= 5
-                    and horizontal_diff_bboxes <= combine_pixel_dist
+                    vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
                 ):
+                    #print("box is being extended")
                     current_box[2] = char_box[2] # Extend the current box horizontally
                     current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
                     current_result.end = max(current_result.end, result.end) # Extend the text range
@@ -710,7 +851,7 @@ def create_text_redaction_process_results(analyzer_results, analyzed_bounding_bo
         analyzed_bounding_boxes_df_new['page'] = page_num + 1
         decision_process_table = pd.concat([decision_process_table, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)

-        print('\n\ndecision_process_table:\n\n', decision_process_table)
+        #print('\n\ndecision_process_table:\n\n', decision_process_table)

     return decision_process_table

@@ -741,11 +882,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
+    page_text_outputs_all_pages = pd.DataFrame()
     decision_process_table_all_pages = pd.DataFrame()

-    combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one
+    combine_pixel_dist = 20 # Horizontal distance between PII bounding boxes under/equal they are combined into one

+    # Open with Pikepdf to get text lines
     pdf = Pdf.open(filename)
+    # Also open pdf with pymupdf to be able to annotate later while retaining text
+    doc = pymupdf.open(filename)
     page_num = 0

     number_of_pages = len(pdf.pages)
@@ -773,30 +918,31 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
         #page_width = media_box[2] - media_box[0]
         #page_height = media_box[3] - media_box[1]

-
-        annotations_on_page = []
-        decision_process_table_on_page = pd.DataFrame()
-
         for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

             page_analyzer_results = []
-            page_analyzed_bounding_boxes = []
-            text_container_analyzer_results = []
-            text_container_analyzed_bounding_boxes = []
+            page_analyzed_bounding_boxes = []
+
             characters = []
-            page_text_outputs = pd.DataFrame()
+            annotations_on_page = []
+            decision_process_table_on_page = pd.DataFrame()
+            page_text_outputs = pd.DataFrame()

             if analysis_type == "Simple text analysis - PDFs with selectable text":
-                for i, text_container in enumerate(page_layout):
+                for text_container in page_layout:
+
+                    text_container_analyzer_results = []
+                    text_container_analyzed_bounding_boxes = []
+
+                    characters = get_text_container_characters(text_container)

-                    text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
-
                     # Create dataframe for all the text on the page
-                    line_level_text_results = create_text_bounding_boxes_from_characters(characters)
+                    line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)

-                    if line_level_text_results.text:
-                        line_level_text_results_list = [line_level_text_results]
+                    print("line_characters:", line_characters)

+                    # Create page_text_outputs (OCR format outputs)
+                    if line_level_text_results_list:
                         # Convert to DataFrame and add to ongoing logging table
                         line_level_text_results_df = pd.DataFrame([{
                             'page': page_no + 1,
@@ -809,33 +955,58 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]

                         page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])

-                        # Merge bounding boxes if very close together
-                        text_container_analyzed_bounding_boxes = merge_text_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
+                    # Analyse each line of text in turn for PII and add to list
+                    for i, text_line in enumerate(line_level_text_results_list):
+                        text_line_analyzer_result = []
+                        text_line_bounding_boxes = []
+
+                        #print("text_line:", text_line.text)
+
+                        text_line_analyzer_result = analyze_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
+
+                        # Merge bounding boxes for the line if multiple found close together
+                        if text_line_analyzer_result:
+                            # Merge bounding boxes if very close together
+                            print("text_line_bounding_boxes:", text_line_bounding_boxes)
+                            print("line_characters:")
+                            #print(line_characters[i])
+                            print("".join(char._text for char in line_characters[i]))
+                            text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyzer_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
+
+                        text_container_analyzer_results.extend(text_line_analyzer_result)
+                        text_container_analyzed_bounding_boxes.extend(text_line_bounding_boxes)
+
+                    print("\n FINAL text_container_analyzer_results:", text_container_analyzer_results)


             page_analyzer_results.extend(text_container_analyzer_results)
             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)

-
-            print("page_analyzer_results:", page_analyzer_results)
-            print("page_analyzed_bounding_boxes:", page_analyzed_bounding_boxes)
-
-            decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
+

+            # Annotate redactions on page
             annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
-            #print('\n\nAnnotations_on_page:', annotations_on_page)
+
+            # Make pymupdf redactions
+            doc = redact_page_with_pymupdf(doc, annotations_on_page, page_no)

             # Make page annotations
-            page.Annots = pdf.make_indirect(annotations_on_page)
-            annotations_all_pages.extend([annotations_on_page])
+            #page.Annots = pdf.make_indirect(annotations_on_page)
+            if annotations_on_page:
+                annotations_all_pages.extend([annotations_on_page])

-            decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
-
-            page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
-            #page_text_outputs.to_csv("text_page_text_outputs.csv")
-
             print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
-
-            #page_num += 1

-    return pdf, decision_process_table_all_pages, page_text_outputs
+            # Write logs
+            # Create decision process table
+            decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
+
+            if not decision_process_table_on_page.empty:
+                decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
+
+            if not page_text_outputs.empty:
+                page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
+                #page_text_outputs.to_csv("text_page_text_outputs.csv")
+                page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
+
+    return doc, decision_process_table_all_pages, page_text_outputs_all_pages
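
For the image-based route, OCR boxes come back in image pixels while PyMuPDF works in PDF points, which is why the hunks above compute per-page scale factors before calling redact_page_with_pymupdf. A worked sketch of that mapping, assuming a PIL image rendered from the first page (the file path, page and image sizes are illustrative):

import pymupdf
from pymupdf import Rect

doc = pymupdf.open("example.pdf")            # illustrative input path
page = doc.load_page(0)

image_width, image_height = 2480, 3508       # an A4 page rendered at roughly 300 dpi (illustrative)
scale_width = page.rect.width / image_width      # ~595 pt / 2480 px, about 0.24
scale_height = page.rect.height / image_height   # ~842 pt / 3508 px, about 0.24

# An OCR hit in pixel coordinates: (left, top, width, height)
left, top, width, height = 500, 1200, 400, 50
rect = Rect(left * scale_width, top * scale_height,
            (left + width) * scale_width, (top + height) * scale_height)
print(rect)  # the same box in PDF points, ready for page.add_redact_annot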
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -19,7 +19,8 @@ score_threshold = 0.001
 titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
 titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
 titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
-titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
+titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern],
+                                      global_regex_flags=re.DOTALL | re.MULTILINE)

 # %%
 # Custom postcode recogniser
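
The global_regex_flags argument passed above is part of presidio-analyzer's PatternRecognizer. A quick standalone check of the updated titles recogniser might look like this (the shortened regex and sample text are illustrative):

import re
from presidio_analyzer import Pattern, PatternRecognizer

titles_pattern = Pattern(name="titles_pattern", regex=r"\bMr\b|\bMrs\b|\bDr\b", score=1)  # shortened, illustrative regex
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns=[titles_pattern],
                                      global_regex_flags=re.DOTALL | re.MULTILINE)

# Multiline input exercises the re.MULTILINE flag set above
results = titles_recogniser.analyze(text="Dear Dr Smith,\nplease pass this to Mr Jones.", entities=["TITLES"])
for result in results:
    print(result.entity_type, result.start, result.end, result.score)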