seanpedrickcase committed
Commit a748df6
1 parent: 8652429

Generally improved OCR recognition of texts; corrected postcode regex

Files changed:
- tools/custom_image_analyser_engine.py +125 -22
- tools/file_redaction.py +77 -291
- tools/load_spacy_model_custom_recognisers.py +9 -7
tools/custom_image_analyser_engine.py
CHANGED

--- a/tools/custom_image_analyser_engine.py (old line numbers; removed lines marked "-"; lines cut off in this view are left as-is)

@@ -420,7 +420,7 @@ class CustomImageAnalyzerEngine:
420             # block_size=11
421             # )
422         image_preprocessor = ContrastSegmentedImageEnhancer()
423 -       print(image_preprocessor)
424         self.image_preprocessor = image_preprocessor
425
426     def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:

@@ -461,6 +461,7 @@ class CustomImageAnalyzerEngine:
461     def analyze_text(
462             self,
463             ocr_results: List[OCRResult],
464             **text_analyzer_kwargs
465     ) -> List[CustomImageRecognizerResult]:
466         # Define English as default language, if not specified

@@ -468,8 +469,8 @@
468             text_analyzer_kwargs["language"] = "en"
469
470         allow_list = text_analyzer_kwargs.get('allow_list', [])
471 -       combined_results = []
472
473         for ocr_result in ocr_results:
474             # Analyze each OCR result (line) individually
475             analyzer_result = self.analyzer_engine.analyze(

@@ -480,18 +481,42 @@
480             # Extract the relevant portion of text based on start and end
481             relevant_text = ocr_result.text[result.start:result.end]
482
483 -           #
484 -           relevant_ocr_result = OCRResult(
485 -               text=relevant_text,
486 -               left=ocr_result.left + self.estimate_x_offset(relevant_text, result.start),
487 -               top=ocr_result.top,
488 -               width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
489 -               height=ocr_result.height
490 -           )
491
492             # Map the analyzer results to bounding boxes for this line
493             line_results = self.map_analyzer_results_to_bounding_boxes(
494 -               [result], [relevant_ocr_result], ocr_result.text, allow_list
495             )
496
497             combined_results.extend(line_results)

@@ -504,33 +529,95 @@
504             ocr_results: List[OCRResult],
505             full_text: str,
506             allow_list: List[str],
507     ) -> List[CustomImageRecognizerResult]:
508         pii_bboxes = []
509         text_position = 0
510
511         for ocr_result in ocr_results:
512             word_end = text_position + len(ocr_result.text)
513
514             for result in text_analyzer_results:
515 -               if (max(text_position, result.start) < min(word_end, result.end)) and (ocr_result.text not in allow_list):
516                     pii_bboxes.append(
517                         CustomImageRecognizerResult(
518                             entity_type=result.entity_type,
519                             start=result.start,
520                             end=result.end,
521                             score=result.score,
522 -                           left=ocr_result.left,
523 -                           top=ocr_result.top,
524 -                           width=ocr_result.width,
525 -                           height=ocr_result.height,
526                             text=ocr_result.text
527                         )
528                     )
529 -                   break
530
531             text_position = word_end + 1  # +1 for the space between words
532
533         return pii_bboxes
534
535     @staticmethod
536     def remove_space_boxes(ocr_result: dict) -> dict:

@@ -676,17 +763,33 @@ class CustomImageAnalyzerEngine:
676
677
678     # Function to combine OCR results into line-level results
679 -   def combine_ocr_results(ocr_results, x_threshold=
680 -       #
681 -
682 -
683         combined_results = []
684         new_format_results = {}
685         current_line = []
686         current_bbox = None
687         line_counter = 1
688
689 -       for result in
690             if not current_line:
691                 # Start a new line
692                 current_line.append(result)
+++ b/tools/custom_image_analyser_engine.py (new line numbers; added lines marked "+")

420             # block_size=11
421             # )
422         image_preprocessor = ContrastSegmentedImageEnhancer()
423 +       #print(image_preprocessor)
424         self.image_preprocessor = image_preprocessor
425
426     def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:

461     def analyze_text(
462             self,
463             ocr_results: List[OCRResult],
464 +           ocr_results_with_children: Dict[str, Dict],
465             **text_analyzer_kwargs
466     ) -> List[CustomImageRecognizerResult]:
467         # Define English as default language, if not specified

469             text_analyzer_kwargs["language"] = "en"
470
471         allow_list = text_analyzer_kwargs.get('allow_list', [])
472
473 +       combined_results = []
474         for ocr_result in ocr_results:
475             # Analyze each OCR result (line) individually
476             analyzer_result = self.analyzer_engine.analyze(

481             # Extract the relevant portion of text based on start and end
482             relevant_text = ocr_result.text[result.start:result.end]
483
484 +           # Find the corresponding entry in ocr_results_with_children
485 +           child_info = ocr_results_with_children.get(ocr_result.text)
486 +           if child_info:
487 +               # Calculate left and width based on child words
488 +               #print("Found in ocr_results_with_children")
489 +               child_words = child_info['words']
490 +               start_word = child_words[0]
491 +               end_word = child_words[-1]
492 +               left = start_word['bounding_box'][0]
493 +               width = end_word['bounding_box'][2] - left
494 +
495 +               relevant_ocr_result = OCRResult(
496 +                   text=relevant_text,
497 +                   left=left,
498 +                   top=ocr_result.top,
499 +                   width=width,
500 +                   height=ocr_result.height
501 +               )
502 +           else:
503 +               # Fallback to previous method if not found in ocr_results_with_children
504 +               #print("Couldn't find result in ocr_results_with_children")
505 +               relevant_ocr_result = OCRResult(
506 +                   text=relevant_text,
507 +                   left=ocr_result.left + self.estimate_x_offset(relevant_text, result.start),
508 +                   top=ocr_result.top,
509 +                   width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
510 +                   height=ocr_result.height
511 +               )
512 +
513 +           result_mod = result
514 +           result.start = 0
515 +           result.end = len(relevant_text)
516
517             # Map the analyzer results to bounding boxes for this line
518             line_results = self.map_analyzer_results_to_bounding_boxes(
519 +               [result_mod], [relevant_ocr_result], ocr_result.text, allow_list, ocr_results_with_children
520             )
521
522             combined_results.extend(line_results)
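Note on the new child-word positioning above: when a recognised line has word-level children, the entity's left edge is taken from the first word's box and its width runs to the last word's right edge. A minimal sketch of that arithmetic, assuming each word's 'bounding_box' is (left, top, right, bottom) in pixels, as the unpacking elsewhere in this diff implies; the values are made up for illustration:

    child_words = [
        {'text': 'Jane', 'bounding_box': (100, 50, 140, 62)},
        {'text': 'Doe',  'bounding_box': (146, 50, 180, 62)},
    ]

    start_word = child_words[0]
    end_word = child_words[-1]
    left = start_word['bounding_box'][0]        # 100: left edge of the first word
    width = end_word['bounding_box'][2] - left  # 180 - 100 = 80: spans both words
    print(left, width)                          # 100 80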
529             ocr_results: List[OCRResult],
530             full_text: str,
531             allow_list: List[str],
532 +           ocr_results_with_children: Dict[str, Dict]
533     ) -> List[CustomImageRecognizerResult]:
534         pii_bboxes = []
535         text_position = 0
536
537         for ocr_result in ocr_results:
538             word_end = text_position + len(ocr_result.text)
539 +
540 +           #print("Checking relevant OCR result:", ocr_result)
541
542             for result in text_analyzer_results:
543 +               max_of_current_text_pos_or_result_start_pos = max(text_position, result.start)
544 +               min_of_result_end_pos_or_results_end = min(word_end, result.end)
545 +
546 +               #print("max_of_current_text_pos_or_result_start_pos", str(max_of_current_text_pos_or_result_start_pos))
547 +               #print("min_of_result_end_pos_or_results_end", str(min_of_result_end_pos_or_results_end))
548 +
549 +               if (max_of_current_text_pos_or_result_start_pos < min_of_result_end_pos_or_results_end) and (ocr_result.text not in allow_list):
550 +                   print("result", result, "made it through if statement")
551 +
552 +                   # Find the corresponding entry in ocr_results_with_children
553 +                   child_info = ocr_results_with_children.get(full_text)
554 +                   if child_info:
555 +                       # Use the bounding box from ocr_results_with_children
556 +                       bbox = child_info['bounding_box']
557 +                       left, top, right, bottom = bbox
558 +                       width = right - left
559 +                       height = bottom - top
560 +                   else:
561 +                       # Fallback to ocr_result if not found
562 +                       left = ocr_result.left
563 +                       top = ocr_result.top
564 +                       width = ocr_result.width
565 +                       height = ocr_result.height
566 +
567                     pii_bboxes.append(
568                         CustomImageRecognizerResult(
569                             entity_type=result.entity_type,
570                             start=result.start,
571                             end=result.end,
572                             score=result.score,
573 +                           left=left,
574 +                           top=top,
575 +                           width=width,
576 +                           height=height,
577                             text=ocr_result.text
578                         )
579                     )
580
581             text_position = word_end + 1  # +1 for the space between words
582
583         return pii_bboxes
584 +
585 +   # @staticmethod
586 +   # def map_analyzer_results_to_bounding_boxes(
587 +   #     text_analyzer_results: List[RecognizerResult],
588 +   #     ocr_results: List[OCRResult],
589 +   #     full_text: str,
590 +   #     allow_list: List[str],
591 +   # ) -> List[CustomImageRecognizerResult]:
592 +   #     pii_bboxes = []
593 +   #     text_position = 0
594 +
595 +   #     for ocr_result in ocr_results:
596 +   #         word_end = text_position + len(ocr_result.text)
597 +
598 +   #         print("Checking relevant OCR result:", ocr_result)
599 +
600 +   #         for result in text_analyzer_results:
601 +   #             if (max(text_position, result.start) < min(word_end, result.end)) and (ocr_result.text not in allow_list):
602 +   #                 print("result", result, "made it through if statement")
603 +
604 +   #                 pii_bboxes.append(
605 +   #                     CustomImageRecognizerResult(
606 +   #                         entity_type=result.entity_type,
607 +   #                         start=result.start,
608 +   #                         end=result.end,
609 +   #                         score=result.score,
610 +   #                         left=ocr_result.left,
611 +   #                         top=ocr_result.top,
612 +   #                         width=ocr_result.width,
613 +   #                         height=ocr_result.height,
614 +   #                         text=ocr_result.text
615 +   #                     )
616 +   #                 )
617 +
618 +   #             text_position = word_end + 1  # +1 for the space between words
619 +
620 +   #     return pii_bboxes
621
622     @staticmethod
623     def remove_space_boxes(ocr_result: dict) -> dict:
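Note on the overlap test added above (max(text_position, result.start) < min(word_end, result.end)): two half-open character ranges intersect exactly when the larger of the two starts is below the smaller of the two ends. A minimal standalone sketch with illustrative offsets:

    def ranges_overlap(a_start, a_end, b_start, b_end):
        # Half-open ranges [a_start, a_end) and [b_start, b_end)
        return max(a_start, b_start) < min(a_end, b_end)

    print(ranges_overlap(0, 5, 3, 8))  # True: positions 3 and 4 are shared
    print(ranges_overlap(0, 5, 5, 8))  # False: the ranges only touch at 5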
763
764
765     # Function to combine OCR results into line-level results
766 +   def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
767 +       # Group OCR results into lines based on y_threshold
768 +       lines = []
769 +       current_line = []
770 +       for result in sorted(ocr_results, key=lambda x: x.top):
771 +           if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
772 +               current_line.append(result)
773 +           else:
774 +               lines.append(current_line)
775 +               current_line = [result]
776 +       if current_line:
777 +           lines.append(current_line)
778 +
779 +       # Sort each line by left position
780 +       for line in lines:
781 +           line.sort(key=lambda x: x.left)
782 +
783 +       # Flatten the sorted lines back into a single list
784 +       sorted_results = [result for line in lines for result in line]
785 +
786         combined_results = []
787         new_format_results = {}
788         current_line = []
789         current_bbox = None
790         line_counter = 1
791
792 +       for result in sorted_results:
793             if not current_line:
794                 # Start a new line
795                 current_line.append(result)
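Note on the new line-grouping step in combine_ocr_results above: words are bucketed into a visual line whenever their top coordinate is within y_threshold pixels of the line's first word, and each line is then read left to right. A minimal sketch of the same loop, using a stand-in dataclass for this module's OCRResult (field names taken from the diff; the coordinates are made up):

    from dataclasses import dataclass

    @dataclass
    class OCRResult:  # stand-in for tools.custom_image_analyser_engine.OCRResult
        text: str
        left: int
        top: int
        width: int
        height: int

    words = [
        OCRResult("Name:", 10, 100, 50, 12),
        OCRResult("Jane", 70, 103, 40, 12),      # same visual line: tops differ by 3px
        OCRResult("Address:", 10, 130, 70, 12),  # next line: 30px lower
    ]

    y_threshold = 12
    lines, current_line = [], []
    for result in sorted(words, key=lambda x: x.top):
        if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
            current_line.append(result)
        else:
            lines.append(current_line)
            current_line = [result]
    if current_line:
        lines.append(current_line)

    for line in lines:
        line.sort(key=lambda x: x.left)

    print([[w.text for w in line] for line in lines])
    # [['Name:', 'Jane'], ['Address:']]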
tools/file_redaction.py
CHANGED

--- a/tools/file_redaction.py (old line numbers; removed lines marked "-"; lines cut off in this view are left as-is)

@@ -9,156 +9,7 @@ import pandas as pd
9
10      #from presidio_image_redactor.entities import ImageRecognizerResult
11      from pdfminer.high_level import extract_pages
12 -    from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
13 -    from pikepdf import Pdf, Dictionary, Name
14 -    import gradio as gr
15 -    from gradio import Progress
16 -
17 -    from collections import defaultdict # For efficient grouping
18 -
19 -    from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
20 -    from tools.file_conversion import process_file
21 -    from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
22 -    from tools.helper_functions import get_file_path_end, output_folder
23 -    from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
24 -    from tools.data_anonymise import generate_decision_process_output
25 -    from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
26 -
27 -    def sum_numbers_before_seconds(string:str):
28 -        """Extracts numbers that precede the word 'seconds' from a string and adds them up.
29 -
30 -        Args:
31 -            string: The input string.
32 -
33 -        Returns:
34 -            The sum of all numbers before 'seconds' in the string.
35 -        """
36 -
37 -        # Extract numbers before 'seconds' using regular expression
38 -        numbers = re.findall(r'(\d+\.\d+)?\s*seconds', string)
39 -
40 -        # Extract the numbers from the matches
41 -        numbers = [float(num.split()[0]) for num in numbers]
42 -
43 -        # Sum up the extracted numbers
44 -        sum_of_numbers = round(sum(numbers),1)
45 -
46 -        return sum_of_numbers
47 -
48 -    def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
49 -        '''
50 -        Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
51 -        '''
52 -
53 -        tic = time.perf_counter()
54 -        all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
55 -
56 -        # If this is the first time around, set variables to 0/blank
57 -        if first_loop_state==True:
58 -            latest_file_completed = 0
59 -            #out_message = []
60 -            out_file_paths = []
61 -
62 -        # If out message is string or out_file_paths are blank, change to a list so it can be appended to
63 -        if isinstance(out_message, str):
64 -            out_message = [out_message]
65 -
66 -        if not out_file_paths:
67 -            out_file_paths = []
68 -
69 -        latest_file_completed = int(latest_file_completed)
70 -
71 -        # If we have already redacted the last file, return the input out_message and file list to the relevant components
72 -        if latest_file_completed >= len(file_paths):
73 -            print("Last file reached")
74 -            # Set to a very high number so as not to mix up with subsequent file processing by the user
75 -            latest_file_completed = 99
76 -            final_out_message = '\n'.join(out_message)
77 -            #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
78 -
79 -            estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
80 -            print("Estimated total processing time:", str(estimate_total_processing_time))
81 -
82 -            return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
83 -
84 -        file_paths_loop = [file_paths[int(latest_file_completed)]]
85 -
86 -        if not in_allow_list.empty:
87 -            in_allow_list_flat = in_allow_list[0].tolist()
88 -            print("In allow list:", in_allow_list_flat)
89 -        else:
90 -            in_allow_list_flat = []
91 -
92 -        for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
93 -            file_path = file.name
94 -
95 -            if file_path:
96 -                file_path_without_ext = get_file_path_end(file_path)
97 -                is_a_pdf = is_pdf(file_path) == True
98 -                if is_a_pdf == False:
99 -                    # If user has not submitted a pdf, assume it's an image
100 -                   print("File is not a pdf, assuming that image analysis needs to be used.")
101 -                   in_redact_method = "Quick image analysis - typed text"
102 -           else:
103 -               out_message = "No file selected"
104 -               print(out_message)
105 -               return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
106 -
107 -          if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
108 -              #Analyse and redact image-based pdf or image
109 -              if is_pdf_or_image(file_path) == False:
110 -                  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
111 -                  return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
112 -
113 -              print("Redacting file " + file_path_without_ext + " as an image-based file")
114 -              pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
115 -
116 -              # Save file
117 -              out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
118 -              pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
119 -
120 -              out_file_paths.append(out_image_file_path)
121 -              if logging_file_paths:
122 -                  log_files_output_paths.extend(logging_file_paths)
123 -
124 -              out_message.append("File '" + file_path_without_ext + "' successfully redacted")
125 -
126 -              # Save decision making process
127 -              output_logs_str = str(output_logs)
128 -              logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
129 -              with open(logs_output_file_name, "w") as f:
130 -                  f.write(output_logs_str)
131 -              log_files_output_paths.append(logs_output_file_name)
132 -
133 -              # Save Textract request metadata (if exists)
134 -              if new_request_metadata:
135 -                  print("Request metadata:", new_request_metadata)
136 -                  all_request_metadata.append(new_request_metadata)
137 -
138 -              # Increase latest file completed count unless we are at the last file
139 -              if latest_file_completed != len(file_paths):
140 -                  print("Completed file number:", str(latest_file_completed))
141 -                  latest_file_completed += 1
142 -
143 -          elif in_redact_method == "Simple text analysis - PDFs with selectable text":
144 -
145 -              if is_pdf(file_path) == False:
146 -                  return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
147 -
148 -              # Analyse text-based pdf
149 -              print('Redacting file as text-based PDF')
150 -    import time
151 -    import re
152 -    import json
153 -    import io
154 -    import os
155 -    from PIL import Image, ImageChops, ImageDraw
156 -    from typing import List, Dict
157 -    import pandas as pd
158 -
159 -    #from presidio_image_redactor.entities import ImageRecognizerResult
160 -    from pdfminer.high_level import extract_pages
161 -    from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
162      from pikepdf import Pdf, Dictionary, Name
163      import gradio as gr
164      from gradio import Progress

@@ -349,8 +200,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
349
350         return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
351
352 -
353 -
354     def bounding_boxes_overlap(box1, box2):
355         """Check if two bounding boxes overlap."""
356         return (box1[0] < box2[2] and box2[0] < box1[2] and

@@ -385,11 +234,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
385         current_char = 0
386         for word in line_info['words']:
387             word_end = current_char + len(word['text'])
388 -           if current_char <= start_char < word_end or current_char < end_char <= word_end:
389                 relevant_words.append(word)
390                 if word_end >= end_char:
391                     break
392 -           current_char = word_end
393             if not word['text'].endswith(' '):
394                 current_char += 1 # +1 for space if the word doesn't already end with a space
395

@@ -400,7 +249,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
400         right = max(word['bounding_box'][2] for word in relevant_words)
401         bottom = max(word['bounding_box'][3] for word in relevant_words)
402
403 -       # Combine the text of
404         combined_text = " ".join(word['text'] for word in relevant_words)
405
406         reconstructed_bbox = CustomImageRecognizerResult(

@@ -551,6 +400,8 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
551         # Combine OCR results
552         ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
553
554         # Save decision making process
555         ocr_results_with_children_str = str(ocr_results_with_children)
556         logs_output_file_name = output_folder + "ocr_with_children.txt"

@@ -589,6 +440,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
589         # Step 2: Analyze text and identify PII
590         bboxes = image_analyser.analyze_text(
591             ocr_results,
592             language=language,
593             entities=chosen_redact_entities,
594             allow_list=allow_list,

@@ -650,59 +502,81 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
650             score_threshold=score_threshold,
651             return_decision_process=True,
652             allow_list=allow_list)
653         characters = [char
654                       for line in text_container
655 -                     if isinstance(line, LTTextLine)
656                       for char in line]
657
658         return analyzer_results, characters
659     return [], []
660
661 -   # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
662     def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
663         '''
664         Merge identified bounding boxes containing PII that are very close to one another
665         '''
666         analyzed_bounding_boxes = []
667         if len(analyzer_results) > 0 and len(characters) > 0:
668             merged_bounding_boxes = []
669             current_box = None
670             current_y = None
671
672 -           for
673 -               print("Considering result"
674-705 -       (old character-by-character merging loop; its remaining lines are not preserved in this view)
706
707         if not merged_bounding_boxes:
708             analyzed_bounding_boxes.extend(

@@ -714,104 +588,10 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, verti
714         else:
715             analyzed_bounding_boxes.extend(merged_bounding_boxes)
716
717 -       print("
718
719     return analyzed_bounding_boxes
720
721 -   # def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
722 -   #     '''
723 -   #     Merge identified bounding boxes containing PII or signatures that are very close to one another.
724 -   #     '''
725 -   #     analyzed_bounding_boxes = []
726 -   #     merged_bounding_boxes = []
727 -   #     current_box = None
728 -   #     current_y = None
729 -
730 -   #     # Handle PII and text bounding boxes first
731 -   #     if len(analyzer_results) > 0 and len(characters) > 0:
732 -   #         for i, result in enumerate(analyzer_results):
733 -   #             #print("Considering result", str(i))
734 -   #             #print("Result:", result)
735 -   #             #print("Characters:", characters)
736 -
737 -   #             for char in characters[result.start: result.end]:
738 -   #                 if isinstance(char, LTChar):
739 -   #                     char_box = list(char.bbox)
740 -   #                     # Add vertical padding to the top of the box
741 -   #                     char_box[3] += vertical_padding
742 -
743 -   #                     if current_y is None or current_box is None:
744 -   #                         current_box = char_box
745 -   #                         current_y = char_box[1]
746 -   #                     else:
747 -   #                         vertical_diff_bboxes = abs(char_box[1] - current_y)
748 -   #                         horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
749 -
750 -   #                         if (
751 -   #                             vertical_diff_bboxes <= 5
752 -   #                             and horizontal_diff_bboxes <= combine_pixel_dist
753 -   #                         ):
754 -   #                             current_box[2] = char_box[2]  # Extend the current box horizontally
755 -   #                             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
756 -   #                         else:
757 -   #                             merged_bounding_boxes.append(
758 -   #                                 {"boundingBox": current_box, "result": result})
759 -
760 -   #                             # Reset current_box and current_y after appending
761 -   #                             current_box = char_box
762 -   #                             current_y = char_box[1]
763 -
764 -   #             # After finishing with the current result, add the last box for this result
765 -   #             if current_box:
766 -   #                 merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
767 -   #                 current_box = None
768 -   #                 current_y = None  # Reset for the next result
769 -
770 -   #     # Handle signature bounding boxes (without specific characters)
771 -   #     if signature_bounding_boxes is not None:
772 -   #         for sig_box in signature_bounding_boxes:
773 -   #             sig_box = list(sig_box)  # Ensure it's a list to modify the values
774 -   #             if current_y is None or current_box is None:
775 -   #                 current_box = sig_box
776 -   #                 current_y = sig_box[1]
777 -   #             else:
778 -   #                 vertical_diff_bboxes = abs(sig_box[1] - current_y)
779 -   #                 horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
780 -
781 -   #                 if (
782 -   #                     vertical_diff_bboxes <= 5
783 -   #                     and horizontal_diff_bboxes <= combine_pixel_dist
784 -   #                 ):
785 -   #                     current_box[2] = sig_box[2]  # Extend the current box horizontally
786 -   #                     current_box[3] = max(current_box[3], sig_box[3])  # Ensure the top is the highest
787 -   #                 else:
788 -   #                     merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
789 -
790 -   #                     # Reset current_box and current_y after appending
791 -   #                     current_box = sig_box
792 -   #                     current_y = sig_box[1]
793 -
794 -   #         # Add the last bounding box for the signature
795 -   #         if current_box:
796 -   #             merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
797 -   #             current_box = None
798 -   #             current_y = None
799 -
800 -   #     # If no bounding boxes were merged, add individual character bounding boxes
801 -   #     if not merged_bounding_boxes:
802 -   #         analyzed_bounding_boxes.extend(
803 -   #             {"boundingBox": char.bbox, "result": result}
804 -   #             for result in analyzer_results
805 -   #             for char in characters[result.start:result.end]
806 -   #             if isinstance(char, LTChar)
807 -   #         )
808 -   #     else:
809 -   #         analyzed_bounding_boxes.extend(merged_bounding_boxes)
810 -
811 -   #     #print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
812 -
813 -   #     return analyzed_bounding_boxes
814 -
815     def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
816         decision_process_table = pd.DataFrame()
817

@@ -857,7 +637,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
857         annotations_all_pages = []
858         decision_process_table_all_pages = []
859
860 -       combine_pixel_dist =
861
862         pdf = Pdf.open(filename)
863         page_num = 0

@@ -883,9 +663,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
883             print("Page number is:", page_no)
884
885             # The /MediaBox in a PDF specifies the size of the page [left, bottom, right, top]
886 -           media_box = page.MediaBox
887 -           page_width = media_box[2] - media_box[0]
888 -           page_height = media_box[3] - media_box[1]
889
890
891             annotations_on_page = []

@@ -905,8 +685,14 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
905             text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
906
907             # Merge bounding boxes if very close together
908             text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
909
910
911             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
912             page_analyzer_results.extend(text_container_analyzer_results)

@@ -915,7 +701,7 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
915         decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
916
917         annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
918 -       #print('\n\
919
920         # Make page annotations
921         page.Annots = pdf.make_indirect(annotations_on_page)
+++ b/tools/file_redaction.py (new line numbers; added lines marked "+")

9
10      #from presidio_image_redactor.entities import ImageRecognizerResult
11      from pdfminer.high_level import extract_pages
12 +    from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal #, LTAnno
13      from pikepdf import Pdf, Dictionary, Name
14      import gradio as gr
15      from gradio import Progress

200
201         return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
202
203     def bounding_boxes_overlap(box1, box2):
204         """Check if two bounding boxes overlap."""
205         return (box1[0] < box2[2] and box2[0] < box1[2] and

234         current_char = 0
235         for word in line_info['words']:
236             word_end = current_char + len(word['text'])
237 +           if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
238                 relevant_words.append(word)
239                 if word_end >= end_char:
240                     break
241 +           current_char = word_end
242             if not word['text'].endswith(' '):
243                 current_char += 1 # +1 for space if the word doesn't already end with a space
244
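Note on the widened condition at new line 237 above: a word spanning [current_char, word_end) is relevant to an entity span [start_char, end_char) if the span starts inside the word, ends inside the word, or (the clause this commit adds) fully contains the word. A minimal sketch with illustrative offsets:

    def word_is_relevant(current_char, word_end, start_char, end_char):
        return (current_char <= start_char < word_end
                or current_char < end_char <= word_end
                or (start_char <= current_char and word_end <= end_char))

    # Entity span covering characters 5..20, checked against three words:
    print(word_is_relevant(0, 8, 5, 20))    # True: the span starts inside this word
    print(word_is_relevant(9, 14, 5, 20))   # True: word sits fully inside the span (new clause)
    print(word_is_relevant(21, 25, 5, 20))  # False: the word begins after the span ends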
249         right = max(word['bounding_box'][2] for word in relevant_words)
250         bottom = max(word['bounding_box'][3] for word in relevant_words)
251
252 +       # Combine the text of all relevant words
253         combined_text = " ".join(word['text'] for word in relevant_words)
254
255         reconstructed_bbox = CustomImageRecognizerResult(

400         # Combine OCR results
401         ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
402
403 +       #print("ocr_results after:", ocr_results)
404 +
405         # Save decision making process
406         ocr_results_with_children_str = str(ocr_results_with_children)
407         logs_output_file_name = output_folder + "ocr_with_children.txt"

440         # Step 2: Analyze text and identify PII
441         bboxes = image_analyser.analyze_text(
442             ocr_results,
443 +           ocr_results_with_children,
444             language=language,
445             entities=chosen_redact_entities,
446             allow_list=allow_list,

502             score_threshold=score_threshold,
503             return_decision_process=True,
504             allow_list=allow_list)
505 +
506 +       #print("\ntext_container:", text_container)
507         characters = [char
508                       for line in text_container
509 +                     if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
510                       for char in line]
511
512         return analyzer_results, characters
513     return [], []
514
515     def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
516         '''
517         Merge identified bounding boxes containing PII that are very close to one another
518         '''
519         analyzed_bounding_boxes = []
520         if len(analyzer_results) > 0 and len(characters) > 0:
521 +           # Extract bounding box coordinates for sorting
522 +           bounding_boxes = []
523 +           for result in analyzer_results:
524 +               char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
525 +               if char_boxes:
526 +                   # Calculate the bounding box that encompasses all characters
527 +                   left = min(box[0] for box in char_boxes)
528 +                   bottom = min(box[1] for box in char_boxes)
529 +                   right = max(box[2] for box in char_boxes)
530 +                   top = max(box[3] for box in char_boxes) + vertical_padding
531 +                   bounding_boxes.append((bottom, left, result, [left, bottom, right, top]))  # (y, x, result, bbox)
532 +
533 +           # Sort the results by y-coordinate and then by x-coordinate
534 +           bounding_boxes.sort()
535 +
536             merged_bounding_boxes = []
537             current_box = None
538             current_y = None
539 +           current_result = None
540
541 +           for y, x, result, char_box in bounding_boxes:
542 +               print(f"Considering result: {result}")
543 +               print(f"Character box: {char_box}")
544 +
545 +               if current_y is None or current_box is None:
546 +                   current_box = char_box
547 +                   current_y = char_box[1]
548 +                   current_result = result
549 +                   print(f"Starting new box: {current_box}")
550 +               else:
551 +                   vertical_diff_bboxes = abs(char_box[1] - current_y)
552 +                   horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
553 +
554 +                   print(f"Comparing boxes: current_box={current_box}, char_box={char_box}")
555 +                   print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")
556 +
557 +                   if (
558 +                       vertical_diff_bboxes <= 5
559 +                       and horizontal_diff_bboxes <= combine_pixel_dist
560 +                   ):
561 +                       current_box[2] = char_box[2]  # Extend the current box horizontally
562 +                       current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
563 +                       current_result.end = max(current_result.end, result.end)  # Extend the text range
564 +                       print(f"Extended current box: {current_box}")
565 +                   else:
566 +                       merged_bounding_boxes.append(
567 +                           {"boundingBox": current_box, "result": current_result})
568 +                       print(f"Appending merged box: {current_box}")
569 +
570 +                       # Reset current_box and current_y after appending
571 +                       current_box = char_box
572 +                       current_y = char_box[1]
573 +                       current_result = result
574 +                       print(f"Starting new box: {current_box}")
575 +
576 +           # After finishing with the current result, add the last box for this result
577 +           if current_box:
578 +               merged_bounding_boxes.append({"boundingBox": current_box, "result": current_result})
579 +               print(f"Appending final box for result: {current_box}")
580
581         if not merged_bounding_boxes:
582             analyzed_bounding_boxes.extend(

588         else:
589             analyzed_bounding_boxes.extend(merged_bounding_boxes)
590
591 +       print("Analyzed bounding boxes:\n\n", analyzed_bounding_boxes)
592
593         return analyzed_bounding_boxes
594
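Note on the rewritten merge_bounding_boxes above: per-entity boxes are first sorted by (y, x) so that neighbours on the same text row meet consecutively, then a box is folded into the current run when it sits on roughly the same row (within 5px vertically) and starts within combine_pixel_dist of the current right edge. A stripped-down sketch of just that merge rule, with made-up [left, bottom, right, top] boxes rather than the function's analyzer results:

    combine_pixel_dist = 100

    boxes = [[600, 700, 660, 712], [100, 700, 160, 712], [170, 700, 230, 712]]
    boxes.sort(key=lambda b: (b[1], b[0]))  # order by y, then x

    merged = [boxes[0][:]]
    for box in boxes[1:]:
        current = merged[-1]
        same_row = abs(box[1] - current[1]) <= 5
        close_enough = abs(box[0] - current[2]) <= combine_pixel_dist
        if same_row and close_enough:
            current[2] = box[2]                   # extend the right edge
            current[3] = max(current[3], box[3])  # keep the highest top
        else:
            merged.append(box[:])

    print(merged)  # [[100, 700, 230, 712], [600, 700, 660, 712]]

Without the sort, the two adjacent boxes starting at x=100 and x=170 would not meet consecutively and would never merge.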
595     def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
596         decision_process_table = pd.DataFrame()
597

637         annotations_all_pages = []
638         decision_process_table_all_pages = []
639
640 +       combine_pixel_dist = 100  # PII bounding boxes at or under this horizontal distance apart are combined into one
641
642         pdf = Pdf.open(filename)
643         page_num = 0

663             print("Page number is:", page_no)
664
665             # The /MediaBox in a PDF specifies the size of the page [left, bottom, right, top]
666 +           #media_box = page.MediaBox
667 +           #page_width = media_box[2] - media_box[0]
668 +           #page_height = media_box[3] - media_box[1]
669
670
671             annotations_on_page = []

685             text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
686
687             # Merge bounding boxes if very close together
688 +           print("\n\ntext_container_analyzer_results:", text_container_analyzer_results)
689 +
690 +           #print("\n\ncharacters:", characters)
691 +
692             text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
693
694 +           print("\n\ntext_container_analyzed_bounding_boxes:", text_container_analyzed_bounding_boxes)
695 +
696
697             page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
698             page_analyzer_results.extend(text_container_analyzer_results)

701         decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
702
703         annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
704 +       #print('\n\nAnnotations_on_page:', annotations_on_page)
705
706         # Make page annotations
707         page.Annots = pdf.make_indirect(annotations_on_page)
tools/load_spacy_model_custom_recognisers.py
CHANGED

--- a/tools/load_spacy_model_custom_recognisers.py (old line numbers; removed lines marked "-"; lines cut off in this view are left as-is)

@@ -16,7 +16,6 @@ score_threshold = 0.001
16
17      # %%
18      # Custom title recogniser
19 -    import re
20      titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
21      titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
22      titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)

@@ -26,7 +25,11 @@ titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [tit
26      # Custom postcode recogniser
27
28      # Define the regex pattern in a Presidio `Pattern` object:
29 -    ukpostcode_pattern = Pattern(
30
31      # Define the recognizer with one or more patterns
32      ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])

@@ -77,10 +80,9 @@ def extract_street_name(text:str) -> str:
77          street_name = match.group('street_name').strip()
78          start_pos = match.start()
79          end_pos = match.end()
80 -        print(f"Start: {start_pos}, End: {end_pos}")
81 -        print(f"Preceding words: {preceding_word}")
82 -        print(f"Street name: {street_name}")
83 -        print()
84
85          start_positions.append(start_pos)
86          end_positions.append(end_pos)

@@ -158,7 +160,7 @@ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
158     nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
159                                   default_score_threshold=score_threshold,
160                                   supported_languages=["en"],
161 -                                 log_decision_process=
162                                   )
163
164     # %%
+++ b/tools/load_spacy_model_custom_recognisers.py (new line numbers; added lines marked "+")

16
17      # %%
18      # Custom title recogniser
19      titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
20      titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
21      titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)

25      # Custom postcode recogniser
26
27      # Define the regex pattern in a Presidio `Pattern` object:
28 +    ukpostcode_pattern = Pattern(
29 +        name="ukpostcode_pattern",
30 +        regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
31 +        score=1
32 +    )
33
34      # Define the recognizer with one or more patterns
35      ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
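Note on the corrected postcode pattern above: it accepts the standard UK outward+inward format (one or two letters, a digit, an optional letter or digit, an optional space, then a digit and two letters) plus the special case GIR 0AA. A quick standalone check with Python's re, independent of the Presidio wiring; the sample strings are illustrative:

    import re

    UKPOSTCODE_REGEX = r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b"

    for sample in ["SW1A 1AA", "M1 1AE", "CR2 6XH", "GIR 0AA", "12345", "ABCD 123"]:
        print(sample, "->", bool(re.search(UKPOSTCODE_REGEX, sample)))
    # SW1A 1AA, M1 1AE, CR2 6XH and GIR 0AA match; 12345 and ABCD 123 do not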
80          street_name = match.group('street_name').strip()
81          start_pos = match.start()
82          end_pos = match.end()
83 +        #print(f"Start: {start_pos}, End: {end_pos}")
84 +        #print(f"Preceding words: {preceding_word}")
85 +        #print(f"Street name: {street_name}")
86
87          start_positions.append(start_pos)
88          end_positions.append(end_pos)

160     nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
161                                   default_score_threshold=score_threshold,
162                                   supported_languages=["en"],
163 +                                 log_decision_process=False,
164                                   )
165
166     # %%