Spaces:
Sleeping
Sleeping
seanpedrickcase
committed on
Commit
·
e9c4101
1
Parent(s):
e1c402a
Added AWS Textract support. Allowed for OCR logs export.
Browse files- app.py +1 -1
- tools/aws_textract.py +151 -0
- tools/custom_image_analyser_engine.py +116 -0
- tools/file_conversion.py +9 -6
- tools/file_redaction.py +294 -111
app.py
CHANGED
@@ -124,7 +124,7 @@ with app:
|
|
124 |
Define redaction settings that affect both document and open text redaction.
|
125 |
""")
|
126 |
with gr.Accordion("Settings for documents", open = True):
|
127 |
-
in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
|
128 |
with gr.Row():
|
129 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
130 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
|
|
124 |
Define redaction settings that affect both document and open text redaction.
|
125 |
""")
|
126 |
with gr.Accordion("Settings for documents", open = True):
|
127 |
+
in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis", "AWS Textract"])
|
128 |
with gr.Row():
|
129 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
130 |
page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
|
tools/aws_textract.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
from PIL import Image
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
import pikepdf
|
6 |
+
# Example: converting this single page to an image
|
7 |
+
from pdf2image import convert_from_bytes
|
8 |
+
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
9 |
+
|
10 |
+
def analyse_page_with_textract(pdf_page_bytes, json_file_path):
    '''
    Analyse a single PDF page (passed as bytes) with AWS Textract.

    Calls analyze_document with the SIGNATURES feature enabled, writes the
    raw JSON response to json_file_path (used for the OCR log export), and
    returns the list of Textract 'Blocks' from the response.
    '''
    try:
        client = boto3.client('textract')
    except Exception as e:
        # Bug fix: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt and hides the actual error. Catch Exception and
        # report the cause instead.
        print("Cannot connect to AWS Textract:", e)
        # NOTE(review): on failure this returns a 3-tuple of empty strings
        # while success returns a single list — callers must cope with both
        # shapes. Kept as-is for backward compatibility; confirm callers.
        return "", "", ""

    print("Analysing page with AWS Textract")

    response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])

    text_blocks = response['Blocks']

    # Persist the full response so the OCR output can be audited/exported
    with open(json_file_path, 'w') as json_file:
        json.dump(response, json_file, indent=4)  # indent=4 pretty-prints the JSON

    print("Response has been written to output:", json_file_path)

    return text_blocks
|
39 |
+
|
40 |
+
|
41 |
+
def convert_pike_pdf_page_to_bytes(pdf, page_num):
    '''
    Extract one page (0-based index page_num) from a pikepdf Pdf object and
    return it as the bytes of a new single-page PDF.
    '''
    # Create a new empty PDF to hold just the requested page
    new_pdf = pikepdf.Pdf.new()

    # Bug fix: page_num was previously overwritten with 0
    # ("# Example: first page" debug leftover), so every call returned the
    # first page regardless of the argument. Honour the parameter instead.
    new_pdf.pages.append(pdf.pages[page_num])

    # Save the new single-page PDF into an in-memory buffer
    buffer = io.BytesIO()
    new_pdf.save(buffer)

    # Capture the bytes before closing the buffer
    pdf_bytes = buffer.getvalue()
    buffer.close()

    return pdf_bytes
|
65 |
+
|
66 |
+
|
67 |
+
def json_to_ocrresult(json_data, page_width, page_height):
    '''
    Convert the json response from textract to the OCRResult format used elsewhere in the code.

    Returns a tuple (all_ocr_results, signature_or_handwriting_recogniser_results):
    - all_ocr_results: OCRResult objects (absolute pixel coordinates) for every
      WORD/LINE/SIGNATURE block in the response.
    - signature_or_handwriting_recogniser_results: CustomImageRecognizerResult
      objects for handwriting/signature blocks; these override the default
      behaviour of the PII analyser downstream.
    '''
    all_ocr_results = []
    signature_or_handwriting_recogniser_results = []
    signatures = []
    handwriting = []

    for text_block in json_data:

        is_signature = False
        is_handwriting = False

        if text_block['BlockType'] in ('WORD', 'LINE'):
            text = text_block['Text']

            # Extract BoundingBox details (Textract coordinates are proportional, 0-1)
            bbox = text_block["Geometry"]["BoundingBox"]
            left = bbox["Left"]
            top = bbox["Top"]
            width = bbox["Width"]
            height = bbox["Height"]

            # Convert proportional coordinates to absolute pixel coordinates
            left_abs = int(left * page_width)
            top_abs = int(top * page_height)
            width_abs = int(width * page_width)
            height_abs = int(height * page_height)

            # Create OCRResult with absolute coordinates
            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)

            # If handwriting or signature, add to bounding box
            confidence = text_block['Confidence']

            if 'TextType' in text_block:
                text_type = text_block["TextType"]

                if text_type == "HANDWRITING":
                    is_handwriting = True
                    entity_name = "HANDWRITING"
                    word_end = len(entity_name)
                    recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text=text, score=confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
                    handwriting.append(recogniser_result)
                    print("Handwriting found:", handwriting[-1])

            all_ocr_results.append(ocr_result)

        elif text_block['BlockType'] == 'SIGNATURE':
            text = "SIGNATURE"

            # Extract BoundingBox details (proportional coordinates, 0-1)
            bbox = text_block["Geometry"]["BoundingBox"]
            left = bbox["Left"]
            top = bbox["Top"]
            width = bbox["Width"]
            height = bbox["Height"]

            # Convert proportional coordinates to absolute pixel coordinates
            left_abs = int(left * page_width)
            top_abs = int(top * page_height)
            width_abs = int(width * page_width)
            height_abs = int(height * page_height)

            # Create OCRResult with absolute coordinates
            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)

            # Bug fix: read this block's own Confidence. Previously the value
            # left over from an earlier WORD/LINE iteration was reused here
            # (or a NameError was raised if a SIGNATURE block appeared first).
            confidence = text_block['Confidence']

            is_signature = True
            entity_name = "Signature"
            word_end = len(entity_name)
            recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text=text, score=confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
            signatures.append(recogniser_result)
            print("Signature found:", signatures[-1])

            all_ocr_results.append(ocr_result)

        is_signature_or_handwriting = is_signature or is_handwriting

        # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
        if is_signature_or_handwriting:
            signature_or_handwriting_recogniser_results.append(recogniser_result)

    return all_ocr_results, signature_or_handwriting_recogniser_results
|
tools/custom_image_analyser_engine.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytesseract
|
2 |
+
from PIL import Image
|
3 |
+
import numpy as np
|
4 |
+
from presidio_analyzer import AnalyzerEngine, RecognizerResult
|
5 |
+
from typing import List, Dict, Optional, Union, Tuple
|
6 |
+
from dataclasses import dataclass
|
7 |
+
|
8 |
+
@dataclass
class OCRResult:
    """A single OCR-recognised text span and its bounding box.

    Coordinates are absolute pixels with the origin at the top-left of the
    page image (as produced by pytesseract and by the Textract converter).
    """
    text: str    # the recognised text content
    left: int    # x coordinate of the left edge
    top: int     # y coordinate of the top edge
    width: int   # box width in pixels
    height: int  # box height in pixels
|
15 |
+
|
16 |
+
@dataclass
class CustomImageRecognizerResult:
    """A PII (or signature/handwriting) detection mapped onto the page image.

    Combines the analyzer's entity information (character offsets into the
    combined OCR text, plus a confidence score) with the pixel bounding box
    of the OCR word it was matched to.
    """
    entity_type: str  # e.g. "PERSON", "HANDWRITING", "Signature"
    start: int        # start offset of the entity in the combined OCR text
    end: int          # end offset of the entity in the combined OCR text
    score: float      # detection confidence
    left: int         # x coordinate of the left edge (pixels)
    top: int          # y coordinate of the top edge (pixels)
    width: int        # box width in pixels
    height: int       # box height in pixels
    text: str         # the OCR word text covered by this box
|
27 |
+
|
28 |
+
class CustomImageAnalyzerEngine:
    """Run OCR (pytesseract) over an image and map Presidio analyzer hits
    back onto the OCR word bounding boxes."""

    def __init__(
        self,
        analyzer_engine: Optional[AnalyzerEngine] = None,
        tesseract_config: Optional[str] = None
    ):
        # Fall back to a default Presidio AnalyzerEngine when none is supplied
        if not analyzer_engine:
            analyzer_engine = AnalyzerEngine()
        self.analyzer_engine = analyzer_engine
        # --oem 3: default OCR engine mode; --psm 11: sparse text page segmentation
        self.tesseract_config = tesseract_config or '--oem 3 --psm 11'

    def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
        """Run tesseract over *image* (path, PIL image, or numpy array) and
        return one OCRResult per recognised word, in reading order."""
        # Ensure image is a PIL Image
        if isinstance(image, str):
            image = Image.open(image)
        elif isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT, config=self.tesseract_config)

        # Filter out empty strings and low confidence results
        # (conf <= 0 covers tesseract's -1 "no confidence" entries)
        valid_indices = [i for i, text in enumerate(ocr_data['text']) if text.strip() and int(ocr_data['conf'][i]) > 0]

        return [
            OCRResult(
                text=ocr_data['text'][i],
                left=ocr_data['left'][i],
                top=ocr_data['top'][i],
                width=ocr_data['width'][i],
                height=ocr_data['height'][i]
            )
            for i in valid_indices
        ]

    def analyze_text(
        self,
        ocr_results: List[OCRResult],
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:
        """Analyse the combined OCR text for PII and return detections mapped
        to word bounding boxes. Extra kwargs are passed to AnalyzerEngine.analyze."""
        # Combine all OCR text; words are joined with single spaces, so the
        # character offsets below assume exactly one space between words
        full_text = ' '.join([result.text for result in ocr_results])

        # Define English as default language, if not specified
        if "language" not in text_analyzer_kwargs:
            text_analyzer_kwargs["language"] = "en"

        analyzer_result = self.analyzer_engine.analyze(
            text=full_text, **text_analyzer_kwargs
        )

        # allow_list is also forwarded to analyze() above via the kwargs;
        # it is read again here for the box-filtering step
        allow_list = text_analyzer_kwargs.get('allow_list', [])

        return self.map_analyzer_results_to_bounding_boxes(
            analyzer_result, ocr_results, full_text, allow_list
        )

    @staticmethod
    def map_analyzer_results_to_bounding_boxes(
        text_analyzer_results: List[RecognizerResult],
        ocr_results: List[OCRResult],
        full_text: str,
        allow_list: List[str],
    ) -> List[CustomImageRecognizerResult]:
        """For each OCR word, emit a CustomImageRecognizerResult for the first
        analyzer hit whose character span overlaps the word's span in full_text.

        Words listed in allow_list are skipped. A multi-word entity therefore
        produces one box per overlapping word (one hit max per word due to the
        break below)."""
        pii_bboxes = []
        # Running character offset of the current word within full_text
        text_position = 0

        for ocr_result in ocr_results:
            word_end = text_position + len(ocr_result.text)

            for result in text_analyzer_results:
                # Half-open interval overlap test between the word span
                # [text_position, word_end) and the entity span [start, end)
                if (max(text_position, result.start) < min(word_end, result.end)) and (ocr_result.text not in allow_list):
                    pii_bboxes.append(
                        CustomImageRecognizerResult(
                            entity_type=result.entity_type,
                            start=result.start,
                            end=result.end,
                            score=result.score,
                            left=ocr_result.left,
                            top=ocr_result.top,
                            width=ocr_result.width,
                            height=ocr_result.height,
                            text=ocr_result.text
                        )
                    )
                    break

            text_position = word_end + 1  # +1 for the space between words

        return pii_bboxes
|
tools/file_conversion.py
CHANGED
@@ -49,7 +49,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
|
|
49 |
#for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
50 |
for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
51 |
|
52 |
-
|
53 |
|
54 |
# Convert one page to image
|
55 |
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
|
@@ -128,8 +128,8 @@ def prepare_image_or_text_pdf(
|
|
128 |
tic = time.perf_counter()
|
129 |
|
130 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
131 |
-
|
132 |
-
|
133 |
|
134 |
# If this is the first time around, set variables to 0/blank
|
135 |
if first_loop_state==True:
|
@@ -150,8 +150,11 @@ def prepare_image_or_text_pdf(
|
|
150 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
151 |
if latest_file_completed >= len(file_paths):
|
152 |
print("Last file reached, returning files:", str(latest_file_completed))
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
155 |
|
156 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
157 |
|
@@ -178,7 +181,7 @@ def prepare_image_or_text_pdf(
|
|
178 |
print(out_message)
|
179 |
return out_message, out_file_paths
|
180 |
|
181 |
-
if in_redact_method == "Image analysis":
|
182 |
# Analyse and redact image-based pdf or image
|
183 |
if is_pdf_or_image(file_path) == False:
|
184 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
|
|
49 |
#for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
50 |
for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
51 |
|
52 |
+
print("Converting page: ", str(page_num + 1))
|
53 |
|
54 |
# Convert one page to image
|
55 |
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
|
|
|
128 |
tic = time.perf_counter()
|
129 |
|
130 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
131 |
+
if isinstance(out_message, str):
|
132 |
+
out_message = [out_message]
|
133 |
|
134 |
# If this is the first time around, set variables to 0/blank
|
135 |
if first_loop_state==True:
|
|
|
150 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
151 |
if latest_file_completed >= len(file_paths):
|
152 |
print("Last file reached, returning files:", str(latest_file_completed))
|
153 |
+
if isinstance(out_message, list):
|
154 |
+
final_out_message = '\n'.join(out_message)
|
155 |
+
else:
|
156 |
+
final_out_message = out_message
|
157 |
+
return final_out_message, out_file_paths
|
158 |
|
159 |
#in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
160 |
|
|
|
181 |
print(out_message)
|
182 |
return out_message, out_file_paths
|
183 |
|
184 |
+
if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
|
185 |
# Analyse and redact image-based pdf or image
|
186 |
if is_pdf_or_image(file_path) == False:
|
187 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
tools/file_redaction.py
CHANGED
@@ -1,23 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from PIL import Image, ImageChops, ImageDraw
|
2 |
from typing import List
|
3 |
import pandas as pd
|
4 |
-
|
5 |
from presidio_image_redactor.entities import ImageRecognizerResult
|
6 |
from pdfminer.high_level import extract_pages
|
7 |
-
from tools.file_conversion import process_file
|
8 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
|
9 |
from pikepdf import Pdf, Dictionary, Name
|
|
|
10 |
from gradio import Progress
|
11 |
-
|
12 |
-
import re
|
13 |
from collections import defaultdict # For efficient grouping
|
14 |
|
|
|
|
|
15 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
16 |
from tools.helper_functions import get_file_path_end, output_folder
|
17 |
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
|
18 |
from tools.data_anonymise import generate_decision_process_output
|
19 |
-
import
|
20 |
-
|
21 |
|
22 |
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
|
23 |
|
@@ -93,17 +98,20 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
93 |
print(out_message)
|
94 |
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
95 |
|
96 |
-
if in_redact_method == "Image analysis":
|
97 |
# Analyse and redact image-based pdf or image
|
98 |
# if is_pdf_or_image(file_path) == False:
|
99 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
100 |
|
101 |
-
print("Redacting file as image-based file")
|
102 |
-
pdf_images, output_logs = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max)
|
103 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
104 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
105 |
|
106 |
out_file_paths.append(out_image_file_path)
|
|
|
|
|
|
|
107 |
out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
108 |
|
109 |
output_logs_str = str(output_logs)
|
@@ -118,16 +126,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
118 |
latest_file_completed += 1
|
119 |
|
120 |
elif in_redact_method == "Text analysis":
|
|
|
121 |
if is_pdf(file_path) == False:
|
122 |
return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
|
123 |
-
|
124 |
# Analyse text-based pdf
|
125 |
print('Redacting file as text-based PDF')
|
126 |
-
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max)
|
127 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
128 |
-
pdf_text.save(out_text_file_path)
|
129 |
-
|
130 |
-
|
131 |
|
132 |
# Convert message
|
133 |
convert_message="Converting PDF to image-based PDF to embed redactions."
|
@@ -170,55 +177,60 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
170 |
|
171 |
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
172 |
|
173 |
-
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
205 |
'''
|
206 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
207 |
'''
|
208 |
-
|
209 |
-
|
|
|
|
|
210 |
decision_process_output_str = ""
|
|
|
|
|
211 |
|
212 |
if not image_paths:
|
213 |
out_message = "PDF does not exist as images. Converting pages to image"
|
214 |
print(out_message)
|
215 |
-
#progress(0, desc=out_message)
|
216 |
|
217 |
image_paths = process_file(file_path)
|
218 |
|
219 |
-
print("image_paths:", image_paths)
|
220 |
-
|
221 |
-
|
222 |
if not isinstance(image_paths, list):
|
223 |
print("Converting image_paths to list")
|
224 |
image_paths = [image_paths]
|
@@ -235,84 +247,142 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
235 |
# Check that page_min and page_max are within expected ranges
|
236 |
if page_max > number_of_pages or page_max == 0:
|
237 |
page_max = number_of_pages
|
238 |
-
#else:
|
239 |
-
# page_max = page_max - 1
|
240 |
|
241 |
if page_min <= 0:
|
242 |
page_min = 0
|
243 |
else:
|
244 |
page_min = page_min - 1
|
245 |
|
246 |
-
print("Page range:", str(page_min), "to", str(page_max))
|
247 |
|
248 |
#for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
249 |
|
250 |
-
images = []
|
251 |
-
|
252 |
for n in range(0, number_of_pages):
|
|
|
253 |
|
254 |
try:
|
255 |
image = image_paths[0][n]#.copy()
|
256 |
print("Skipping page", str(n))
|
257 |
#print("image:", image)
|
258 |
except Exception as e:
|
259 |
-
print("Could not redact page:", str(
|
260 |
print(e)
|
261 |
continue
|
262 |
|
263 |
-
if n >= page_min and n
|
264 |
-
#for i in range(page_min, page_max):
|
265 |
|
266 |
i = n
|
267 |
|
268 |
-
|
269 |
|
270 |
-
|
271 |
-
#print("image_paths:", image_paths)
|
272 |
-
|
273 |
-
#image = ImageChops.duplicate(image_paths[i])
|
274 |
-
#print("Image paths i:", image_paths[0])
|
275 |
|
276 |
# Assuming image_paths[i] is your PIL image object
|
277 |
try:
|
278 |
image = image_paths[0][i]#.copy()
|
279 |
#print("image:", image)
|
280 |
except Exception as e:
|
281 |
-
print("Could not redact page:",
|
282 |
print(e)
|
283 |
continue
|
284 |
|
285 |
# %%
|
286 |
-
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
287 |
-
engine = ImageRedactorEngine(image_analyser)
|
288 |
|
289 |
if language == 'en':
|
290 |
ocr_lang = 'eng'
|
291 |
else: ocr_lang = language
|
292 |
|
293 |
-
bboxes = image_analyser.analyze(image,
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
if bboxes:
|
|
|
|
|
304 |
decision_process_output_str = str(bboxes)
|
305 |
print("Decision process:", decision_process_output_str)
|
306 |
-
|
307 |
-
#print("For page: ", str(i), "Bounding boxes: ", bboxes)
|
308 |
|
309 |
-
|
310 |
-
|
311 |
-
merged_bboxes = merge_img_bboxes(bboxes)
|
312 |
|
313 |
#print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
|
|
|
|
|
|
|
|
|
|
|
314 |
|
315 |
-
# 3. Draw the merged boxes (unchanged)
|
316 |
for box in merged_bboxes:
|
317 |
x0 = box.left
|
318 |
y0 = box.top
|
@@ -322,7 +392,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
|
|
322 |
|
323 |
images.append(image)
|
324 |
|
325 |
-
return images, decision_process_output_str
|
326 |
|
327 |
def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
|
328 |
if isinstance(text_container, LTTextContainer):
|
@@ -343,16 +413,82 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
|
|
343 |
return [], []
|
344 |
|
345 |
# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
|
346 |
-
def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
analyzed_bounding_boxes = []
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
current_y = None
|
352 |
|
|
|
|
|
353 |
for i, result in enumerate(analyzer_results):
|
354 |
-
print("Considering result", str(i))
|
355 |
-
|
|
|
|
|
|
|
356 |
if isinstance(char, LTChar):
|
357 |
char_box = list(char.bbox)
|
358 |
# Add vertical padding to the top of the box
|
@@ -378,24 +514,55 @@ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, verti
|
|
378 |
# Reset current_box and current_y after appending
|
379 |
current_box = char_box
|
380 |
current_y = char_box[1]
|
381 |
-
|
382 |
# After finishing with the current result, add the last box for this result
|
383 |
if current_box:
|
384 |
merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
|
385 |
current_box = None
|
386 |
current_y = None # Reset for the next result
|
387 |
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
|
398 |
-
|
399 |
|
400 |
return analyzed_bounding_boxes
|
401 |
|
@@ -437,7 +604,7 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
|
|
437 |
annotations_on_page.append(annotation)
|
438 |
return annotations_on_page
|
439 |
|
440 |
-
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, progress=Progress(track_tqdm=True)):
|
441 |
'''
|
442 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
443 |
'''
|
@@ -469,6 +636,12 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
469 |
|
470 |
print("Page number is:", page_no)
|
471 |
|
|
|
|
|
|
|
|
|
|
|
|
|
472 |
annotations_on_page = []
|
473 |
decision_process_table_on_page = []
|
474 |
|
@@ -480,13 +653,23 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
480 |
text_container_analyzed_bounding_boxes = []
|
481 |
characters = []
|
482 |
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
487 |
|
488 |
-
|
489 |
-
|
490 |
|
491 |
decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
|
492 |
|
|
|
1 |
+
import time
|
2 |
+
import re
|
3 |
+
import json
|
4 |
+
import io
|
5 |
+
import os
|
6 |
from PIL import Image, ImageChops, ImageDraw
|
7 |
from typing import List
|
8 |
import pandas as pd
|
9 |
+
|
10 |
from presidio_image_redactor.entities import ImageRecognizerResult
|
11 |
from pdfminer.high_level import extract_pages
|
|
|
12 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
|
13 |
from pikepdf import Pdf, Dictionary, Name
|
14 |
+
import gradio as gr
|
15 |
from gradio import Progress
|
16 |
+
|
|
|
17 |
from collections import defaultdict # For efficient grouping
|
18 |
|
19 |
+
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult
|
20 |
+
from tools.file_conversion import process_file
|
21 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
22 |
from tools.helper_functions import get_file_path_end, output_folder
|
23 |
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
|
24 |
from tools.data_anonymise import generate_decision_process_output
|
25 |
+
from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
|
|
|
26 |
|
27 |
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, progress=gr.Progress(track_tqdm=True)):
|
28 |
|
|
|
98 |
print(out_message)
|
99 |
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
100 |
|
101 |
+
if in_redact_method == "Image analysis" or in_redact_method == "AWS Textract":
|
102 |
# Analyse and redact image-based pdf or image
|
103 |
# if is_pdf_or_image(file_path) == False:
|
104 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
105 |
|
106 |
+
print("Redacting file" + file_path_without_ext + "as an image-based file")
|
107 |
+
pdf_images, output_logs, logging_file_paths = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method)
|
108 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
109 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
110 |
|
111 |
out_file_paths.append(out_image_file_path)
|
112 |
+
if logging_file_paths:
|
113 |
+
log_files_output_paths.extend(logging_file_paths)
|
114 |
+
|
115 |
out_message.append("File '" + file_path_without_ext + "' successfully redacted")
|
116 |
|
117 |
output_logs_str = str(output_logs)
|
|
|
126 |
latest_file_completed += 1
|
127 |
|
128 |
elif in_redact_method == "Text analysis":
|
129 |
+
|
130 |
if is_pdf(file_path) == False:
|
131 |
return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
|
132 |
+
|
133 |
# Analyse text-based pdf
|
134 |
print('Redacting file as text-based PDF')
|
135 |
+
pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Text analysis")
|
136 |
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
137 |
+
pdf_text.save(out_text_file_path)
|
|
|
|
|
138 |
|
139 |
# Convert message
|
140 |
convert_message="Converting PDF to image-based PDF to embed redactions."
|
|
|
177 |
|
178 |
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state
|
179 |
|
180 |
+
def merge_img_bboxes(bboxes, handwriting_or_signature_boxes = [], horizontal_threshold=150, vertical_threshold=25):
    '''
    Merge bounding boxes that sit on (approximately) the same line and are
    horizontally close, so adjacent redaction boxes become one.

    Boxes are grouped by rounded vertical position (vertical_threshold pixels
    per band), sorted left-to-right within each band, and merged whenever the
    horizontal gap to the previous box is <= horizontal_threshold.
    handwriting_or_signature_boxes are included in the merge as-is.
    '''
    merged_bboxes = []
    grouped_bboxes = defaultdict(list)

    # Bug fix: previously bboxes.extend(...) mutated the caller's list AND the
    # shared mutable default argument. Work on a local copy instead.
    all_bboxes = list(bboxes)
    if handwriting_or_signature_boxes:
        print("Handwriting or signature boxes exist at merge:", handwriting_or_signature_boxes)
        all_bboxes.extend(handwriting_or_signature_boxes)

    # 1. Group by approximate vertical proximity
    for box in all_bboxes:
        grouped_bboxes[round(box.top / vertical_threshold)].append(box)

    # 2. Merge within each group
    for _, group in grouped_bboxes.items():
        group.sort(key=lambda box: box.left)

        merged_box = group[0]
        for next_box in group[1:]:
            # Merge when the horizontal gap between boxes is small enough
            if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                # Calculate new dimensions for the merged box
                # (debug print of every intermediate merged box removed)
                new_left = min(merged_box.left, next_box.left)
                new_top = min(merged_box.top, next_box.top)
                new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
                new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
                merged_box = ImageRecognizerResult(
                    merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height
                )
            else:
                merged_bboxes.append(merged_box)
                merged_box = next_box

        merged_bboxes.append(merged_box)
    return merged_bboxes
|
215 |
+
|
216 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Image analysis", progress=Progress(track_tqdm=True)):
|
217 |
'''
|
218 |
Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
|
219 |
'''
|
220 |
+
# json_file_path is for AWS Textract outputs
|
221 |
+
logging_file_paths = []
|
222 |
+
file_name = get_file_path_end(file_path)
|
223 |
+
fill = (0, 0, 0) # Fill colour
|
224 |
decision_process_output_str = ""
|
225 |
+
images = []
|
226 |
+
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
227 |
|
228 |
if not image_paths:
|
229 |
out_message = "PDF does not exist as images. Converting pages to image"
|
230 |
print(out_message)
|
|
|
231 |
|
232 |
image_paths = process_file(file_path)
|
233 |
|
|
|
|
|
|
|
234 |
if not isinstance(image_paths, list):
|
235 |
print("Converting image_paths to list")
|
236 |
image_paths = [image_paths]
|
|
|
247 |
# Check that page_min and page_max are within expected ranges
|
248 |
if page_max > number_of_pages or page_max == 0:
|
249 |
page_max = number_of_pages
|
|
|
|
|
250 |
|
251 |
if page_min <= 0:
|
252 |
page_min = 0
|
253 |
else:
|
254 |
page_min = page_min - 1
|
255 |
|
256 |
+
print("Page range:", str(page_min + 1), "to", str(page_max))
|
257 |
|
258 |
#for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
259 |
|
|
|
|
|
260 |
for n in range(0, number_of_pages):
|
261 |
+
handwriting_or_signature_boxes = []
|
262 |
|
263 |
try:
|
264 |
image = image_paths[0][n]#.copy()
|
265 |
print("Skipping page", str(n))
|
266 |
#print("image:", image)
|
267 |
except Exception as e:
|
268 |
+
print("Could not redact page:", str(n), "due to:")
|
269 |
print(e)
|
270 |
continue
|
271 |
|
272 |
+
if n >= page_min and n < page_max:
|
|
|
273 |
|
274 |
i = n
|
275 |
|
276 |
+
reported_page_number = str(i + 1)
|
277 |
|
278 |
+
print("Redacting page", reported_page_number)
|
|
|
|
|
|
|
|
|
279 |
|
280 |
# Assuming image_paths[i] is your PIL image object
|
281 |
try:
|
282 |
image = image_paths[0][i]#.copy()
|
283 |
#print("image:", image)
|
284 |
except Exception as e:
|
285 |
+
print("Could not redact page:", reported_page_number, "due to:")
|
286 |
print(e)
|
287 |
continue
|
288 |
|
289 |
# %%
|
290 |
+
# image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
291 |
+
# engine = ImageRedactorEngine(image_analyser)
|
292 |
|
293 |
if language == 'en':
|
294 |
ocr_lang = 'eng'
|
295 |
else: ocr_lang = language
|
296 |
|
297 |
+
# bboxes = image_analyser.analyze(image,
|
298 |
+
# ocr_kwargs={"lang": ocr_lang},
|
299 |
+
# **{
|
300 |
+
# "allow_list": allow_list,
|
301 |
+
# "language": language,
|
302 |
+
# "entities": chosen_redact_entities,
|
303 |
+
# "score_threshold": score_threshold,
|
304 |
+
# "return_decision_process":True,
|
305 |
+
# })
|
306 |
+
|
307 |
+
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
308 |
+
if analysis_type == "Image analysis":
|
309 |
+
ocr_results = image_analyser.perform_ocr(image)
|
310 |
+
|
311 |
+
# Process all OCR text with bounding boxes
|
312 |
+
#print("OCR results:", ocr_results)
|
313 |
+
ocr_results_str = str(ocr_results)
|
314 |
+
ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_page_" + reported_page_number + ".txt"
|
315 |
+
with open(ocr_results_file_path, "w") as f:
|
316 |
+
f.write(ocr_results_str)
|
317 |
+
logging_file_paths.append(ocr_results_file_path)
|
318 |
+
|
319 |
+
# Import results from json and convert
|
320 |
+
if analysis_type == "AWS Textract":
|
321 |
+
|
322 |
+
# Ensure image is a PIL Image object
|
323 |
+
# if isinstance(image, str):
|
324 |
+
# image = Image.open(image)
|
325 |
+
# elif not isinstance(image, Image.Image):
|
326 |
+
# print(f"Unexpected image type on page {i}: {type(image)}")
|
327 |
+
# continue
|
328 |
+
|
329 |
+
# Convert the image to bytes using an in-memory buffer
|
330 |
+
image_buffer = io.BytesIO()
|
331 |
+
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
332 |
+
pdf_page_as_bytes = image_buffer.getvalue()
|
333 |
+
|
334 |
+
json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
|
335 |
+
|
336 |
+
if not os.path.exists(json_file_path):
|
337 |
+
text_blocks = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
|
338 |
+
logging_file_paths.append(json_file_path)
|
339 |
+
else:
|
340 |
+
# Open the file and load the JSON data
|
341 |
+
print("Found existing Textract json results file for this page.")
|
342 |
+
with open(json_file_path, 'r') as json_file:
|
343 |
+
text_blocks = json.load(json_file)
|
344 |
+
text_blocks = text_blocks['Blocks']
|
345 |
+
|
346 |
+
|
347 |
+
# Need image size to convert textract OCR outputs to the correct sizes
|
348 |
+
#print("Image size:", image.size)
|
349 |
+
page_width, page_height = image.size
|
350 |
+
|
351 |
+
ocr_results, handwriting_or_signature_boxes = json_to_ocrresult(text_blocks, page_width, page_height)
|
352 |
+
|
353 |
+
#print("OCR results:", ocr_results)
|
354 |
+
ocr_results_str = str(ocr_results)
|
355 |
+
textract_ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_page_" + reported_page_number + "_textract.txt"
|
356 |
+
with open(textract_ocr_results_file_path, "w") as f:
|
357 |
+
f.write(ocr_results_str)
|
358 |
+
logging_file_paths.append(textract_ocr_results_file_path)
|
359 |
+
|
360 |
+
# Step 2: Analyze text and identify PII
|
361 |
+
bboxes = image_analyser.analyze_text(
|
362 |
+
ocr_results,
|
363 |
+
language=language,
|
364 |
+
entities=chosen_redact_entities,
|
365 |
+
allow_list=allow_list,
|
366 |
+
score_threshold=score_threshold,
|
367 |
+
)
|
368 |
+
|
369 |
+
# Process the bboxes (PII entities)
|
370 |
if bboxes:
|
371 |
+
for bbox in bboxes:
|
372 |
+
print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
|
373 |
decision_process_output_str = str(bboxes)
|
374 |
print("Decision process:", decision_process_output_str)
|
|
|
|
|
375 |
|
376 |
+
# Merge close bounding boxes
|
377 |
+
merged_bboxes = merge_img_bboxes(bboxes, handwriting_or_signature_boxes)
|
|
|
378 |
|
379 |
#print("For page:", str(i), "Merged bounding boxes:", merged_bboxes)
|
380 |
+
#from PIL import Image
|
381 |
+
#image_object = Image.open(image)
|
382 |
+
|
383 |
+
# 3. Draw the merged boxes
|
384 |
+
draw = ImageDraw.Draw(image)
|
385 |
|
|
|
386 |
for box in merged_bboxes:
|
387 |
x0 = box.left
|
388 |
y0 = box.top
|
|
|
392 |
|
393 |
images.append(image)
|
394 |
|
395 |
+
return images, decision_process_output_str, logging_file_paths
|
396 |
|
397 |
def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
|
398 |
if isinstance(text_container, LTTextContainer):
|
|
|
413 |
return [], []
|
414 |
|
415 |
# Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
|
416 |
+
# def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
|
417 |
+
# '''
|
418 |
+
# Merge identified bounding boxes containing PII that are very close to one another
|
419 |
+
# '''
|
420 |
+
# analyzed_bounding_boxes = []
|
421 |
+
# if len(analyzer_results) > 0 and len(characters) > 0:
|
422 |
+
# merged_bounding_boxes = []
|
423 |
+
# current_box = None
|
424 |
+
# current_y = None
|
425 |
+
|
426 |
+
# for i, result in enumerate(analyzer_results):
|
427 |
+
# print("Considering result", str(i))
|
428 |
+
# for char in characters[result.start : result.end]:
|
429 |
+
# if isinstance(char, LTChar):
|
430 |
+
# char_box = list(char.bbox)
|
431 |
+
# # Add vertical padding to the top of the box
|
432 |
+
# char_box[3] += vertical_padding
|
433 |
+
|
434 |
+
# if current_y is None or current_box is None:
|
435 |
+
# current_box = char_box
|
436 |
+
# current_y = char_box[1]
|
437 |
+
# else:
|
438 |
+
# vertical_diff_bboxes = abs(char_box[1] - current_y)
|
439 |
+
# horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
|
440 |
+
|
441 |
+
# if (
|
442 |
+
# vertical_diff_bboxes <= 5
|
443 |
+
# and horizontal_diff_bboxes <= combine_pixel_dist
|
444 |
+
# ):
|
445 |
+
# current_box[2] = char_box[2] # Extend the current box horizontally
|
446 |
+
# current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
|
447 |
+
# else:
|
448 |
+
# merged_bounding_boxes.append(
|
449 |
+
# {"boundingBox": current_box, "result": result})
|
450 |
+
|
451 |
+
# # Reset current_box and current_y after appending
|
452 |
+
# current_box = char_box
|
453 |
+
# current_y = char_box[1]
|
454 |
+
|
455 |
+
# # After finishing with the current result, add the last box for this result
|
456 |
+
# if current_box:
|
457 |
+
# merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
|
458 |
+
# current_box = None
|
459 |
+
# current_y = None # Reset for the next result
|
460 |
+
|
461 |
+
# if not merged_bounding_boxes:
|
462 |
+
# analyzed_bounding_boxes.extend(
|
463 |
+
# {"boundingBox": char.bbox, "result": result}
|
464 |
+
# for result in analyzer_results
|
465 |
+
# for char in characters[result.start:result.end]
|
466 |
+
# if isinstance(char, LTChar)
|
467 |
+
# )
|
468 |
+
# else:
|
469 |
+
# analyzed_bounding_boxes.extend(merged_bounding_boxes)
|
470 |
+
|
471 |
+
# print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
|
472 |
+
|
473 |
+
# return analyzed_bounding_boxes
|
474 |
+
|
475 |
+
def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2, signature_bounding_boxes=None):
|
476 |
+
'''
|
477 |
+
Merge identified bounding boxes containing PII or signatures that are very close to one another.
|
478 |
+
'''
|
479 |
analyzed_bounding_boxes = []
|
480 |
+
merged_bounding_boxes = []
|
481 |
+
current_box = None
|
482 |
+
current_y = None
|
|
|
483 |
|
484 |
+
# Handle PII and text bounding boxes first
|
485 |
+
if len(analyzer_results) > 0 and len(characters) > 0:
|
486 |
for i, result in enumerate(analyzer_results):
|
487 |
+
#print("Considering result", str(i))
|
488 |
+
#print("Result:", result)
|
489 |
+
#print("Characters:", characters)
|
490 |
+
|
491 |
+
for char in characters[result.start: result.end]:
|
492 |
if isinstance(char, LTChar):
|
493 |
char_box = list(char.bbox)
|
494 |
# Add vertical padding to the top of the box
|
|
|
514 |
# Reset current_box and current_y after appending
|
515 |
current_box = char_box
|
516 |
current_y = char_box[1]
|
517 |
+
|
518 |
# After finishing with the current result, add the last box for this result
|
519 |
if current_box:
|
520 |
merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
|
521 |
current_box = None
|
522 |
current_y = None # Reset for the next result
|
523 |
|
524 |
+
# Handle signature bounding boxes (without specific characters)
|
525 |
+
if signature_bounding_boxes is not None:
|
526 |
+
for sig_box in signature_bounding_boxes:
|
527 |
+
sig_box = list(sig_box) # Ensure it's a list to modify the values
|
528 |
+
if current_y is None or current_box is None:
|
529 |
+
current_box = sig_box
|
530 |
+
current_y = sig_box[1]
|
531 |
+
else:
|
532 |
+
vertical_diff_bboxes = abs(sig_box[1] - current_y)
|
533 |
+
horizontal_diff_bboxes = abs(sig_box[0] - current_box[2])
|
534 |
+
|
535 |
+
if (
|
536 |
+
vertical_diff_bboxes <= 5
|
537 |
+
and horizontal_diff_bboxes <= combine_pixel_dist
|
538 |
+
):
|
539 |
+
current_box[2] = sig_box[2] # Extend the current box horizontally
|
540 |
+
current_box[3] = max(current_box[3], sig_box[3]) # Ensure the top is the highest
|
541 |
+
else:
|
542 |
+
merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
|
543 |
+
|
544 |
+
# Reset current_box and current_y after appending
|
545 |
+
current_box = sig_box
|
546 |
+
current_y = sig_box[1]
|
547 |
+
|
548 |
+
# Add the last bounding box for the signature
|
549 |
+
if current_box:
|
550 |
+
merged_bounding_boxes.append({"boundingBox": current_box, "type": "signature"})
|
551 |
+
current_box = None
|
552 |
+
current_y = None
|
553 |
+
|
554 |
+
# If no bounding boxes were merged, add individual character bounding boxes
|
555 |
+
if not merged_bounding_boxes:
|
556 |
+
analyzed_bounding_boxes.extend(
|
557 |
+
{"boundingBox": char.bbox, "result": result}
|
558 |
+
for result in analyzer_results
|
559 |
+
for char in characters[result.start:result.end]
|
560 |
+
if isinstance(char, LTChar)
|
561 |
+
)
|
562 |
+
else:
|
563 |
+
analyzed_bounding_boxes.extend(merged_bounding_boxes)
|
564 |
|
565 |
+
#print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
|
566 |
|
567 |
return analyzed_bounding_boxes
|
568 |
|
|
|
604 |
annotations_on_page.append(annotation)
|
605 |
return annotations_on_page
|
606 |
|
607 |
+
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Text analysis", progress=Progress(track_tqdm=True)):
|
608 |
'''
|
609 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
610 |
'''
|
|
|
636 |
|
637 |
print("Page number is:", page_no)
|
638 |
|
639 |
+
# The /MediaBox in a PDF specifies the size of the page [left, bottom, right, top]
|
640 |
+
media_box = page.MediaBox
|
641 |
+
page_width = media_box[2] - media_box[0]
|
642 |
+
page_height = media_box[3] - media_box[1]
|
643 |
+
|
644 |
+
|
645 |
annotations_on_page = []
|
646 |
decision_process_table_on_page = []
|
647 |
|
|
|
653 |
text_container_analyzed_bounding_boxes = []
|
654 |
characters = []
|
655 |
|
656 |
+
if analysis_type == "Text analysis":
|
657 |
+
for i, text_container in enumerate(page_layout):
|
658 |
+
|
659 |
+
text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
|
660 |
+
|
661 |
+
# Merge bounding boxes if very close together
|
662 |
+
text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
|
663 |
+
|
664 |
+
|
665 |
+
page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
|
666 |
+
page_analyzer_results.extend(text_container_analyzer_results)
|
667 |
+
|
668 |
+
# Merge bounding boxes if very close together
|
669 |
+
text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
|
670 |
|
671 |
+
page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
|
672 |
+
page_analyzer_results.extend(text_container_analyzer_results)
|
673 |
|
674 |
decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
|
675 |
|