Sean Pedrick-Case commited on
Commit
d8f9fd9
·
unverified ·
2 Parent(s): e6934e4 b805ec6

Merge pull request #12 from seanpedrick-case/dev

Browse files
DocRedactApp_0.2.0.spec → DocRedactApp_0.4.0.spec RENAMED
@@ -43,7 +43,7 @@ exe = EXE(
43
  a.scripts,
44
  [],
45
  exclude_binaries=True,
46
- name='DocRedactApp_0.2.0',
47
  debug=False,
48
  bootloader_ignore_signals=False,
49
  strip=False,
@@ -62,5 +62,5 @@ coll = COLLECT(
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
- name='DocRedactApp_0.2.0',
66
  )
 
43
  a.scripts,
44
  [],
45
  exclude_binaries=True,
46
+ name='DocRedactApp_0.4.0',
47
  debug=False,
48
  bootloader_ignore_signals=False,
49
  strip=False,
 
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
+ name='DocRedactApp_0.4.0',
66
  )
Dockerfile CHANGED
@@ -62,8 +62,8 @@ RUN mkdir -p /home/user/app/output \
62
  # Copy installed packages from builder stage
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
- # Download NLTK data packages
66
- RUN python -m nltk.downloader punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
 
62
  # Copy installed packages from builder stage
63
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
64
 
65
+ # Download NLTK data packages - now no longer necessary
66
+ # RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
67
 
68
  # Entrypoint helps to switch between Gradio and Lambda mode
69
  COPY entrypoint.sh /entrypoint.sh
README.md CHANGED
@@ -317,8 +317,8 @@ The Redaction Settings tab now has boxes for entering the AWS access key and sec
317
  ### Picking up AWS access keys through an .env file
318
  The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
319
 
320
- AWS_ACCESS_KEY=<your-access-key>
321
- AWS_SECRET_KEY=<your-secret-key>
322
 
323
  The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
324
 
 
317
  ### Picking up AWS access keys through an .env file
318
  The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
319
 
320
+ AWS_ACCESS_KEY= your-access-key
321
+ AWS_SECRET_KEY= your-secret-key
322
 
323
  The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
324
 
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
how_to_create_exe_dist.txt CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
16
 
17
  9.Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
18
 
19
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.3.0 app.py
20
 
21
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
22
 
@@ -32,12 +32,12 @@ a = Analysis(
32
 
33
  hook-presidio-image-redactor.py
34
 
35
- c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.3.0.spec
36
 
37
 
38
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
39
 
40
- 10. go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
41
 
42
  def create_or_modify_pyi(
43
  component_class: type, class_name: str, events: list[str | EventListener]
 
16
 
17
  9.Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
18
 
19
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
20
 
21
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
22
 
 
32
 
33
  hook-presidio-image-redactor.py
34
 
35
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
36
 
37
 
38
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
39
 
40
+ 10. go to dist/APP-NAME/internal/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
41
 
42
  def create_or_modify_pyi(
43
  component_class: type, class_name: str, events: list[str | EventListener]
requirements.txt CHANGED
@@ -2,18 +2,18 @@ pdfminer.six==20240706
2
  pdf2image==1.17.0
3
  pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
- presidio_analyzer==2.2.357
6
- presidio_anonymizer==2.2.357
7
- presidio-image-redactor==0.0.55
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
- nltk==3.9.1
11
  scikit-learn==1.6.1
12
- spacy==3.8.3
13
- #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
14
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.18.0
16
- boto3==1.36.26
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
 
2
  pdf2image==1.17.0
3
  pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
+ presidio_analyzer==2.2.358
6
+ presidio_anonymizer==2.2.358
7
+ presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
+ #nltk==3.9.1 # Not required
11
  scikit-learn==1.6.1
12
+ spacy==3.8.4
13
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
+ #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
+ gradio==5.23.3
16
+ boto3==1.37.17
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
tools/auth.py CHANGED
@@ -1,32 +1,12 @@
1
-
2
- import os
3
  import boto3
4
- import gradio as gr
5
  import hmac
6
  import hashlib
7
  import base64
 
8
 
9
- def get_or_create_env_var(var_name, default_value):
10
- # Get the environment variable if it exists
11
- value = os.environ.get(var_name)
12
-
13
- # If it doesn't exist, set it to the default value
14
- if value is None:
15
- os.environ[var_name] = default_value
16
- value = default_value
17
-
18
- return value
19
-
20
- client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
21
- #print(f'The value of AWS_CLIENT_ID is {client_id}')
22
-
23
- client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
24
- #print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
25
-
26
- user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
27
- #print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
28
-
29
- def calculate_secret_hash(client_id, client_secret, username):
30
  message = username + client_id
31
  dig = hmac.new(
32
  str(client_secret).encode('utf-8'),
@@ -36,7 +16,7 @@ def calculate_secret_hash(client_id, client_secret, username):
36
  secret_hash = base64.b64encode(dig).decode()
37
  return secret_hash
38
 
39
- def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id, client_id:str=client_id, client_secret:str=client_secret):
40
  """Authenticates a user against an AWS Cognito user pool.
41
 
42
  Args:
@@ -50,7 +30,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,
50
  bool: True if the user is authenticated, False otherwise.
51
  """
52
 
53
- client = boto3.client('cognito-idp') # Cognito Identity Provider client
54
 
55
  # Compute the secret hash
56
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
 
1
+ #import os
 
2
  import boto3
3
+ #import gradio as gr
4
  import hmac
5
  import hashlib
6
  import base64
7
+ from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_USER_POOL_ID, AWS_REGION
8
 
9
+ def calculate_secret_hash(client_id:str, client_secret:str, username:str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  message = username + client_id
11
  dig = hmac.new(
12
  str(client_secret).encode('utf-8'),
 
16
  secret_hash = base64.b64encode(dig).decode()
17
  return secret_hash
18
 
19
+ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL_ID, client_id:str=AWS_CLIENT_ID, client_secret:str=AWS_CLIENT_SECRET):
20
  """Authenticates a user against an AWS Cognito user pool.
21
 
22
  Args:
 
30
  bool: True if the user is authenticated, False otherwise.
31
  """
32
 
33
+ client = boto3.client('cognito-idp', region_name=AWS_REGION) # Cognito Identity Provider client
34
 
35
  # Compute the secret hash
36
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
tools/aws_functions.py CHANGED
@@ -3,38 +3,9 @@ import pandas as pd
3
  import boto3
4
  import tempfile
5
  import os
6
- from tools.helper_functions import get_or_create_env_var
7
- from dotenv import load_dotenv
8
-
9
  PandasDataFrame = Type[pd.DataFrame]
10
 
11
- # Get AWS credentials
12
- bucket_name=""
13
-
14
- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
15
- print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
16
-
17
- AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
18
- print(f'The value of AWS_REGION is {AWS_REGION}')
19
-
20
- # If you have an aws_config env file in the config folder, you can load in AWS keys this way
21
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
22
- print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
23
-
24
- if os.path.exists(AWS_CONFIG_PATH):
25
- print("Loading AWS keys from config folder")
26
- load_dotenv(AWS_CONFIG_PATH)
27
-
28
- AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
29
- if AWS_ACCESS_KEY:
30
- print(f'AWS_ACCESS_KEY found in environment variables')
31
-
32
- AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
33
- if AWS_SECRET_KEY:
34
- print(f'AWS_SECRET_KEY found in environment variables')
35
-
36
-
37
-
38
  def get_assumed_role_info():
39
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
40
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
@@ -49,18 +20,16 @@ def get_assumed_role_info():
49
  return assumed_role_arn, assumed_role_name
50
 
51
  if RUN_AWS_FUNCTIONS == "1":
52
- try:
53
- bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
54
- session = boto3.Session()
55
-
56
- #print("session:", session)
57
 
58
  except Exception as e:
59
- print("Could not start boto3 session:", e)
60
 
61
  try:
62
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
63
 
 
64
  print("Assumed Role ARN:", assumed_role_arn)
65
  print("Assumed Role Name:", assumed_role_name)
66
 
@@ -68,17 +37,17 @@ if RUN_AWS_FUNCTIONS == "1":
68
  print("Could not get assumed role from STS:", e)
69
 
70
  # Download direct from S3 - requires login credentials
71
- def download_file_from_s3(bucket_name, key, local_file_path_and_name):
72
 
73
- s3 = boto3.client('s3')
74
  s3.download_file(bucket_name, key, local_file_path_and_name)
75
- print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
76
 
77
- def download_folder_from_s3(bucket_name, s3_folder, local_folder):
78
  """
79
  Download all files from an S3 folder to a local folder.
80
  """
81
- s3 = boto3.client('s3')
82
 
83
  # List objects in the specified S3 folder
84
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
@@ -99,11 +68,11 @@ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
99
  except Exception as e:
100
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
101
 
102
- def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
103
  """
104
  Download specific files from an S3 folder to a local folder.
105
  """
106
- s3 = boto3.client('s3')
107
 
108
  print("Trying to download file: ", filenames)
109
 
@@ -132,7 +101,7 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
132
  except Exception as e:
133
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
134
 
135
- def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
136
 
137
  temp_dir = tempfile.mkdtemp()
138
  local_address_stub = temp_dir + '/doc-redaction/'
@@ -183,7 +152,7 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
183
 
184
  return files, out_message
185
 
186
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
187
  """
188
  Uploads a file from local machine to Amazon S3.
189
 
@@ -197,7 +166,7 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
197
  """
198
  final_out_message = []
199
 
200
- s3_client = boto3.client('s3')
201
 
202
  if isinstance(local_file_paths, str):
203
  local_file_paths = [local_file_paths]
 
3
  import boto3
4
  import tempfile
5
  import os
6
+ from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
 
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def get_assumed_role_info():
10
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
11
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
 
20
  return assumed_role_arn, assumed_role_name
21
 
22
  if RUN_AWS_FUNCTIONS == "1":
23
+ try:
24
+ session = boto3.Session(region_name=AWS_REGION)
 
 
 
25
 
26
  except Exception as e:
27
+ print("Could not start boto3 session:", e)
28
 
29
  try:
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
+ print("Successfully assumed ARN role")
33
  print("Assumed Role ARN:", assumed_role_arn)
34
  print("Assumed Role Name:", assumed_role_name)
35
 
 
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
+ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str):
41
 
42
+ s3 = boto3.client('s3', region_name=AWS_REGION)
43
  s3.download_file(bucket_name, key, local_file_path_and_name)
44
+ print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
45
 
46
+ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
47
  """
48
  Download all files from an S3 folder to a local folder.
49
  """
50
+ s3 = boto3.client('s3', region_name=AWS_REGION)
51
 
52
  # List objects in the specified S3 folder
53
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
68
  except Exception as e:
69
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
70
 
71
+ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
72
  """
73
  Download specific files from an S3 folder to a local folder.
74
  """
75
+ s3 = boto3.client('s3', region_name=AWS_REGION)
76
 
77
  print("Trying to download file: ", filenames)
78
 
 
101
  except Exception as e:
102
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
103
 
104
+ def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
105
 
106
  temp_dir = tempfile.mkdtemp()
107
  local_address_stub = temp_dir + '/doc-redaction/'
 
152
 
153
  return files, out_message
154
 
155
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
156
  """
157
  Uploads a file from local machine to Amazon S3.
158
 
 
166
  """
167
  final_out_message = []
168
 
169
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
170
 
171
  if isinstance(local_file_paths, str):
172
  local_file_paths = [local_file_paths]
tools/aws_textract.py CHANGED
@@ -1,16 +1,15 @@
1
  import boto3
2
- #from PIL import Image
3
  from typing import List
4
  import io
5
- #import json
 
 
6
  import pikepdf
7
  import time
8
- # Example: converting this single page to an image
9
- #from pdf2image import convert_from_bytes
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
11
- from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
12
 
13
- def extract_textract_metadata(response):
14
  """Extracts metadata from an AWS Textract response."""
15
 
16
  #print("Document metadata:", response['DocumentMetadata'])
@@ -26,7 +25,7 @@ def extract_textract_metadata(response):
26
  #'NumberOfPages': number_of_pages
27
  })
28
 
29
- def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
30
  '''
31
  Analyse page with AWS Textract
32
  '''
@@ -35,9 +34,9 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
35
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
36
  client = boto3.client('textract',
37
  aws_access_key_id=AWS_ACCESS_KEY,
38
- aws_secret_access_key=AWS_SECRET_KEY)
39
  else:
40
- client = boto3.client('textract')
41
  except:
42
  print("Cannot connect to AWS Textract")
43
  return [], "" # Return an empty list and an empty string
@@ -65,19 +64,27 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
65
  time.sleep(5)
66
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
67
 
 
 
 
 
 
68
  # Wrap the response with the page number in the desired format
69
  wrapped_response = {
70
  'page_no': page_no,
71
  'data': response
72
  }
73
 
 
 
74
  request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
75
 
 
 
76
  # Return a list containing the wrapped response and the metadata
77
  return wrapped_response, request_metadata # Return as a list to match the desired structure
78
 
79
-
80
- def convert_pike_pdf_page_to_bytes(pdf, page_num):
81
  # Create a new empty PDF
82
  new_pdf = pikepdf.Pdf.new()
83
 
@@ -102,8 +109,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
102
 
103
  return pdf_bytes
104
 
105
-
106
- def json_to_ocrresult(json_data, page_width, page_height, page_no):
107
  '''
108
  Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
109
  '''
@@ -123,6 +129,8 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
123
  # Find the specific page data
124
  page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
125
 
 
 
126
  if "Blocks" in page_json_data:
127
  # Access the data for the specific page
128
  text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
@@ -265,4 +273,80 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
265
 
266
  i += 1
267
 
268
- return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import boto3
 
2
  from typing import List
3
  import io
4
+ import os
5
+ import json
6
+ from collections import defaultdict
7
  import pikepdf
8
  import time
 
 
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
+ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
12
+ def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
14
 
15
  #print("Document metadata:", response['DocumentMetadata'])
 
25
  #'NumberOfPages': number_of_pages
26
  })
27
 
28
+ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
29
  '''
30
  Analyse page with AWS Textract
31
  '''
 
34
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
35
  client = boto3.client('textract',
36
  aws_access_key_id=AWS_ACCESS_KEY,
37
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
38
  else:
39
+ client = boto3.client('textract', region_name=AWS_REGION)
40
  except:
41
  print("Cannot connect to AWS Textract")
42
  return [], "" # Return an empty list and an empty string
 
64
  time.sleep(5)
65
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
66
 
67
+ # Add the 'Page' attribute to each block
68
+ if "Blocks" in response:
69
+ for block in response["Blocks"]:
70
+ block["Page"] = page_no # Inject the page number into each block
71
+
72
  # Wrap the response with the page number in the desired format
73
  wrapped_response = {
74
  'page_no': page_no,
75
  'data': response
76
  }
77
 
78
+ #print("response:", response)
79
+
80
  request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
81
 
82
+ #print("request_metadata:", request_metadata)
83
+
84
  # Return a list containing the wrapped response and the metadata
85
  return wrapped_response, request_metadata # Return as a list to match the desired structure
86
 
87
+ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
 
88
  # Create a new empty PDF
89
  new_pdf = pikepdf.Pdf.new()
90
 
 
109
 
110
  return pdf_bytes
111
 
112
+ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
 
113
  '''
114
  Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
115
  '''
 
129
  # Find the specific page data
130
  page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
131
 
132
+ #print("page_json_data:", page_json_data)
133
+
134
  if "Blocks" in page_json_data:
135
  # Access the data for the specific page
136
  text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
 
273
 
274
  i += 1
275
 
276
+ return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
+
278
+ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
279
+ """
280
+ Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
+ """
282
+
283
+ if not os.path.exists(textract_json_file_path):
284
+ print("No existing Textract results file found.")
285
+ return {}, True, log_files_output_paths # Return empty dict and flag indicating missing file
286
+
287
+ no_textract_file = False
288
+ print("Found existing Textract json results file.")
289
+
290
+ # Track log files
291
+ if textract_json_file_path not in log_files_output_paths:
292
+ log_files_output_paths.append(textract_json_file_path)
293
+
294
+ try:
295
+ with open(textract_json_file_path, 'r', encoding='utf-8') as json_file:
296
+ textract_data = json.load(json_file)
297
+ except json.JSONDecodeError:
298
+ print("Error: Failed to parse Textract JSON file. Returning empty data.")
299
+ return {}, True, log_files_output_paths # Indicate failure
300
+
301
+ # Check if conversion is needed
302
+ if "pages" in textract_data:
303
+ print("JSON already in the correct format for app. No changes needed.")
304
+ return textract_data, False, log_files_output_paths # No conversion required
305
+
306
+ if "Blocks" in textract_data:
307
+ print("Need to convert Textract JSON to app format.")
308
+ try:
309
+
310
+ textract_data = restructure_textract_output(textract_data)
311
+ return textract_data, False, log_files_output_paths # Successfully converted
312
+
313
+ except Exception as e:
314
+ print("Failed to convert JSON data to app format due to:", e)
315
+ return {}, True, log_files_output_paths # Conversion failed
316
+ else:
317
+ print("Invalid Textract JSON format: 'Blocks' missing.")
318
+ print("textract data:", textract_data)
319
+ return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
+
321
+ def restructure_textract_output(textract_output: dict):
322
+ """
323
+ Reorganise Textract output from the bulk Textract analysis option on AWS
324
+ into a format that works in this redaction app, reducing size.
325
+ """
326
+ pages_dict = {}
327
+
328
+ # Extract total pages from DocumentMetadata
329
+ document_metadata = textract_output.get("DocumentMetadata", {})
330
+
331
+ for block in textract_output.get("Blocks", []):
332
+ page_no = block.get("Page", 1) # Default to 1 if missing
333
+
334
+ # Initialize page structure if not already present
335
+ if page_no not in pages_dict:
336
+ pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
337
+
338
+ # Keep only essential fields to reduce size
339
+ filtered_block = {
340
+ key: block[key] for key in ["BlockType", "Confidence", "Text", "Geometry", "Page", "Id", "Relationships"]
341
+ if key in block
342
+ }
343
+
344
+ pages_dict[page_no]["data"]["Blocks"].append(filtered_block)
345
+
346
+ # Convert pages dictionary to a sorted list
347
+ structured_output = {
348
+ "DocumentMetadata": document_metadata, # Store metadata separately
349
+ "pages": [pages_dict[page] for page in sorted(pages_dict.keys())]
350
+ }
351
+
352
+ return structured_output
tools/cli_redact.py CHANGED
@@ -1,12 +1,13 @@
1
  import argparse
2
  import os
3
- from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 
4
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
5
  from tools.file_redaction import choose_and_run_redactor
6
  import pandas as pd
7
  from datetime import datetime
8
 
9
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
10
  'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
11
  'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
12
  'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
 
1
  import argparse
2
  import os
3
+ from tools.config import get_or_create_env_var
4
+ from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
5
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
6
  from tools.file_redaction import choose_and_run_redactor
7
  import pandas as pd
8
  from datetime import datetime
9
 
10
+ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
11
  'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
12
  'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
13
  'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
tools/config.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import socket
4
+ from datetime import datetime
5
+ from dotenv import load_dotenv
6
+ from tldextract import TLDExtract
7
+
8
+ today_rev = datetime.now().strftime("%Y%m%d")
9
+ host_name = socket.gethostname()
10
+
11
+ # Set or retrieve configuration variables for the redaction app
12
+
13
+ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
14
+ '''
15
+ Get an environmental variable, and set it to a default value if it doesn't exist
16
+ '''
17
+ # Get the environment variable if it exists
18
+ value = os.environ.get(var_name)
19
+
20
+ # If it doesn't exist, set the environment variable to the default value
21
+ if value is None:
22
+ os.environ[var_name] = default_value
23
+ value = default_value
24
+
25
+ if print_val == True:
26
+ print(f'The value of {var_name} is {value}')
27
+
28
+ return value
29
+
30
+
31
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
32
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
33
+
34
+
35
+ if os.path.exists(APP_CONFIG_PATH):
36
+ print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
37
+ load_dotenv(APP_CONFIG_PATH)
38
+
39
+ ###
40
+ # AWS CONFIG
41
+ ###
42
+
43
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
44
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
45
+
46
+ if os.path.exists(AWS_CONFIG_PATH):
47
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
48
+ load_dotenv(AWS_CONFIG_PATH)
49
+
50
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
51
+
52
+ AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
53
+
54
+ AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
55
+
56
+ AWS_CLIENT_SECRET = get_or_create_env_var('AWS_CLIENT_SECRET', '')
57
+
58
+ AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
59
+
60
+ AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
61
+ if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
62
+
63
+ AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
64
+ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
65
+
66
+ DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
67
+
68
+ # Custom headers e.g. if routing traffic through Cloudfront
69
+ # Retrieving or setting CUSTOM_HEADER
70
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
71
+ if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
72
+
73
+ # Retrieving or setting CUSTOM_HEADER_VALUE
74
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
75
+ if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
76
+
77
+ ###
78
+ # Images config
79
+ ###
80
+ IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
81
+ LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
82
+ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
83
+
84
+ ###
85
+ # File I/O config
86
+ ###
87
+
88
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
89
+
90
+ OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
91
+ INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
92
+
93
+ # Allow for files to be saved in a temporary folder for increased security in some instances
94
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
95
+ # Create a temporary directory
96
+ with tempfile.TemporaryDirectory() as temp_dir:
97
+ print(f'Temporary directory created at: {temp_dir}')
98
+
99
+ if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
100
+ if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
101
+
102
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + host_name + '/')
103
+
104
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'logs/' + today_rev + '/' + host_name + '/')
105
+
106
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'usage/' + today_rev + '/' + host_name + '/')
107
+
108
+ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
109
+
110
+ ###
111
+ # REDACTION CONFIG
112
+ ###
113
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
114
+
115
+ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
116
+
117
+ SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
118
+
119
+ # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
120
+ PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
121
+
122
+ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
123
+
124
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
125
+
126
+ REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
127
+
128
+ ###
129
+ # APP RUN CONFIG
130
+ ###
131
+
132
+ TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
133
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
134
+
135
+ # Get some environment variables and Launch the Gradio app
136
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
137
+
138
+ RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
139
+
140
+ MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
141
+
142
+ MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
143
+
144
+ GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
145
+
146
+ ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
147
+
148
+ DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
149
+
150
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
151
+
152
+ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
153
+
154
+ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
155
+
156
+ SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'True')
157
+
158
+ GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
159
+
160
+ COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
161
+
162
+ S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
+
164
+ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, are they compulsory?
165
+
166
+ if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
+ if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
tools/custom_csvlogger.py CHANGED
@@ -8,9 +8,7 @@ from collections.abc import Sequence
8
  from multiprocessing import Lock
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
11
-
12
  from gradio_client import utils as client_utils
13
-
14
  import gradio as gr
15
  from gradio import utils, wasm_utils
16
 
 
8
  from multiprocessing import Lock
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
 
11
  from gradio_client import utils as client_utils
 
12
  import gradio as gr
13
  from gradio import utils, wasm_utils
14
 
tools/custom_image_analyser_engine.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
6
  import time
7
  import cv2
8
  import copy
 
9
  from copy import deepcopy
10
  from pdfminer.layout import LTChar
11
  import PIL
@@ -399,12 +400,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
399
  adjusted_contrast = contrast
400
  return adjusted_image, contrast, adjusted_contrast
401
 
402
- def bounding_boxes_overlap(box1, box2):
403
  """Check if two bounding boxes overlap."""
404
  return (box1[0] < box2[2] and box2[0] < box1[2] and
405
  box1[1] < box2[3] and box2[1] < box1[3])
406
 
407
- def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
408
  for entity in page_analyser_result:
409
  entity_start = entity.start
410
  entity_end = entity.end
@@ -442,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
442
 
443
  return all_text_line_results
444
 
445
- def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
446
  if not response or "Entities" not in response:
447
  return all_text_line_results
448
 
@@ -489,7 +490,7 @@ def map_back_comprehend_entity_results(response, current_batch_mapping, allow_li
489
 
490
  return all_text_line_results
491
 
492
- def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
493
  if not current_batch:
494
  return all_text_line_results
495
 
@@ -685,7 +686,7 @@ def run_page_text_redaction(
685
 
686
  return page_analysed_bounding_boxes
687
 
688
- def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
689
  '''
690
  Merge identified bounding boxes containing PII that are very close to one another
691
  '''
@@ -775,7 +776,7 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
775
  return analysed_bounding_boxes
776
 
777
  # Function to combine OCR results into line-level results
778
- def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
779
  # Group OCR results into lines based on y_threshold
780
  lines = []
781
  current_line = []
@@ -913,7 +914,8 @@ class CustomImageAnalyzerEngine:
913
  ocr_results_with_children: Dict[str, Dict],
914
  chosen_redact_comprehend_entities: List[str],
915
  pii_identification_method: str = "Local",
916
- comprehend_client = "",
 
917
  **text_analyzer_kwargs
918
  ) -> List[CustomImageRecognizerResult]:
919
 
 
6
  import time
7
  import cv2
8
  import copy
9
+ import botocore
10
  from copy import deepcopy
11
  from pdfminer.layout import LTChar
12
  import PIL
 
400
  adjusted_contrast = contrast
401
  return adjusted_image, contrast, adjusted_contrast
402
 
403
+ def bounding_boxes_overlap(box1:List, box2:List):
404
  """Check if two bounding boxes overlap."""
405
  return (box1[0] < box2[2] and box2[0] < box1[2] and
406
  box1[1] < box2[3] and box2[1] < box1[3])
407
 
408
+ def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, all_text_line_results:List[Tuple]):
409
  for entity in page_analyser_result:
410
  entity_start = entity.start
411
  entity_end = entity.end
 
443
 
444
  return all_text_line_results
445
 
446
+ def map_back_comprehend_entity_results(response:object, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
447
  if not response or "Entities" not in response:
448
  return all_text_line_results
449
 
 
490
 
491
  return all_text_line_results
492
 
493
+ def do_aws_comprehend_call(current_batch:str, current_batch_mapping:List[Tuple], comprehend_client:botocore.client.BaseClient, language:str, allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
494
  if not current_batch:
495
  return all_text_line_results
496
 
 
686
 
687
  return page_analysed_bounding_boxes
688
 
689
+ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
690
  '''
691
  Merge identified bounding boxes containing PII that are very close to one another
692
  '''
 
776
  return analysed_bounding_boxes
777
 
778
  # Function to combine OCR results into line-level results
779
+ def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:float=12.0):
780
  # Group OCR results into lines based on y_threshold
781
  lines = []
782
  current_line = []
 
914
  ocr_results_with_children: Dict[str, Dict],
915
  chosen_redact_comprehend_entities: List[str],
916
  pii_identification_method: str = "Local",
917
+ comprehend_client = "",
918
+ custom_entities:List[str]=custom_entities,
919
  **text_analyzer_kwargs
920
  ) -> List[CustomImageRecognizerResult]:
921
 
tools/data_anonymise.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  import secrets
3
  import base64
4
  import time
 
 
5
  import pandas as pd
6
 
7
  from faker import Faker
@@ -12,9 +14,10 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
- from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
-
 
18
  # Use custom version of analyze_dict to be able to track progress
19
  from tools.presidio_analyzer_custom import analyze_dict
20
 
@@ -108,9 +111,6 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
108
 
109
  decision_process_output_str = '\n'.join(decision_process_output)
110
 
111
- print("decision_process_output_str:\n\n", decision_process_output_str)
112
-
113
-
114
  return decision_process_output_str
115
 
116
  def anon_consistent_names(df):
@@ -205,90 +205,248 @@ def anon_consistent_names(df):
205
 
206
  return scrubbed_df_consistent_names
207
 
208
- def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- print("Identifying personal information")
211
- analyse_tic = time.perf_counter()
 
 
 
212
 
213
- key_string = ""
 
 
 
214
 
215
- # DataFrame to dict
216
- df_dict = df.to_dict(orient="list")
 
 
 
 
 
 
217
 
218
  if in_allow_list:
219
  in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
220
  else:
221
  in_allow_list_flat = []
 
 
222
 
223
- #analyzer = nlp_analyser #AnalyzerEngine()
224
- batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
225
-
226
- anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
- batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
 
 
 
 
 
229
 
230
- #print("Allow list:", in_allow_list)
231
- #print("Input data keys:", df_dict.keys())
 
 
 
232
 
233
- # Use custom analyzer to be able to track progress with Gradio
234
- analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
235
- entities=chosen_redact_entities,
236
- score_threshold=score_threshold,
237
- return_decision_process=True,
238
- allow_list=in_allow_list_flat)
239
 
240
- analyzer_results = list(analyzer_results)
 
 
 
 
 
241
 
242
- # Usage in the main function:
243
- decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
244
 
245
- #print("decision_process_output_str:\n\n", decision_process_output_str)
 
246
 
247
- analyse_toc = time.perf_counter()
248
- analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
249
- print(analyse_time_out)
250
 
251
- # Create faker function (note that it has to receive a value)
252
- fake = Faker("en_UK")
253
 
254
- def fake_first_name(x):
255
- return fake.first_name()
 
 
 
256
 
257
- # Set up the anonymization configuration WITHOUT DATE_TIME
258
- simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
259
- replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
260
- redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
261
- hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
262
- mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
263
- people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
264
- fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
265
 
266
- if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
267
- if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
268
- if anon_strat == "redact": chosen_mask_config = redact_config
269
- if anon_strat == "hash": chosen_mask_config = hash_config
270
- if anon_strat == "mask": chosen_mask_config = mask_config
271
- if anon_strat == "encrypt":
272
- chosen_mask_config = people_encrypt_config
273
- # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
274
- key = secrets.token_bytes(16) # 128 bits = 16 bytes
275
- key_string = base64.b64encode(key).decode('utf-8')
276
- elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
277
 
278
- # I think in general people will want to keep date / times
279
- keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
280
 
281
- combined_config = {**chosen_mask_config, **keep_date_config}
282
- combined_config
 
 
283
 
284
- anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
 
 
 
 
 
285
 
286
- scrubbed_df = pd.DataFrame(anonymizer_results)
287
-
288
- return scrubbed_df, key_string, decision_process_output_str
 
289
 
290
- def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
 
 
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  def check_lists(list1, list2):
293
  return any(string in list2 for string in list1)
294
 
@@ -309,6 +467,9 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
309
  common_strings.append(string)
310
  return common_strings
311
 
 
 
 
312
  # Check for chosen col, skip file if not found
313
  all_cols_original_order = list(anon_df.columns)
314
 
@@ -321,13 +482,13 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
321
  chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
322
 
323
  # Split dataframe to keep only selected columns
324
- print("Remaining columns to redact:", chosen_cols_in_anon_df)
325
 
326
  anon_df_part = anon_df[chosen_cols_in_anon_df]
327
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
328
-
329
  # Anonymise the selected columns
330
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
331
 
332
  # Rejoin the dataframe together
333
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
@@ -336,8 +497,9 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
336
  # Export file
337
 
338
  # Rename anonymisation strategy for file path naming
339
- if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
340
  elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
 
341
  else: anon_strat_txt = anon_strat
342
 
343
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
@@ -374,130 +536,196 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
374
 
375
  return out_file_paths, out_message, key_string, log_files_output_paths
376
 
377
- def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
378
-
379
- tic = time.perf_counter()
380
-
381
- # If this is the first time around, set variables to 0/blank
382
- if first_loop_state==True:
383
- latest_file_completed = 0
384
- out_message = []
385
- out_file_paths = []
386
-
387
- # Load file
388
- # If out message or out_file_paths are blank, change to a list so it can be appended to
389
- if isinstance(out_message, str):
390
- out_message = [out_message]
391
 
392
- print("log_files_output_paths:",log_files_output_paths)
 
393
 
394
- if isinstance(log_files_output_paths, str):
395
- log_files_output_paths = []
 
396
 
397
- if not out_file_paths:
398
- out_file_paths = []
399
-
400
 
401
  if in_allow_list:
402
  in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
403
  else:
404
  in_allow_list_flat = []
405
-
406
- anon_df = pd.DataFrame()
407
- #out_file_paths = []
408
-
409
- # Check if files and text exist
410
- if not file_paths:
411
- if in_text:
412
- file_paths=['open_text']
413
- else:
414
- out_message = "Please enter text or a file to redact."
415
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
416
-
417
- # If we have already redacted the last file, return the input out_message and file list to the relevant components
418
- if latest_file_completed >= len(file_paths):
419
- print("Last file reached, returning files:", str(latest_file_completed))
420
- # Set to a very high number so as not to mess with subsequent file processing by the user
421
- latest_file_completed = 99
422
- final_out_message = '\n'.join(out_message)
423
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
424
-
425
- file_path_loop = [file_paths[int(latest_file_completed)]]
426
-
427
- for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
428
 
429
- if anon_file=='open_text':
430
- anon_df = pd.DataFrame(data={'text':[in_text]})
431
- chosen_cols=['text']
432
- sheet_name = ""
433
- file_type = ""
434
- out_file_part = anon_file
435
-
436
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
437
  else:
438
- # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
439
- file_type = detect_file_type(anon_file)
440
- print("File type is:", file_type)
441
 
442
- out_file_part = get_file_name_without_type(anon_file.name)
443
-
444
- if file_type == 'xlsx':
445
- print("Running through all xlsx sheets")
446
- #anon_xlsx = pd.ExcelFile(anon_file)
447
- if not in_excel_sheets:
448
- out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
449
- continue
450
 
451
- anon_xlsx = pd.ExcelFile(anon_file)
 
 
 
452
 
453
- # Create xlsx file:
454
- anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
455
 
456
- from openpyxl import Workbook
 
457
 
458
- wb = Workbook()
459
- wb.save(anon_xlsx_export_file_name)
460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
- # Iterate through the sheet names
463
- for sheet_name in in_excel_sheets:
464
- # Read each sheet into a DataFrame
465
- if sheet_name not in anon_xlsx.sheet_names:
466
- continue
467
 
468
- anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
 
 
469
 
470
- # Process the DataFrame (e.g., print its contents)
471
- print(f"Sheet Name: {sheet_name}")
472
- print(anon_df.head()) # Print the first few rows
473
 
474
-
475
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
476
-
477
- else:
478
- sheet_name = ""
479
- anon_df = read_file(anon_file)
480
- out_file_part = get_file_name_without_type(anon_file.name)
481
 
482
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
 
 
 
 
 
 
 
483
 
484
- # Increase latest file completed count unless we are at the last file
485
- if latest_file_completed != len(file_paths):
486
- print("Completed file number:", str(latest_file_completed))
487
- latest_file_completed += 1
 
 
 
 
 
 
 
488
 
489
- toc = time.perf_counter()
490
- out_time = f"in {toc - tic:0.1f} seconds."
491
- print(out_time)
492
-
493
- if anon_strat == "encrypt":
494
- out_message.append(". Your decryption key is " + key_string + ".")
495
 
496
- out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
497
 
498
- out_message_out = '\n'.join(out_message)
499
- out_message_out = out_message_out + " " + out_time
500
 
501
- out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
502
 
503
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 
2
  import secrets
3
  import base64
4
  import time
5
+ import boto3
6
+ import botocore
7
  import pandas as pd
8
 
9
  from faker import Faker
 
14
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
16
 
17
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
18
+ from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
+ from tools.custom_image_analyser_engine import do_aws_comprehend_call
21
  # Use custom version of analyze_dict to be able to track progress
22
  from tools.presidio_analyzer_custom import analyze_dict
23
 
 
111
 
112
  decision_process_output_str = '\n'.join(decision_process_output)
113
 
 
 
 
114
  return decision_process_output_str
115
 
116
  def anon_consistent_names(df):
 
205
 
206
  return scrubbed_df_consistent_names
207
 
208
+ def anonymise_data_files(file_paths: List[str],
209
+ in_text: str,
210
+ anon_strat: str,
211
+ chosen_cols: List[str],
212
+ language: str,
213
+ chosen_redact_entities: List[str],
214
+ in_allow_list: List[str] = None,
215
+ latest_file_completed: int = 0,
216
+ out_message: list = [],
217
+ out_file_paths: list = [],
218
+ log_files_output_paths: list = [],
219
+ in_excel_sheets: list = [],
220
+ first_loop_state: bool = False,
221
+ output_folder: str = OUTPUT_FOLDER,
222
+ in_deny_list:list[str]=[],
223
+ max_fuzzy_spelling_mistakes_num:int=0,
224
+ pii_identification_method:str="Local",
225
+ chosen_redact_comprehend_entities:List[str]=[],
226
+ comprehend_query_number:int=0,
227
+ aws_access_key_textbox:str='',
228
+ aws_secret_key_textbox:str='',
229
+ progress: Progress = Progress(track_tqdm=True)):
230
+ """
231
+ This function anonymises data files based on the provided parameters.
232
+
233
+ Parameters:
234
+ - file_paths (List[str]): A list of file paths to anonymise.
235
+ - in_text (str): The text to anonymise if file_paths is 'open_text'.
236
+ - anon_strat (str): The anonymisation strategy to use.
237
+ - chosen_cols (List[str]): A list of column names to anonymise.
238
+ - language (str): The language of the text to anonymise.
239
+ - chosen_redact_entities (List[str]): A list of entities to redact.
240
+ - in_allow_list (List[str], optional): A list of allowed values. Defaults to None.
241
+ - latest_file_completed (int, optional): The index of the last file completed. Defaults to 0.
242
+ - out_message (list, optional): A list to store output messages. Defaults to an empty list.
243
+ - out_file_paths (list, optional): A list to store output file paths. Defaults to an empty list.
244
+ - log_files_output_paths (list, optional): A list to store log file paths. Defaults to an empty list.
245
+ - in_excel_sheets (list, optional): A list of Excel sheet names. Defaults to an empty list.
246
+ - first_loop_state (bool, optional): Indicates if this is the first loop iteration. Defaults to False.
247
+ - output_folder (str, optional): The output folder path. Defaults to the global output_folder variable.
248
+ - in_deny_list (list[str], optional): A list of specific terms to redact.
249
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
250
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
251
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
252
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
253
+ - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
254
+ - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
255
+ - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
256
+ """
257
+
258
+ tic = time.perf_counter()
259
+ comprehend_client = ""
260
 
261
+ # If this is the first time around, set variables to 0/blank
262
+ if first_loop_state==True:
263
+ latest_file_completed = 0
264
+ out_message = []
265
+ out_file_paths = []
266
 
267
+ # Load file
268
+ # If out message or out_file_paths are blank, change to a list so it can be appended to
269
+ if isinstance(out_message, str):
270
+ out_message = [out_message]
271
 
272
+ #print("log_files_output_paths:",log_files_output_paths)
273
+
274
+ if isinstance(log_files_output_paths, str):
275
+ log_files_output_paths = []
276
+
277
+ if not out_file_paths:
278
+ out_file_paths = []
279
+
280
 
281
  if in_allow_list:
282
  in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
283
  else:
284
  in_allow_list_flat = []
285
+
286
+ anon_df = pd.DataFrame()
287
 
288
+ # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
289
+ if pii_identification_method == "AWS Comprehend":
290
+ print("Trying to connect to AWS Comprehend service")
291
+ if aws_access_key_textbox and aws_secret_key_textbox:
292
+ print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
293
+ print("aws_access_key_textbox:", aws_access_key_textbox)
294
+ print("aws_secret_access_key:", aws_secret_key_textbox)
295
+ comprehend_client = boto3.client('comprehend',
296
+ aws_access_key_id=aws_access_key_textbox,
297
+ aws_secret_access_key=aws_secret_key_textbox)
298
+ elif RUN_AWS_FUNCTIONS == "1":
299
+ print("Connecting to Comprehend via existing SSO connection")
300
+ comprehend_client = boto3.client('comprehend')
301
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
302
+ print("Getting Comprehend credentials from environment variables")
303
+ comprehend_client = boto3.client('comprehend',
304
+ aws_access_key_id=AWS_ACCESS_KEY,
305
+ aws_secret_access_key=AWS_SECRET_KEY)
306
+ else:
307
+ comprehend_client = ""
308
+ out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
309
+ print(out_message)
310
+
311
+ # Check if files and text exist
312
+ if not file_paths:
313
+ if in_text:
314
+ file_paths=['open_text']
315
+ else:
316
+ out_message = "Please enter text or a file to redact."
317
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
318
+
319
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
320
+ if latest_file_completed >= len(file_paths):
321
+ print("Last file reached") #, returning files:", str(latest_file_completed))
322
+ # Set to a very high number so as not to mess with subsequent file processing by the user
323
+ latest_file_completed = 99
324
+ final_out_message = '\n'.join(out_message)
325
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
326
+
327
+ file_path_loop = [file_paths[int(latest_file_completed)]]
328
+
329
+ for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
330
 
331
+ if anon_file=='open_text':
332
+ anon_df = pd.DataFrame(data={'text':[in_text]})
333
+ chosen_cols=['text']
334
+ sheet_name = ""
335
+ file_type = ""
336
+ out_file_part = anon_file
337
 
338
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
339
+ else:
340
+ # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
341
+ file_type = detect_file_type(anon_file)
342
+ print("File type is:", file_type)
343
 
344
+ out_file_part = get_file_name_without_type(anon_file.name)
 
 
 
 
 
345
 
346
+ if file_type == 'xlsx':
347
+ print("Running through all xlsx sheets")
348
+ #anon_xlsx = pd.ExcelFile(anon_file)
349
+ if not in_excel_sheets:
350
+ out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
351
+ continue
352
 
353
+ anon_xlsx = pd.ExcelFile(anon_file)
 
354
 
355
+ # Create xlsx file:
356
+ anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
357
 
358
+ from openpyxl import Workbook
 
 
359
 
360
+ wb = Workbook()
361
+ wb.save(anon_xlsx_export_file_name)
362
 
363
+ # Iterate through the sheet names
364
+ for sheet_name in in_excel_sheets:
365
+ # Read each sheet into a DataFrame
366
+ if sheet_name not in anon_xlsx.sheet_names:
367
+ continue
368
 
369
+ anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
 
 
 
 
 
 
 
370
 
371
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
372
+
373
+ else:
374
+ sheet_name = ""
375
+ anon_df = read_file(anon_file)
376
+ out_file_part = get_file_name_without_type(anon_file.name)
 
 
 
 
 
377
 
378
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
 
379
 
380
+ # Increase latest file completed count unless we are at the last file
381
+ if latest_file_completed != len(file_paths):
382
+ print("Completed file number:", str(latest_file_completed))
383
+ latest_file_completed += 1
384
 
385
+ toc = time.perf_counter()
386
+ out_time = f"in {toc - tic:0.1f} seconds."
387
+ print(out_time)
388
+
389
+ if anon_strat == "encrypt":
390
+ out_message.append(". Your decryption key is " + key_string + ".")
391
 
392
+ out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
393
+
394
+ out_message_out = '\n'.join(out_message)
395
+ out_message_out = out_message_out + " " + out_time
396
 
397
+ out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
398
+
399
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
400
 
401
+ def anon_wrapper_func(
402
+ anon_file: str,
403
+ anon_df: pd.DataFrame,
404
+ chosen_cols: List[str],
405
+ out_file_paths: List[str],
406
+ out_file_part: str,
407
+ out_message: str,
408
+ excel_sheet_name: str,
409
+ anon_strat: str,
410
+ language: str,
411
+ chosen_redact_entities: List[str],
412
+ in_allow_list: List[str],
413
+ file_type: str,
414
+ anon_xlsx_export_file_name: str,
415
+ log_files_output_paths: List[str],
416
+ in_deny_list: List[str]=[],
417
+ max_fuzzy_spelling_mistakes_num:int=0,
418
+ pii_identification_method:str="Local",
419
+ chosen_redact_comprehend_entities:List[str]=[],
420
+ comprehend_query_number:int=0,
421
+ comprehend_client:botocore.client.BaseClient="",
422
+ output_folder: str = OUTPUT_FOLDER
423
+ ):
424
+ """
425
+ This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
426
+
427
+ Input Variables:
428
+ - anon_file: The path to the file containing the data to be anonymized.
429
+ - anon_df: The pandas DataFrame containing the data to be anonymized.
430
+ - chosen_cols: A list of column names to be anonymized.
431
+ - out_file_paths: A list of paths where the anonymized files will be saved.
432
+ - out_file_part: A part of the output file name.
433
+ - out_message: A message to be displayed during the anonymization process.
434
+ - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
435
+ - anon_strat: The anonymization strategy to be applied.
436
+ - language: The language of the data to be anonymized.
437
+ - chosen_redact_entities: A list of entities to be redacted.
438
+ - in_allow_list: A list of allowed values.
439
+ - file_type: The type of file to be exported.
440
+ - anon_xlsx_export_file_name: The name of the anonymized Excel file.
441
+ - log_files_output_paths: A list of paths where the log files will be saved.
442
+ - in_deny_list: List of specific terms to remove from the data.
443
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
444
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
445
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
446
+ - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
447
+ - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
448
+ - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
449
+ """
450
  def check_lists(list1, list2):
451
  return any(string in list2 for string in list1)
452
 
 
467
  common_strings.append(string)
468
  return common_strings
469
 
470
+ if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
471
+ raise("Connection to AWS Comprehend service not found, please check connection details.")
472
+
473
  # Check for chosen col, skip file if not found
474
  all_cols_original_order = list(anon_df.columns)
475
 
 
482
  chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
483
 
484
  # Split dataframe to keep only selected columns
485
+ #print("Remaining columns to redact:", chosen_cols_in_anon_df)
486
 
487
  anon_df_part = anon_df[chosen_cols_in_anon_df]
488
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
489
+
490
  # Anonymise the selected columns
491
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
492
 
493
  # Rejoin the dataframe together
494
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
 
497
  # Export file
498
 
499
  # Rename anonymisation strategy for file path naming
500
+ if anon_strat == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
501
  elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
502
+ elif anon_strat == "redact completely": anon_strat_txt = "redact_remove"
503
  else: anon_strat_txt = anon_strat
504
 
505
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
 
536
 
537
  return out_file_paths, out_message, key_string, log_files_output_paths
538
 
539
def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
    '''
    Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.

    Input Variables:
    - df: DataFrame whose (text) columns will be analysed and anonymised.
    - anon_strat: Anonymisation strategy; one of "replace with 'REDACTED'",
      "replace with <ENTITY_NAME>", "redact completely", "hash", "mask",
      "encrypt", or "fake_first_name".
    - language: Language code passed to the local analyser and to AWS Comprehend.
    - chosen_redact_entities: Entity types for the local (Presidio) analyser.
    - in_allow_list: Terms that should never be redacted.
    - in_deny_list: Specific terms to always redact. May arrive as a list or as a
      single-column DataFrame from the UI.
    - max_fuzzy_spelling_mistakes_num: Maximum spelling mistakes allowed when fuzzy
      matching deny-list phrases (0-9).
    - pii_identification_method: "Local" (Presidio/spacy) or "AWS Comprehend".
    - chosen_redact_comprehend_entities: Entity types to redact via AWS Comprehend.
    - comprehend_query_number: Running counter of AWS Comprehend API calls.
    - comprehend_client: boto3 Comprehend client, or "" when no connection exists.
    - custom_entities: Entities that are always handled locally, even when the
      AWS Comprehend method is selected.
    - progress: Gradio progress tracker.

    Returns a tuple (scrubbed_df, key_string, decision_process_output_str): the
    anonymised dataframe, the base64-encoded AES key when anon_strat == "encrypt"
    (otherwise ""), and a log of the analyser's decision process.
    '''
    print("Identifying personal information")
    analyse_tic = time.perf_counter()

    # Results are gathered per column so that local and Comprehend output can be merged
    results_by_column = {}
    key_string = ""

    # DataFrame to dict of column name -> list of cell values, as the batch analyser expects
    df_dict = df.to_dict(orient="list")

    if in_allow_list:
        in_allow_list_flat = in_allow_list #[item for sublist in in_allow_list for item in sublist]
    else:
        in_allow_list_flat = []

    # The deny list may arrive as a single-column DataFrame from the UI
    if isinstance(in_deny_list, pd.DataFrame):
        if not in_deny_list.empty:
            in_deny_list = in_deny_list.iloc[:, 0].tolist()
        else:
            # Handle the case where the DataFrame is empty
            in_deny_list = []  # or some default value

        # Sort the strings in order from the longest string to the shortest,
        # so longer phrases are matched before their substrings
        in_deny_list = sorted(in_deny_list, key=len, reverse=True)

    if in_deny_list:
        # Swap in fresh recognisers built from the current deny list
        nlp_analyser.registry.remove_recognizer("CUSTOM")
        new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
        nlp_analyser.registry.add_recognizer(new_custom_recogniser)

        nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
        # Fix: previously spelling_mistakes_max was passed the deny list itself and
        # search_whole_phrase was passed the mistake count. Pass the mistake count to
        # the correct parameter and leave search_whole_phrase at the class default.
        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num)
        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

    #analyzer = nlp_analyser #AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)

    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)

    analyzer_results = []

    if pii_identification_method == "Local":

        # Use custom analyzer to be able to track progress with Gradio
        custom_results = analyze_dict(batch_analyzer,
                                      df_dict,
                                      language=language,
                                      entities=chosen_redact_entities,
                                      score_threshold=score_threshold,
                                      return_decision_process=True,
                                      allow_list=in_allow_list_flat)

        # Initialize results_by_column with custom entity results
        for result in custom_results:
            results_by_column[result.key] = result

        # Convert the dictionary of results back to a list
        analyzer_results = list(results_by_column.values())

    # AWS Comprehend calls
    elif pii_identification_method == "AWS Comprehend" and comprehend_client:

        # Only run local anonymisation for entities that are not covered by AWS Comprehend
        if custom_entities:
            custom_redact_entities = [
                entity for entity in chosen_redact_comprehend_entities
                if entity in custom_entities
            ]
            if custom_redact_entities:
                # Get results from analyze_dict
                custom_results = analyze_dict(batch_analyzer,
                                              df_dict,
                                              language=language,
                                              entities=custom_redact_entities,
                                              score_threshold=score_threshold,
                                              return_decision_process=True,
                                              allow_list=in_allow_list_flat)

                # Initialize results_by_column with custom entity results
                for result in custom_results:
                    results_by_column[result.key] = result

        max_retries = 3
        retry_delay = 3

        # Process each text column in the dictionary
        for column_name, texts in progress.tqdm(df_dict.items(), desc="Querying AWS Comprehend service.", unit = "Columns"):
            # Get or create DictAnalyzerResult for this column
            if column_name in results_by_column:
                column_results = results_by_column[column_name]
            else:
                column_results = DictAnalyzerResult(
                    recognizer_results=[[] for _ in texts],
                    key=column_name,
                    value=texts
                )

            # Process each text in the column
            for text_idx, text in progress.tqdm(enumerate(texts), desc="Querying AWS Comprehend service.", unit = "Row"):

                for attempt in range(max_retries):
                    try:
                        response = comprehend_client.detect_pii_entities(
                            Text=str(text),
                            LanguageCode=language
                        )

                        comprehend_query_number += 1

                        # Add all entities from this text to the column's recognizer_results
                        for entity in response["Entities"]:
                            if entity.get("Type") not in chosen_redact_comprehend_entities:
                                continue

                            recognizer_result = RecognizerResult(
                                entity_type=entity["Type"],
                                start=entity["BeginOffset"],
                                end=entity["EndOffset"],
                                score=entity["Score"]
                            )
                            column_results.recognizer_results[text_idx].append(recognizer_result)

                        break # Success, exit retry loop

                    except Exception as e:
                        if attempt == max_retries - 1:
                            print(f"AWS Comprehend calls failed for text: {text[:100]}... due to", e)
                            raise
                        time.sleep(retry_delay)

            # Store or update the column results
            results_by_column[column_name] = column_results

        # Convert the dictionary of results back to a list
        analyzer_results = list(results_by_column.values())

    elif pii_identification_method == "AWS Comprehend" and not comprehend_client:
        # Fix: this previously did `raise("...")`, which raises a TypeError
        # ("exceptions must derive from BaseException") instead of the message.
        # Also replaced the bitwise `&` between boolean conditions with `and`.
        raise ConnectionError("Unable to redact, Comprehend connection details not found.")

    else:
        print("Unable to redact.")

    # Usage in the main function:
    decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)

    analyse_toc = time.perf_counter()
    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
    print(analyse_time_out)

    # Set up the anonymization operator configurations WITHOUT DATE_TIME.
    # Fix: these were previously built with eval() over string literals, which is
    # needless and unsafe - plain dict literals are equivalent.
    simple_replace_config = {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}
    replace_config = {"DEFAULT": OperatorConfig("replace")}
    redact_config = {"DEFAULT": OperatorConfig("redact")}
    hash_config = {"DEFAULT": OperatorConfig("hash")}
    mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}

    if anon_strat == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
    elif anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
    elif anon_strat == "redact completely": chosen_mask_config = redact_config
    elif anon_strat == "hash": chosen_mask_config = hash_config
    elif anon_strat == "mask": chosen_mask_config = mask_config
    elif anon_strat == "encrypt":
        # Generate a 128-bit AES key, then encode it with base64 to get a string representation.
        # Fix: the key must exist BEFORE the OperatorConfig is built - previously the
        # encrypt config was constructed earlier with an empty key_string, so the key
        # returned to the user was not the key actually used for encryption.
        key = secrets.token_bytes(16) # 128 bits = 16 bytes
        key_string = base64.b64encode(key).decode('utf-8')
        # The encryption uses an AES cypher in CBC mode and requires the same key for decryption.
        chosen_mask_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})}
    elif anon_strat == "fake_first_name":
        # Built lazily so the helper is only looked up when this strategy is chosen.
        # NOTE(review): fake_first_name is expected to be defined at module level
        # (the Faker-based helper is commented out in this file) - confirm before use.
        chosen_mask_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}

    # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
    #keep_date_config = {"DATE_TIME": OperatorConfig("keep")}

    combined_config = {**chosen_mask_config} #, **keep_date_config}

    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)

    scrubbed_df = pd.DataFrame(anonymizer_results)

    return scrubbed_df, key_string, decision_process_output_str
tools/file_conversion.py CHANGED
@@ -1,23 +1,37 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
6
  import time
7
  import json
 
8
  import pymupdf
 
9
  import pandas as pd
10
- import numpy as np
11
- from pymupdf import Rect
12
- from fitz import Page
13
  from tqdm import tqdm
14
  from gradio import Progress
15
- from typing import List, Optional
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- image_dpi = 300.0
19
- ImageFile.LOAD_TRUNCATED_IMAGES = True
20
- Image.MAX_IMAGE_PIXELS = None
 
21
 
22
  def is_pdf_or_image(filename):
23
  """
@@ -47,175 +61,166 @@ def is_pdf(filename):
47
  """
48
  return filename.lower().endswith(".pdf")
49
 
50
- # %%
51
  ## Convert pdf to image if necessary
52
 
53
- CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
54
- print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 
 
55
 
56
- import os
57
- from pdf2image import convert_from_path
58
- from PIL import Image
59
 
60
- def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
61
- try:
62
- # Construct the full output directory path
63
- output_dir = os.path.join(os.getcwd(), output_dir)
64
- out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
65
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
66
-
67
- if os.path.exists(out_path):
68
- # Load existing image
69
- image = Image.open(out_path)
70
- else:
71
- # Convert PDF page to image
72
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
73
- dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
74
- image = image_l[0]
75
- image = image.convert("L")
76
- image.save(out_path, format="PNG")
77
-
78
- # Check file size and resize if necessary
79
- max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
80
- file_size = os.path.getsize(out_path)
81
-
82
- # Resize images if they are too big
83
- if file_size > max_size:
84
- # Start with the original image size
85
- width, height = image.size
86
 
87
- print(f"Image size before {width}x{height}, original file_size: {file_size}")
 
88
 
89
- while file_size > max_size:
90
- # Reduce the size by a factor (e.g., 50% of the current size)
91
- new_width = int(width * 0.5)
92
- new_height = int(height * 0.5)
93
- image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
94
-
95
- # Save the resized image
96
- image.save(out_path, format="PNG", optimize=True)
97
-
98
- # Update the file size
99
- file_size = os.path.getsize(out_path)
100
- print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
101
-
102
- # Update the dimensions for the next iteration
103
- width, height = new_width, new_height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- return page_num, out_path
 
 
 
 
 
 
 
 
106
 
107
- except Exception as e:
108
- print(f"Error processing page {page_num + 1}: {e}")
109
- return page_num, None
 
110
 
111
- def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- # If preparing for review, just load the first page (not used)
114
  if prepare_for_review == True:
115
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
 
 
116
  else:
117
  page_count = pdfinfo_from_path(pdf_path)['Pages']
118
 
119
  print(f"Number of pages in PDF: {page_count}")
120
 
 
 
 
121
  results = []
122
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
123
  futures = []
124
- for page_num in range(page_min, page_count):
125
- futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
126
 
127
- for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
128
- page_num, result = future.result()
129
- if result:
130
- results.append((page_num, result))
131
  else:
132
  print(f"Page {page_num + 1} failed to process.")
 
133
 
134
  # Sort results by page number
135
  results.sort(key=lambda x: x[0])
136
  images = [result[1] for result in results]
 
 
137
 
138
  print("PDF has been converted to images.")
139
- return images
140
-
141
-
142
- # def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
143
-
144
- # print("pdf_path in convert_pdf_to_images:", pdf_path)
145
-
146
- # # Get the number of pages in the PDF
147
- # page_count = pdfinfo_from_path(pdf_path)['Pages']
148
- # print("Number of pages in PDF: ", str(page_count))
149
-
150
- # images = []
151
-
152
- # # Open the PDF file
153
- # #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
154
- # for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
155
-
156
- # #print("page_num in convert_pdf_to_images:", page_num)
157
-
158
- # print("Converting page: ", str(page_num + 1))
159
-
160
- # # Convert one page to image
161
- # out_path = pdf_path + "_" + str(page_num) + ".png"
162
-
163
- # # Ensure the directory exists
164
- # os.makedirs(os.path.dirname(out_path), exist_ok=True)
165
-
166
- # # Check if the image already exists
167
- # if os.path.exists(out_path):
168
- # #print(f"Loading existing image from {out_path}.")
169
- # image = Image.open(out_path) # Load the existing image
170
-
171
- # else:
172
- # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
173
-
174
- # image = image_l[0]
175
-
176
- # # Convert to greyscale
177
- # image = image.convert("L")
178
-
179
- # image.save(out_path, format="PNG") # Save the new image
180
-
181
- # # If no images are returned, break the loop
182
- # if not image:
183
- # print("Conversion of page", str(page_num), "to file failed.")
184
- # break
185
-
186
- # # print("Conversion of page", str(page_num), "to file succeeded.")
187
- # # print("image:", image)
188
-
189
- # images.append(out_path)
190
-
191
- # print("PDF has been converted to images.")
192
- # # print("Images:", images)
193
-
194
- # return images
195
 
196
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
197
- def process_file(file_path:str, prepare_for_review:bool=False):
198
  # Get the file extension
199
  file_extension = os.path.splitext(file_path)[1].lower()
200
-
201
  # Check if the file is an image type
202
  if file_extension in ['.jpg', '.jpeg', '.png']:
203
  print(f"{file_path} is an image file.")
204
  # Perform image processing here
205
  img_object = [file_path] #[Image.open(file_path)]
206
- # Load images from the file paths
 
 
 
 
 
 
 
 
 
 
207
 
208
  # Check if the file is a PDF
209
  elif file_extension == '.pdf':
210
  print(f"{file_path} is a PDF file. Converting to image set")
 
211
  # Run your function for processing PDF files here
212
- img_object = convert_pdf_to_images(file_path, prepare_for_review)
213
 
214
  else:
215
  print(f"{file_path} is not an image or PDF file.")
216
- img_object = ['']
 
 
 
217
 
218
- return img_object
219
 
220
  def get_input_file_names(file_input:List[str]):
221
  '''
@@ -225,8 +230,8 @@ def get_input_file_names(file_input:List[str]):
225
  all_relevant_files = []
226
  file_name_with_extension = ""
227
  full_file_name = ""
 
228
 
229
- #print("file_input in input file names:", file_input)
230
  if isinstance(file_input, dict):
231
  file_input = os.path.abspath(file_input["name"])
232
 
@@ -245,23 +250,38 @@ def get_input_file_names(file_input:List[str]):
245
 
246
  file_extension = os.path.splitext(file_path)[1].lower()
247
 
248
- # Check if the file is an image type
249
  if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
250
  all_relevant_files.append(file_path_without_ext)
251
  file_name_with_extension = file_path_without_ext + file_extension
252
  full_file_name = file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  all_relevant_files_str = ", ".join(all_relevant_files)
255
 
256
- #print("all_relevant_files_str in input_file_names", all_relevant_files_str)
257
- #print("all_relevant_files in input_file_names", all_relevant_files)
258
-
259
- return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files
260
 
261
def convert_color_to_range_0_1(color):
    """Scale a colour tuple with 0-255 channel values down to the 0-1 range."""
    scaled_components = [channel / 255 for channel in color]
    return tuple(scaled_components)
263
 
264
  def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
 
 
 
 
265
  pymupdf_x1 = pymupdf_rect[0]
266
  pymupdf_y1 = pymupdf_rect[1]
267
  pymupdf_x2 = pymupdf_rect[2]
@@ -277,7 +297,6 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
277
  redact_bottom_y = middle_y - 1
278
  redact_top_y = middle_y + 1
279
 
280
- #print("Rect:", rect)
281
 
282
  rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
283
 
@@ -304,73 +323,7 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
304
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
305
  shape.commit()
306
 
307
- # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
308
- # '''
309
- # Converts coordinates from pymupdf format to image coordinates,
310
- # accounting for mediabox dimensions and offset.
311
- # '''
312
- # # Get rect dimensions
313
- # rect = pymupdf_page.rect
314
- # rect_width = rect.width
315
- # rect_height = rect.height
316
-
317
- # # Get mediabox dimensions and position
318
- # mediabox = pymupdf_page.mediabox
319
- # mediabox_width = mediabox.width
320
- # mediabox_height = mediabox.height
321
-
322
- # # Get target image dimensions
323
- # image_page_width, image_page_height = image.size
324
-
325
- # # Calculate scaling factors
326
- # image_to_mediabox_x_scale = image_page_width / mediabox_width
327
- # image_to_mediabox_y_scale = image_page_height / mediabox_height
328
-
329
- # image_to_rect_scale_width = image_page_width / rect_width
330
- # image_to_rect_scale_height = image_page_height / rect_height
331
-
332
- # # Adjust for offsets (difference in position between mediabox and rect)
333
- # x_offset = rect.x0 - mediabox.x0 # Difference in x position
334
- # y_offset = rect.y0 - mediabox.y0 # Difference in y position
335
-
336
- # print("x_offset:", x_offset)
337
- # print("y_offset:", y_offset)
338
-
339
- # # Adjust coordinates:
340
- # # Apply scaling to match image dimensions
341
- # x1_image = x1 * image_to_mediabox_x_scale
342
- # x2_image = x2 * image_to_mediabox_x_scale
343
- # y1_image = y1 * image_to_mediabox_y_scale
344
- # y2_image = y2 * image_to_mediabox_y_scale
345
-
346
- # # Correct for difference in rect and mediabox size
347
- # if mediabox_width != rect_width:
348
-
349
- # mediabox_to_rect_x_scale = mediabox_width / rect_width
350
- # mediabox_to_rect_y_scale = mediabox_height / rect_height
351
-
352
- # x1_image *= mediabox_to_rect_x_scale
353
- # x2_image *= mediabox_to_rect_x_scale
354
- # y1_image *= mediabox_to_rect_y_scale
355
- # y2_image *= mediabox_to_rect_y_scale
356
-
357
- # print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
358
- # #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
359
-
360
- # print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
361
- # #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
362
-
363
- # mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
364
- # mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
365
-
366
- # x1_image -= mediabox_rect_x_diff
367
- # x2_image -= mediabox_rect_x_diff
368
- # y1_image += mediabox_rect_y_diff
369
- # y2_image += mediabox_rect_y_diff
370
-
371
- # return x1_image, y1_image, x2_image, y2_image
372
-
373
- def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
374
  '''
375
  Converts coordinates from pymupdf format to image coordinates,
376
  accounting for mediabox dimensions and offset.
@@ -386,22 +339,17 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
386
  mediabox_height = mediabox.height
387
 
388
  # Get target image dimensions
389
- image_page_width, image_page_height = image.size
 
 
 
 
 
390
 
391
  # Calculate scaling factors
392
  image_to_mediabox_x_scale = image_page_width / mediabox_width
393
  image_to_mediabox_y_scale = image_page_height / mediabox_height
394
 
395
- image_to_rect_scale_width = image_page_width / rect_width
396
- image_to_rect_scale_height = image_page_height / rect_height
397
-
398
- # Adjust for offsets (difference in position between mediabox and rect)
399
- x_offset = rect.x0 - mediabox.x0 # Difference in x position
400
- y_offset = rect.y0 - mediabox.y0 # Difference in y position
401
-
402
- #print("x_offset:", x_offset)
403
- #print("y_offset:", y_offset)
404
-
405
  # Adjust coordinates:
406
  # Apply scaling to match image dimensions
407
  x1_image = x1 * image_to_mediabox_x_scale
@@ -434,26 +382,24 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
434
 
435
  return x1_image, y1_image, x2_image, y2_image
436
 
437
-
438
-
439
- def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
440
  # Small border to page that remains white
441
  border = 5
442
  # Define the coordinates for the Rect
443
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
444
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
445
 
446
- whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
447
 
448
  # Create new image annotation element based on whole page coordinates
449
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
450
 
451
  # Write whole page annotation to annotation boxes
452
  whole_page_img_annotation_box = {}
453
- whole_page_img_annotation_box["xmin"] = whole_page_image_x1
454
- whole_page_img_annotation_box["ymin"] = whole_page_image_y1
455
- whole_page_img_annotation_box["xmax"] = whole_page_image_x2
456
- whole_page_img_annotation_box["ymax"] = whole_page_image_y2
457
  whole_page_img_annotation_box["color"] = (0,0,0)
458
  whole_page_img_annotation_box["label"] = "Whole page"
459
 
@@ -461,6 +407,27 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
461
 
462
  return whole_page_img_annotation_box
463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  def prepare_image_or_pdf(
465
  file_paths: List[str],
466
  in_redact_method: str,
@@ -471,6 +438,11 @@ def prepare_image_or_pdf(
471
  all_annotations_object:List = [],
472
  prepare_for_review:bool = False,
473
  in_fully_redacted_list:List[int]=[],
 
 
 
 
 
474
  progress: Progress = Progress(track_tqdm=True)
475
  ) -> tuple[List[str], List[str]]:
476
  """
@@ -489,7 +461,11 @@ def prepare_image_or_pdf(
489
  all_annotations_object(optional, List of annotation objects): All annotations for current document
490
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
491
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
492
- progress (optional, Progress): Progress tracker for the operation.
 
 
 
 
493
 
494
 
495
  Returns:
@@ -498,50 +474,34 @@ def prepare_image_or_pdf(
498
 
499
  tic = time.perf_counter()
500
  json_from_csv = False
 
 
 
 
 
 
501
 
502
  if isinstance(in_fully_redacted_list, pd.DataFrame):
503
- in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
 
504
 
505
  # If this is the first time around, set variables to 0/blank
506
  if first_loop_state==True:
507
- print("first_loop_state is True")
508
  latest_file_completed = 0
509
  out_message = []
510
  all_annotations_object = []
511
  else:
512
- print("Now attempting file:", str(latest_file_completed))
513
-
514
- # This is only run when a new page is loaded, so can reset page loop values. If end of last file (99), current loop number set to 999
515
- # if latest_file_completed == 99:
516
- # current_loop_page_number = 999
517
- # page_break_return = False
518
- # else:
519
- # current_loop_page_number = 0
520
- # page_break_return = False
521
-
522
  # If out message or converted_file_paths are blank, change to a list so it can be appended to
523
- if isinstance(out_message, str):
524
- out_message = [out_message]
525
-
526
- converted_file_paths = []
527
- image_file_paths = []
528
- pymupdf_doc = []
529
- review_file_csv = pd.DataFrame()
530
 
531
- if not file_paths:
532
- file_paths = []
533
 
534
- if isinstance(file_paths, dict):
535
- file_paths = os.path.abspath(file_paths["name"])
536
 
537
- if isinstance(file_paths, str):
538
- file_path_number = 1
539
- else:
540
- file_path_number = len(file_paths)
541
-
542
- #print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
543
- print("Number of file paths:", file_path_number)
544
- print("Latest_file_completed:", latest_file_completed)
545
 
546
  latest_file_completed = int(latest_file_completed)
547
 
@@ -552,9 +512,7 @@ def prepare_image_or_pdf(
552
  final_out_message = '\n'.join(out_message)
553
  else:
554
  final_out_message = out_message
555
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
556
-
557
- #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
558
 
559
  progress(0.1, desc='Preparing file')
560
 
@@ -586,16 +544,23 @@ def prepare_image_or_pdf(
586
  if not file_path:
587
  out_message = "Please select a file."
588
  print(out_message)
589
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
590
-
591
  file_extension = os.path.splitext(file_path)[1].lower()
592
 
593
  # If a pdf, load as a pymupdf document
594
  if is_pdf(file_path):
595
  pymupdf_doc = pymupdf.open(file_path)
 
596
 
597
  converted_file_path = file_path
598
- image_file_paths = process_file(file_path, prepare_for_review)
 
 
 
 
 
 
599
 
600
  #Create base version of the annotation object that doesn't have any annotations in it
601
  if (not all_annotations_object) & (prepare_for_review == True):
@@ -604,6 +569,7 @@ def prepare_image_or_pdf(
604
  for image_path in image_file_paths:
605
  annotation = {}
606
  annotation["image"] = image_path
 
607
 
608
  all_annotations_object.append(annotation)
609
 
@@ -617,24 +583,25 @@ def prepare_image_or_pdf(
617
 
618
  img = Image.open(file_path) # Open the image file
619
  rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
620
- page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
621
- page.insert_image(rect, filename=file_path) # Insert the image into the page
 
622
 
623
  file_path_str = str(file_path)
624
 
625
- image_file_paths = process_file(file_path_str, prepare_for_review)
626
 
627
- #print("image_file_paths:", image_file_paths)
628
 
629
- converted_file_path = output_folder + file_name_with_ext
 
630
 
631
- pymupdf_doc.save(converted_file_path)
632
 
633
- print("Inserted image into PDF file")
634
 
635
  elif file_extension in ['.csv']:
636
  review_file_csv = read_file(file)
637
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
638
  json_from_csv = True
639
  print("Converted CSV review file to json")
640
 
@@ -642,7 +609,6 @@ def prepare_image_or_pdf(
642
  if (file_extension in ['.json']) | (json_from_csv == True):
643
 
644
  if (file_extension in ['.json']) & (prepare_for_review == True):
645
- print("Preparing file for review")
646
  if isinstance(file_path, str):
647
  with open(file_path, 'r') as json_file:
648
  all_annotations_object = json.load(json_file)
@@ -651,18 +617,19 @@ def prepare_image_or_pdf(
651
  all_annotations_object = json.loads(file_path) # Use loads for string content
652
 
653
  # Assume it's a textract json
654
- elif (file_extension in ['.json']) & (prepare_for_review != True):
655
- # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
656
- json_contents = json.load(file_path)
657
- # Write the response to a JSON file in output folder
658
- out_folder = output_folder + file_path_without_ext + ".json"
659
- with open(out_folder, 'w') as json_file:
660
- json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
 
 
661
  continue
662
 
663
  # If you have an annotations object from the above code
664
  if all_annotations_object:
665
- #print("out_annotations_object before reloading images:", all_annotations_object)
666
 
667
  # Get list of page numbers
668
  image_file_paths_pages = [
@@ -674,11 +641,6 @@ def prepare_image_or_pdf(
674
 
675
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
676
  if image_file_paths:
677
- #print("Image file paths found")
678
-
679
- #print("Image_file_paths:", image_file_paths)
680
-
681
- #for i, annotation in enumerate(all_annotations_object):
682
  for i, image_file_path in enumerate(image_file_paths):
683
 
684
  if i < len(all_annotations_object):
@@ -687,18 +649,15 @@ def prepare_image_or_pdf(
687
  annotation = {}
688
  all_annotations_object.append(annotation)
689
 
690
- #print("annotation:", annotation, "for page:", str(i))
691
  try:
692
  if not annotation:
693
  annotation = {"image":"", "boxes": []}
694
  annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
695
-
696
  else:
697
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
698
  except Exception as e:
699
  print("Extracting page number from image failed due to:", e)
700
  annotation_page_number = 0
701
- #print("Annotation page number:", annotation_page_number)
702
 
703
  # Check if the annotation page number exists in the image file paths pages
704
  if annotation_page_number in image_file_paths_pages:
@@ -711,40 +670,66 @@ def prepare_image_or_pdf(
711
 
712
  all_annotations_object[i] = annotation
713
 
714
- #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
 
 
715
 
716
  # Get list of pages that are to be fully redacted and redact them
717
- if in_fully_redacted_list:
718
  print("Redacting whole pages")
719
 
720
  for i, image in enumerate(image_file_paths):
721
  page = pymupdf_doc.load_page(i)
722
  rect_height = page.rect.height
723
  rect_width = page.rect.width
724
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
725
 
726
  all_annotations_object.append(whole_page_img_annotation_box)
727
 
728
  # Write the response to a JSON file in output folder
729
  out_folder = output_folder + file_path_without_ext + ".json"
730
- with open(out_folder, 'w') as json_file:
731
- json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
732
  continue
733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  # Must be something else, return with error message
735
  else:
736
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
737
  if is_pdf_or_image(file_path) == False:
738
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
739
  print(out_message)
740
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
741
 
742
  elif in_redact_method == text_ocr_option:
743
  if is_pdf(file_path) == False:
744
  out_message = "Please upload a PDF file for text analysis."
745
  print(out_message)
746
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
747
-
748
 
749
  converted_file_paths.append(converted_file_path)
750
  image_file_paths.extend(image_file_path)
@@ -759,32 +744,26 @@ def prepare_image_or_pdf(
759
 
760
  number_of_pages = len(image_file_paths)
761
 
762
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
763
 
764
- def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
765
  file_path_without_ext = get_file_name_without_type(in_file_path)
766
 
767
  out_file_paths = out_text_file_path
768
 
769
- # Convert annotated text pdf back to image to give genuine redactions
770
- print("Creating image version of redacted PDF to embed redactions.")
771
-
772
- pdf_text_image_paths = process_file(out_text_file_path[0])
773
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
774
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
775
 
776
- # out_file_paths.append(out_text_image_file_path)
777
-
778
  out_file_paths = [out_text_image_file_path]
779
 
780
  out_message = "PDF " + file_path_without_ext + " converted to image-based file."
781
  print(out_message)
782
 
783
- #print("Out file paths:", out_file_paths)
784
-
785
  return out_message, out_file_paths
786
 
787
- def join_values_within_threshold(df1, df2):
788
  # Threshold for matching
789
  threshold = 5
790
 
@@ -812,94 +791,598 @@ def join_values_within_threshold(df1, df2):
812
 
813
  # Clean up extra columns
814
  final_df = final_df.drop(columns=['key'])
815
- print(final_df)
816
 
817
-
818
- def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
819
  '''
820
- Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
821
  '''
822
- # Flatten the data
823
- flattened_annotation_data = []
824
-
825
- if not isinstance(redaction_decision_output, pd.DataFrame):
826
- redaction_decision_output = pd.DataFrame()
827
-
828
- for annotation in all_annotations:
829
- #print("annotation:", annotation)
830
- #print("flattened_data:", flattened_data)
831
- image_path = annotation["image"]
832
-
833
- # Use regex to find the number before .png
834
- match = re.search(r'_(\d+)\.png$', image_path)
835
- if match:
836
- number = match.group(1) # Extract the number
837
- #print(number) # Output: 0
838
- reported_number = int(number) + 1
 
 
839
  else:
840
- print("No number found before .png. Returning page 1.")
841
- reported_number = 1
842
 
843
- # Check if 'boxes' is in the annotation, if not, add an empty list
844
- if 'boxes' not in annotation:
845
- annotation['boxes'] = []
846
 
847
- for box in annotation["boxes"]:
848
- if 'text' not in box:
849
- data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
850
- else:
851
- data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
852
- #print("data_to_add:", data_to_add)
853
- flattened_annotation_data.append(data_to_add)
854
 
855
- # Convert to a DataFrame
856
- annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
857
 
858
- #print("redaction_decision_output:", redaction_decision_output)
859
- #print("annotation_data_as_df:", annotation_data_as_df)
860
 
861
- # Join on additional text data from decision output results if included, if text not already there
862
- if not redaction_decision_output.empty:
863
- #print("redaction_decision_output is not empty")
864
- #print("redaction_decision_output:", redaction_decision_output)
865
- #print("annotation_data_as_df:", annotation_data_as_df)
866
- redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
867
- annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
868
- redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
869
-
870
- # Round to the closest number divisible by 5
871
- redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
872
-
873
- redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
874
 
875
- #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
876
 
877
- annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878
 
879
- annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
880
 
881
- annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
 
882
 
883
- annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
 
884
 
885
  # Ensure required columns exist, filling with blank if they don't
886
- for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
887
- if col not in annotation_data_as_df.columns:
888
- annotation_data_as_df[col] = ''
 
 
 
 
 
 
 
889
 
890
- for col in ['xmin', 'xmax', 'ymin', 'ymax']:
891
- annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
892
 
893
- annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
894
 
895
- return annotation_data_as_df
896
 
897
- def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
 
 
898
  '''
899
- Convert a review csv to a json file for use by the Gradio Annotation object
900
  '''
 
 
 
 
 
 
 
 
 
 
 
 
901
  # Keep only necessary columns
902
- review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
 
 
 
903
 
904
  # Group the DataFrame by the 'image' column
905
  grouped_csv_pages = review_file_df.groupby('page')
@@ -907,15 +1390,16 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
907
  # Create a list to hold the JSON data
908
  json_data = []
909
 
910
- for n, pdf_image_path in enumerate(image_paths):
911
- reported_page_number = int(n + 1)
 
912
 
913
  if reported_page_number in review_file_df["page"].values:
914
 
915
  # Convert each relevant group to a list of box dictionaries
916
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
917
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
918
-
919
  annotation = {
920
  "image": pdf_image_path,
921
  "boxes": annotation_boxes
@@ -924,6 +1408,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
924
  else:
925
  annotation = {}
926
  annotation["image"] = pdf_image_path
 
927
 
928
  # Append the structured data to the json_data list
929
  json_data.append(annotation)
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
6
  import time
7
  import json
8
+ import numpy as np
9
  import pymupdf
10
+ from pymupdf import Document, Page, Rect
11
  import pandas as pd
12
+ import shutil
13
+ import zipfile
14
+ from collections import defaultdict
15
  from tqdm import tqdm
16
  from gradio import Progress
17
+ from typing import List, Optional, Dict, Any
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from pdf2image import convert_from_path
20
+ from PIL import Image
21
+ from scipy.spatial import cKDTree
22
+
23
+ IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
24
+
25
+ pd.set_option('future.no_silent_downcasting', True)
26
+
27
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
28
+ from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
29
+ # from tools.aws_textract import load_and_convert_textract_json
30
 
31
+ image_dpi = float(IMAGES_DPI)
32
+ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
33
+ else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
34
+ ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
35
 
36
  def is_pdf_or_image(filename):
37
  """
 
61
  """
62
  return filename.lower().endswith(".pdf")
63
 
 
64
  ## Convert pdf to image if necessary
65
 
66
+ def check_image_size_and_reduce(out_path:str, image:Image):
67
+ '''
68
+ Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
69
+ '''
70
 
71
+ all_img_details = []
72
+ page_num = 0
 
73
 
74
+ # Check file size and resize if necessary
75
+ max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
76
+ file_size = os.path.getsize(out_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ width = image.width
79
+ height = image.height
80
 
81
+ # Resize images if they are too big
82
+ if file_size > max_size:
83
+ # Start with the original image size
84
+
85
+ print(f"Image size before {width}x{height}, original file_size: {file_size}")
86
+
87
+ while file_size > max_size:
88
+ # Reduce the size by a factor (e.g., 50% of the current size)
89
+ new_width = int(width * 0.5)
90
+ new_height = int(height * 0.5)
91
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
92
+
93
+ # Save the resized image
94
+ image.save(out_path, format="PNG", optimize=True)
95
+
96
+ # Update the file size
97
+ file_size = os.path.getsize(out_path)
98
+ print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
99
+ else:
100
+ new_width = width
101
+ new_height = height
102
+
103
+
104
+ all_img_details.append((page_num, image, new_width, new_height))
105
+
106
+ return image, new_width, new_height, all_img_details, out_path
107
+
108
+ def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_dpi:float=image_dpi, create_images:bool = True, input_folder: str = INPUT_FOLDER) -> tuple[int, str, float, float]:
109
+
110
+ out_path_placeholder = "placeholder_image_" + str(page_num) + ".png"
111
+
112
+ if create_images == True:
113
+ try:
114
+ # Construct the full output directory path
115
+ image_output_dir = os.path.join(os.getcwd(), input_folder)
116
+ out_path = os.path.join(image_output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
117
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
118
 
119
+ if os.path.exists(out_path):
120
+ # Load existing image
121
+ image = Image.open(out_path)
122
+ elif pdf_path.lower().endswith(".pdf"):
123
+ # Convert PDF page to image
124
+ image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
125
+ dpi=image_dpi, use_cropbox=False, use_pdftocairo=False)
126
+ image = image_l[0]
127
+ image = image.convert("L")
128
 
129
+ image.save(out_path, format="PNG")
130
+ elif pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpeg"):
131
+ image = Image.open(pdf_path)
132
+ image.save(out_path, format="PNG")
133
 
134
+ width, height = image.size
135
+
136
+ # Check if image size too large and reduce if necessary
137
+ #print("Checking size of image and reducing if necessary.")
138
+ image, width, height, all_img_details, img_path = check_image_size_and_reduce(out_path, image)
139
+
140
+ return page_num, out_path, width, height
141
+
142
+ except Exception as e:
143
+ print(f"Error processing page {page_num + 1}: {e}")
144
+ return page_num, out_path_placeholder, pd.NA, pd.NA
145
+ else:
146
+ # print("Not creating image for page", page_num)
147
+ return page_num, out_path_placeholder, pd.NA, pd.NA
148
+
149
+ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, page_max:int = 0, create_images:bool=True, image_dpi: float = image_dpi, num_threads: int = 8, input_folder: str = INPUT_FOLDER):
150
 
151
+ # If preparing for review, just load the first page (not currently used)
152
  if prepare_for_review == True:
153
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
154
+ page_min = 0
155
+ page_max = page_count
156
  else:
157
  page_count = pdfinfo_from_path(pdf_path)['Pages']
158
 
159
  print(f"Number of pages in PDF: {page_count}")
160
 
161
+ # Set page max to length of pdf if not specified
162
+ if page_max == 0: page_max = page_count
163
+
164
  results = []
165
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
166
  futures = []
167
+ for page_num in range(page_min, page_max):
168
+ futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
169
 
170
+ for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages to image"):
171
+ page_num, img_path, width, height = future.result()
172
+ if img_path:
173
+ results.append((page_num, img_path, width, height))
174
  else:
175
  print(f"Page {page_num + 1} failed to process.")
176
+ results.append((page_num, "placeholder_image_" + str(page_num) + ".png", pd.NA, pd.NA))
177
 
178
  # Sort results by page number
179
  results.sort(key=lambda x: x[0])
180
  images = [result[1] for result in results]
181
+ widths = [result[2] for result in results]
182
+ heights = [result[3] for result in results]
183
 
184
  print("PDF has been converted to images.")
185
+ return images, widths, heights, results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
188
+ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False, input_folder:str=INPUT_FOLDER, create_images:bool=True):
189
  # Get the file extension
190
  file_extension = os.path.splitext(file_path)[1].lower()
191
+
192
  # Check if the file is an image type
193
  if file_extension in ['.jpg', '.jpeg', '.png']:
194
  print(f"{file_path} is an image file.")
195
  # Perform image processing here
196
  img_object = [file_path] #[Image.open(file_path)]
197
+
198
+ # Load images from the file paths. Test to see if it is bigger than 4.5 mb and reduct if needed (Textract limit is 5mb)
199
+ image = Image.open(file_path)
200
+ img_object, image_sizes_width, image_sizes_height, all_img_details, img_path = check_image_size_and_reduce(file_path, image)
201
+
202
+ if not isinstance(image_sizes_width, list):
203
+ img_path = [img_path]
204
+ image_sizes_width = [image_sizes_width]
205
+ image_sizes_height = [image_sizes_height]
206
+ all_img_details = [all_img_details]
207
+
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
  print(f"{file_path} is a PDF file. Converting to image set")
212
+
213
  # Run your function for processing PDF files here
214
+ img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
215
 
216
  else:
217
  print(f"{file_path} is not an image or PDF file.")
218
+ img_path = []
219
+ image_sizes_width = []
220
+ image_sizes_height = []
221
+ all_img_details = []
222
 
223
+ return img_path, image_sizes_width, image_sizes_height, all_img_details
224
 
225
  def get_input_file_names(file_input:List[str]):
226
  '''
 
230
  all_relevant_files = []
231
  file_name_with_extension = ""
232
  full_file_name = ""
233
+ total_pdf_page_count = 0
234
 
 
235
  if isinstance(file_input, dict):
236
  file_input = os.path.abspath(file_input["name"])
237
 
 
250
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
+ # Check if the file is in acceptable types
254
  if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
255
  all_relevant_files.append(file_path_without_ext)
256
  file_name_with_extension = file_path_without_ext + file_extension
257
  full_file_name = file_path
258
+
259
+ # If PDF, get number of pages
260
+ if (file_extension in ['.pdf']):
261
+ # Open the PDF file
262
+ pdf_document = pymupdf.open(file_path)
263
+ # Get the number of pages
264
+ page_count = pdf_document.page_count
265
+
266
+ # Close the document
267
+ pdf_document.close()
268
+ else:
269
+ page_count = 1
270
+
271
+ total_pdf_page_count += page_count
272
 
273
  all_relevant_files_str = ", ".join(all_relevant_files)
274
 
275
+ return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files, total_pdf_page_count
 
 
 
276
 
277
  def convert_color_to_range_0_1(color):
278
  return tuple(component / 255 for component in color)
279
 
280
  def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
281
+ '''
282
+ Commit redaction boxes to a PyMuPDF page.
283
+ '''
284
+
285
  pymupdf_x1 = pymupdf_rect[0]
286
  pymupdf_y1 = pymupdf_rect[1]
287
  pymupdf_x2 = pymupdf_rect[2]
 
297
  redact_bottom_y = middle_y - 1
298
  redact_top_y = middle_y + 1
299
 
 
300
 
301
  rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
302
 
 
323
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
324
  shape.commit()
325
 
326
+ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:float, y2:float, image: Image=None, image_dimensions:dict={}):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  '''
328
  Converts coordinates from pymupdf format to image coordinates,
329
  accounting for mediabox dimensions and offset.
 
339
  mediabox_height = mediabox.height
340
 
341
  # Get target image dimensions
342
+ if image:
343
+ image_page_width, image_page_height = image.size
344
+ elif image_dimensions:
345
+ image_page_width, image_page_height = image_dimensions['image_width'], image_dimensions['image_height']
346
+ else:
347
+ image_page_width, image_page_height = mediabox_width, mediabox_height
348
 
349
  # Calculate scaling factors
350
  image_to_mediabox_x_scale = image_page_width / mediabox_width
351
  image_to_mediabox_y_scale = image_page_height / mediabox_height
352
 
 
 
 
 
 
 
 
 
 
 
353
  # Adjust coordinates:
354
  # Apply scaling to match image dimensions
355
  x1_image = x1 * image_to_mediabox_x_scale
 
382
 
383
  return x1_image, y1_image, x2_image, y2_image
384
 
385
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
 
 
386
  # Small border to page that remains white
387
  border = 5
388
  # Define the coordinates for the Rect
389
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
390
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
391
 
392
+ # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
393
 
394
  # Create new image annotation element based on whole page coordinates
395
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
396
 
397
  # Write whole page annotation to annotation boxes
398
  whole_page_img_annotation_box = {}
399
+ whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
400
+ whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
401
+ whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
402
+ whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
403
  whole_page_img_annotation_box["color"] = (0,0,0)
404
  whole_page_img_annotation_box["label"] = "Whole page"
405
 
 
407
 
408
  return whole_page_img_annotation_box
409
 
410
def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
    """
    Collect per-page size information and original CropBoxes from a PyMuPDF document.

    Args:
        pymupdf_doc (Document): Open PyMuPDF document.
        image_sizes_width (List[float]): Rendered image widths, one per page (may be empty).
        image_sizes_height (List[float]): Rendered image heights, one per page (may be empty).
        image_file_paths (List[str]): Rendered image paths, one per page.
            # assumes all three lists are indexed in page order and at least as
            # long as the document - TODO confirm against callers

    Returns:
        tuple: (page_sizes, original_cropboxes) where page_sizes is a list of
        dicts with page/image/mediabox/cropbox dimensions and original_cropboxes
        holds each page's CropBox as it was before any later modification.
    """
    page_sizes = []
    original_cropboxes = []

    # Iterate by index: load_page() already yields the page object we need, so
    # the previous redundant iteration over the document itself is removed.
    for page_no in range(pymupdf_doc.page_count):
        reported_page_no = page_no + 1

        pymupdf_page = pymupdf_doc.load_page(page_no)
        original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox

        # If images have been created, image width and height come from the
        # rendered images. Otherwise they are recorded as NA.
        if image_sizes_width and image_sizes_height:
            image_width = image_sizes_width[page_no]
            image_height = image_sizes_height[page_no]
        else:
            image_width = pd.NA
            image_height = pd.NA

        page_sizes.append({
            "page": reported_page_no,
            "image_path": image_file_paths[page_no],
            "image_width": image_width,
            "image_height": image_height,
            "mediabox_width": pymupdf_page.mediabox.width,
            "mediabox_height": pymupdf_page.mediabox.height,
            "cropbox_width": pymupdf_page.cropbox.width,
            "cropbox_height": pymupdf_page.cropbox.height,
            "original_cropbox": original_cropboxes[-1],
        })

    return page_sizes, original_cropboxes
430
+
431
  def prepare_image_or_pdf(
432
  file_paths: List[str],
433
  in_redact_method: str,
 
438
  all_annotations_object:List = [],
439
  prepare_for_review:bool = False,
440
  in_fully_redacted_list:List[int]=[],
441
+ output_folder:str=OUTPUT_FOLDER,
442
+ input_folder:str=INPUT_FOLDER,
443
+ prepare_images:bool=True,
444
+ page_sizes:list[dict]=[],
445
+ textract_output_found:bool = False,
446
  progress: Progress = Progress(track_tqdm=True)
447
  ) -> tuple[List[str], List[str]]:
448
  """
 
461
  all_annotations_object(optional, List of annotation objects): All annotations for current document
462
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
463
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
464
+ output_folder (optional, str): The output folder for file save
465
+ prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
466
+ page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
467
+ textract_output_found (optional, bool): A boolean indicating whether textract output has already been found . Defaults to False.
468
+ progress (optional, Progress): Progress tracker for the operation
469
 
470
 
471
  Returns:
 
474
 
475
  tic = time.perf_counter()
476
  json_from_csv = False
477
+ original_cropboxes = [] # Store original CropBox values
478
+ converted_file_paths = []
479
+ image_file_paths = []
480
+ pymupdf_doc = []
481
+ all_img_details = []
482
+ review_file_csv = pd.DataFrame()
483
 
484
  if isinstance(in_fully_redacted_list, pd.DataFrame):
485
+ if not in_fully_redacted_list.empty:
486
+ in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
487
 
488
  # If this is the first time around, set variables to 0/blank
489
  if first_loop_state==True:
 
490
  latest_file_completed = 0
491
  out_message = []
492
  all_annotations_object = []
493
  else:
494
+ print("Now redacting file", str(latest_file_completed))
495
+
 
 
 
 
 
 
 
 
496
  # If out message or converted_file_paths are blank, change to a list so it can be appended to
497
+ if isinstance(out_message, str): out_message = [out_message]
 
 
 
 
 
 
498
 
499
+ if not file_paths: file_paths = []
 
500
 
501
+ if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
 
502
 
503
+ if isinstance(file_paths, str): file_path_number = 1
504
+ else: file_path_number = len(file_paths)
 
 
 
 
 
 
505
 
506
  latest_file_completed = int(latest_file_completed)
507
 
 
512
  final_out_message = '\n'.join(out_message)
513
  else:
514
  final_out_message = out_message
515
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
 
 
516
 
517
  progress(0.1, desc='Preparing file')
518
 
 
544
  if not file_path:
545
  out_message = "Please select a file."
546
  print(out_message)
547
+ raise Exception(out_message)
548
+
549
  file_extension = os.path.splitext(file_path)[1].lower()
550
 
551
  # If a pdf, load as a pymupdf document
552
  if is_pdf(file_path):
553
  pymupdf_doc = pymupdf.open(file_path)
554
+ pymupdf_pages = pymupdf_doc.page_count
555
 
556
  converted_file_path = file_path
557
+
558
+ if prepare_images==True:
559
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path, prepare_for_review, input_folder, create_images=True)
560
+ else:
561
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path, prepare_for_review, input_folder, create_images=False)
562
+
563
+ page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
564
 
565
  #Create base version of the annotation object that doesn't have any annotations in it
566
  if (not all_annotations_object) & (prepare_for_review == True):
 
569
  for image_path in image_file_paths:
570
  annotation = {}
571
  annotation["image"] = image_path
572
+ annotation["boxes"] = []
573
 
574
  all_annotations_object.append(annotation)
575
 
 
583
 
584
  img = Image.open(file_path) # Open the image file
585
  rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
586
+ pymupdf_page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
587
+ pymupdf_page.insert_image(rect, filename=file_path) # Insert the image into the page
588
+ pymupdf_page = pymupdf_doc.load_page(0)
589
 
590
  file_path_str = str(file_path)
591
 
592
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
593
 
 
594
 
595
+ # Create a page_sizes_object
596
+ page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
597
 
598
+ converted_file_path = output_folder + file_name_with_ext
599
 
600
+ pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
601
 
602
  elif file_extension in ['.csv']:
603
  review_file_csv = read_file(file)
604
+ all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
605
  json_from_csv = True
606
  print("Converted CSV review file to json")
607
 
 
609
  if (file_extension in ['.json']) | (json_from_csv == True):
610
 
611
  if (file_extension in ['.json']) & (prepare_for_review == True):
 
612
  if isinstance(file_path, str):
613
  with open(file_path, 'r') as json_file:
614
  all_annotations_object = json.load(json_file)
 
617
  all_annotations_object = json.loads(file_path) # Use loads for string content
618
 
619
  # Assume it's a textract json
620
+ elif (file_extension == '.json') and (prepare_for_review is not True):
621
+ # Copy it to the output folder so it can be used later.
622
+ out_textract_path = os.path.join(output_folder, file_path_without_ext + "_textract.json")
623
+
624
+ # Use shutil to copy the file directly
625
+ shutil.copy2(file_path, out_textract_path) # Preserves metadata
626
+
627
+ textract_output_found = True
628
+
629
  continue
630
 
631
  # If you have an annotations object from the above code
632
  if all_annotations_object:
 
633
 
634
  # Get list of page numbers
635
  image_file_paths_pages = [
 
641
 
642
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
643
  if image_file_paths:
 
 
 
 
 
644
  for i, image_file_path in enumerate(image_file_paths):
645
 
646
  if i < len(all_annotations_object):
 
649
  annotation = {}
650
  all_annotations_object.append(annotation)
651
 
 
652
  try:
653
  if not annotation:
654
  annotation = {"image":"", "boxes": []}
655
  annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
 
656
  else:
657
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
658
  except Exception as e:
659
  print("Extracting page number from image failed due to:", e)
660
  annotation_page_number = 0
 
661
 
662
  # Check if the annotation page number exists in the image file paths pages
663
  if annotation_page_number in image_file_paths_pages:
 
670
 
671
  all_annotations_object[i] = annotation
672
 
673
+
674
+ if isinstance(in_fully_redacted_list, list):
675
+ in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
676
 
677
  # Get list of pages that are to be fully redacted and redact them
678
+ if not in_fully_redacted_list.empty:
679
  print("Redacting whole pages")
680
 
681
  for i, image in enumerate(image_file_paths):
682
  page = pymupdf_doc.load_page(i)
683
  rect_height = page.rect.height
684
  rect_width = page.rect.width
685
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5, image_dimensions={"image_width":image_sizes_width[i], "image_height":image_sizes_height[i]})
686
 
687
  all_annotations_object.append(whole_page_img_annotation_box)
688
 
689
  # Write the response to a JSON file in output folder
690
  out_folder = output_folder + file_path_without_ext + ".json"
691
+ # with open(out_folder, 'w') as json_file:
692
+ # json.dump(all_annotations_object, json_file, separators=(",", ":"))
693
  continue
694
 
695
+ # If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
696
+ elif file_extension in ['.zip']:
697
+
698
+ # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
699
+ out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
700
+
701
+ # Use shutil to copy the file directly
702
+ # Open the ZIP file to check its contents
703
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
704
+ json_files = [f for f in zip_ref.namelist() if f.lower().endswith('.json')]
705
+
706
+ if len(json_files) == 1: # Ensure only one JSON file exists
707
+ json_filename = json_files[0]
708
+
709
+ # Extract the JSON file to the same directory as the ZIP file
710
+ extracted_path = os.path.join(os.path.dirname(file_path), json_filename)
711
+ zip_ref.extract(json_filename, os.path.dirname(file_path))
712
+
713
+ # Move the extracted JSON to the intended output location
714
+ shutil.move(extracted_path, out_folder)
715
+
716
+ textract_output_found = True
717
+ else:
718
+ print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
719
+
720
  # Must be something else, return with error message
721
  else:
722
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
723
  if is_pdf_or_image(file_path) == False:
724
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
725
  print(out_message)
726
+ raise Exception(out_message)
727
 
728
  elif in_redact_method == text_ocr_option:
729
  if is_pdf(file_path) == False:
730
  out_message = "Please upload a PDF file for text analysis."
731
  print(out_message)
732
+ raise Exception(out_message)
 
733
 
734
  converted_file_paths.append(converted_file_path)
735
  image_file_paths.extend(image_file_path)
 
744
 
745
  number_of_pages = len(image_file_paths)
746
 
747
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
748
 
749
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
    """
    Rasterise a text-redacted PDF into an image-based PDF so redactions become pixels.

    Args:
        in_file_path (str): Path of the source PDF (used only to derive the output file name).
        out_text_file_path (List[str]): Paths of the text-redacted PDF; the first entry is converted.
        image_dpi (float): Resolution passed to the PDF save call.
        output_folder (str): Folder the image-based PDF is written to.
        input_folder (str): Folder passed through to the image-creation helper.

    Returns:
        tuple: (status message, list containing the single output PDF path).
    """
    base_name = get_file_name_without_type(in_file_path)

    # Render every page of the annotated text PDF back to images so redactions
    # are baked into the page pixels rather than remaining removable overlays.
    page_images, _widths, _heights, _img_details = process_file_for_image_creation(
        out_text_file_path[0], input_folder=input_folder
    )

    out_text_image_file_path = output_folder + base_name + "_text_redacted_as_img.pdf"

    first_page, *remaining_pages = page_images
    first_page.save(out_text_image_file_path, "PDF", resolution=image_dpi, save_all=True, append_images=remaining_pages)

    out_message = "PDF " + base_name + " converted to image-based file."
    print(out_message)

    return out_message, [out_text_image_file_path]
765
 
766
+ def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
767
  # Threshold for matching
768
  threshold = 5
769
 
 
791
 
792
  # Clean up extra columns
793
  final_df = final_df.drop(columns=['key'])
 
794
 
795
def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
    '''
    Collapse duplicate page entries in an annotator object.

    Entries are grouped by their 'image' path. Within each group the first entry
    that has a non-empty 'boxes' list is kept (after stripping the 'text' key
    from every box of every non-empty entry, in place); if no entry in the group
    has boxes, the group's first entry is kept unchanged.
    '''
    # Group entries by image path, preserving first-seen order.
    grouped = {}
    for entry in data:
        grouped.setdefault(entry['image'], []).append(entry)

    deduplicated = []
    for entries in grouped.values():
        # Entries whose 'boxes' list is present and non-empty take priority.
        with_boxes = [entry for entry in entries if entry.get('boxes')]

        # Strip the 'text' element out of each box (mutates the input entries).
        for entry in with_boxes:
            if 'boxes' in entry:
                entry['boxes'] = [
                    {key: value for key, value in box.items() if key != 'text'}
                    for box in entry['boxes']
                ]

        # Prefer the first entry with real boxes; otherwise fall back to the
        # first entry of the group so every page keeps exactly one record.
        deduplicated.append(with_boxes[0] if with_boxes else entries[0])

    return deduplicated
 
 
823
 
824
def divide_coordinates_by_page_sizes(review_file_df:pd.DataFrame, page_sizes_df:pd.DataFrame, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax"):
    '''
    Convert coordinates to a single relative (0-1) coordinate system.

    Rows whose coordinate values are all greater than one are treated as absolute
    image coordinates and divided by the page's image dimensions (falling back to
    mediabox dimensions when no image sizes exist); rows already in relative form
    pass through unchanged.

    Args:
        review_file_df (pd.DataFrame): Annotation rows with 'page' and coordinate columns.
        page_sizes_df (pd.DataFrame): Per-page sizes with 'page', 'image_width',
            'image_height', 'mediabox_width' and 'mediabox_height' columns.
        xmin/xmax/ymin/ymax (str): Names of the coordinate columns.

    Returns:
        pd.DataFrame: Converted rows, sorted by page/position where possible, with
        the temporary page-size columns removed.
    '''
    review_file_df_out = review_file_df

    if xmin in review_file_df.columns and not review_file_df.empty:
        # Rows already in relative coordinates (all values <= 1) need no conversion
        review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]

        # Rows in absolute coordinates (all values > 1) will be divided by page size
        review_file_df = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]

        review_file_df.loc[:, "page"] = pd.to_numeric(review_file_df["page"], errors="coerce")

        review_file_df_div = review_file_df

        if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
            # "<NA>" strings can appear when page sizes were round-tripped through CSV
            page_sizes_df["image_width"] = page_sizes_df["image_width"].replace("<NA>", pd.NA)
            page_sizes_df["image_height"] = page_sizes_df["image_height"].replace("<NA>", pd.NA)
            review_file_df_div = review_file_df_div.merge(page_sizes_df[["page", "image_width", "image_height", "mediabox_width", "mediabox_height"]], on="page", how="left")

        if "image_width" in review_file_df_div.columns:
            # If all image sizes are NaN, assume only mediabox coordinates are available
            if review_file_df_div["image_width"].isna().all():
                review_file_df_div["image_width"] = review_file_df_div["image_width"].fillna(review_file_df_div["mediabox_width"]).infer_objects()
                review_file_df_div["image_height"] = review_file_df_div["image_height"].fillna(review_file_df_div["mediabox_height"]).infer_objects()

            convert_type_cols = ["image_width", "image_height", xmin, xmax, ymin, ymax]
            review_file_df_div[convert_type_cols] = review_file_df_div[convert_type_cols].apply(pd.to_numeric, errors="coerce")

            review_file_df_div[xmin] = review_file_df_div[xmin] / review_file_df_div["image_width"]
            review_file_df_div[xmax] = review_file_df_div[xmax] / review_file_df_div["image_width"]
            review_file_df_div[ymin] = review_file_df_div[ymin] / review_file_df_div["image_height"]
            review_file_df_div[ymax] = review_file_df_div[ymax] / review_file_df_div["image_height"]

        # Concatenate the original and modified DataFrames
        dfs_to_concat = [df for df in [review_file_df_orig, review_file_df_div] if not df.empty]
        if dfs_to_concat: # Ensure there's at least one non-empty DataFrame
            review_file_df_out = pd.concat(dfs_to_concat)
        else:
            review_file_df_out = review_file_df # Return an original DataFrame instead of raising an error

        # Only sort if the DataFrame is not empty and contains the required columns
        required_sort_columns = {"page", xmin, ymin}
        if not review_file_df_out.empty and required_sort_columns.issubset(review_file_df_out.columns):
            review_file_df_out.sort_values(["page", ymin, xmin], inplace=True)

        # Bug fix: DataFrame.drop returns a new frame; previously the result was
        # discarded, leaving the merged page-size columns in the output.
        review_file_df_out = review_file_df_out.drop(["image_width", "image_height", "mediabox_width", "mediabox_height"], axis=1, errors="ignore")

    return review_file_df_out
873
+
874
def multiply_coordinates_by_page_sizes(review_file_df: pd.DataFrame, page_sizes_df: pd.DataFrame, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax"):
    '''
    Convert relative (0-1) coordinates to absolute image coordinates.

    Rows whose coordinate values are all at most one are multiplied by the page's
    image dimensions (merged in from page_sizes_df when absent); rows already in
    absolute form (all values greater than one) pass through unchanged.

    Args:
        review_file_df (pd.DataFrame): Annotation rows with 'page' and coordinate columns.
        page_sizes_df (pd.DataFrame): Per-page sizes with 'page', 'image_width' and 'image_height' columns.
        xmin/xmax/ymin/ymax (str): Names of the coordinate columns.

    Returns:
        pd.DataFrame: The converted DataFrame, sorted by page/position where possible.
    '''
    if xmin in review_file_df.columns and not review_file_df.empty:
        # Separate absolute vs relative coordinates
        review_file_df_orig = review_file_df.loc[
            (review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
            (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1), :].copy()

        review_file_df = review_file_df.loc[
            (review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) &
            (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1), :].copy()

        if review_file_df.empty:
            return review_file_df_orig # If nothing is left, return the original absolute-coordinates DataFrame

        review_file_df.loc[:, "page"] = pd.to_numeric(review_file_df["page"], errors="coerce")

        if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
            page_sizes_df[['image_width', 'image_height']] = page_sizes_df[['image_width','image_height']].replace("<NA>", pd.NA) # Ensure proper NA handling
            review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")

        if "image_width" in review_file_df.columns:
            # Split into rows with/without image size info
            review_file_df_not_na = review_file_df.loc[review_file_df["image_width"].notna()].copy()
            review_file_df_na = review_file_df.loc[review_file_df["image_width"].isna()].copy()

            if not review_file_df_not_na.empty:
                convert_type_cols = ["image_width", "image_height", xmin, xmax, ymin, ymax]
                review_file_df_not_na[convert_type_cols] = review_file_df_not_na[convert_type_cols].apply(pd.to_numeric, errors="coerce")

                # Multiply coordinates by image sizes
                review_file_df_not_na[xmin] *= review_file_df_not_na["image_width"]
                review_file_df_not_na[xmax] *= review_file_df_not_na["image_width"]
                review_file_df_not_na[ymin] *= review_file_df_not_na["image_height"]
                review_file_df_not_na[ymax] *= review_file_df_not_na["image_height"]

            # Concatenate the modified and unmodified data
            review_file_df = pd.concat([df for df in [review_file_df_not_na, review_file_df_na] if not df.empty])

        # Merge with the original absolute-coordinates DataFrame
        dfs_to_concat = [df for df in [review_file_df_orig, review_file_df] if not df.empty]
        if dfs_to_concat: # Ensure there's at least one non-empty DataFrame
            review_file_df = pd.concat(dfs_to_concat)
        else:
            review_file_df = pd.DataFrame() # Return an empty DataFrame instead of raising an error

        # Only sort if the DataFrame is not empty and contains the required columns.
        # Bug fix: use the configurable column-name parameters rather than the
        # hard-coded "xmin"/"ymin" strings, so sorting also works (and matches
        # divide_coordinates_by_page_sizes) when non-default column names are used.
        required_sort_columns = {"page", xmin, ymin}
        if not review_file_df.empty and required_sort_columns.issubset(review_file_df.columns):
            review_file_df.sort_values(["page", xmin, ymin], inplace=True)

    return review_file_df
927
+
928
+
929
def do_proximity_match_by_page_for_text(df1:pd.DataFrame, df2:pd.DataFrame):
    '''
    Match text from one dataframe to another based on proximity matching of coordinates page by page.

    First an exact match on (xmin, ymin, xmax, ymax, label, page) is attempted; any
    df1 row still lacking text is then matched to the nearest df2 box on the same
    page within a fixed coordinate tolerance via a per-page KDTree lookup.

    NOTE: both df1 and df2 are modified in place - a temporary 'key' column is
    added to each (and a 'text' column is created on either frame if missing);
    the 'key' column is removed only from the returned merged frame.

    Args:
        df1 (pd.DataFrame): Boxes to receive text; needs xmin/ymin/xmax/ymax/label/page columns.
        df2 (pd.DataFrame): Boxes supplying text; same required columns.

    Returns:
        pd.DataFrame: df1's rows with 'text' filled in where a match was found.
    '''

    if not 'text' in df2.columns: df2['text'] = ''
    if not 'text' in df1.columns: df1['text'] = ''

    # Create a unique key based on coordinates and label for exact merge
    merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
    df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
    df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)

    # Attempt exact merge first
    merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))

    # If a match is found, keep that text; otherwise, keep the original df1 text
    merged_df['text'] = np.where(
        merged_df['text'].isna() | (merged_df['text'] == ''),
        merged_df.pop('text_duplicate'),
        merged_df['text']
    )

    # Define tolerance for proximity matching
    # (assumes coordinates are in relative 0-1 units - TODO confirm against callers)
    tolerance = 0.02

    # Precompute KDTree for each page in df2
    # (pages containing any non-finite coordinates are skipped, since cKDTree
    # cannot index NaN/inf values)
    page_trees = {}
    for page in df2['page'].unique():
        df2_page = df2[df2['page'] == page]
        coords = df2_page[['xmin', 'ymin', 'xmax', 'ymax']].values
        if np.all(np.isfinite(coords)) and len(coords) > 0:
            page_trees[page] = (cKDTree(coords), df2_page)

    # Perform proximity matching
    for i, row in df1.iterrows():
        page_number = row['page']

        if page_number in page_trees:
            tree, df2_page = page_trees[page_number]

            # Query KDTree for nearest neighbor. query() returns length-1 arrays;
            # when no neighbour lies within the bound, dist is inf and idx equals
            # len(df2_page), so the guard below rejects it.
            dist, idx = tree.query([row[['xmin', 'ymin', 'xmax', 'ymax']].values], distance_upper_bound=tolerance)

            if dist[0] < tolerance and idx[0] < len(df2_page):
                merged_df.at[i, 'text'] = df2_page.iloc[idx[0]]['text']

    # Drop the temporary key column
    merged_df.drop(columns=['key'], inplace=True)

    return merged_df
980
+
981
+
982
def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, threshold:float=0.03):
    '''
    Match text from one dataframe to another based on proximity matching of coordinates across all pages.

    First an exact match on (xmin, ymin, xmax, ymax, label, page) is attempted; any
    df1 row still lacking text is then matched to the nearest df2 box (across the
    whole document) within `threshold` via a single KDTree lookup.

    NOTE: both df1 and df2 are modified in place - a temporary 'key' column is
    added to each (and a 'text' column is created on either frame if missing);
    the 'key' column is removed only from the returned merged frame.

    Args:
        df1 (pd.DataFrame): Boxes to receive text; needs xmin/ymin/xmax/ymax/label/page columns.
        df2 (pd.DataFrame): Boxes supplying text; same required columns.
        threshold (float): Maximum coordinate distance for a proximity match.

    Returns:
        pd.DataFrame: df1's rows with 'text' filled in where a match was found.
    '''

    if not 'text' in df2.columns: df2['text'] = ''
    if not 'text' in df1.columns: df1['text'] = ''

    # Create a unique key based on coordinates and label for exact merge
    merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
    df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
    df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)

    # Attempt exact merge first, renaming df2['text'] to avoid suffixes
    merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))

    # If a match is found, keep that text; otherwise, keep the original df1 text
    merged_df['text'] = np.where(
        merged_df['text'].isna() | (merged_df['text'] == ''),
        merged_df.pop('text_duplicate'),
        merged_df['text']
    )

    # Handle missing matches using a proximity-based approach
    # Convert coordinates to numpy arrays for KDTree lookup
    query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)

    # Check for NaN or infinite values in query_coords and filter them out.
    # NOTE(review): once rows are filtered out here, positions in query_coords no
    # longer line up with merged_df row positions, so the enumerate() index used
    # in the assignment loop below can write matched text to the wrong rows when
    # non-finite coordinates are present - confirm and fix.
    finite_mask = np.isfinite(query_coords).all(axis=1)
    if not finite_mask.all():
        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
        query_coords = query_coords[finite_mask] # Filter out rows with NaN or infinite values
    else:
        pass

    # Proceed only if query_coords is not empty
    if query_coords.size > 0:
        # Ensure df2 is filtered for finite values before creating the KDTree
        finite_mask_df2 = np.isfinite(df2[['xmin', 'ymin', 'xmax', 'ymax']].values).all(axis=1)
        df2_finite = df2[finite_mask_df2]

        # Create the KDTree with the filtered data
        tree = cKDTree(df2_finite[['xmin', 'ymin', 'xmax', 'ymax']].values)

        # Find nearest neighbors within a reasonable tolerance (e.g., 1% of page)
        tolerance = threshold
        distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)

        # Assign text values where matches are found
        # (unmatched queries come back with dist == inf and idx == len(df2_finite),
        # so the guard below rejects them)
        for i, (dist, idx) in enumerate(zip(distances, indices)):
            if dist < tolerance and idx < len(df2_finite):
                merged_df.at[i, 'text'] = df2_finite.iloc[idx]['text']

    # Drop the temporary key column
    merged_df.drop(columns=['key'], inplace=True)

    return merged_df
1039
+
1040
+
1041
+
1042
+
1043
+ def _extract_page_number(image_path: Any) -> int:
1044
+ """Helper function to safely extract page number."""
1045
+ if not isinstance(image_path, str):
1046
+ return 1
1047
+ match = IMAGE_NUM_REGEX.search(image_path)
1048
+ if match:
1049
+ try:
1050
+ return int(match.group(1)) + 1
1051
+ except (ValueError, TypeError):
1052
+ return 1
1053
+ return 1
1054
+
1055
def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
    '''
    Flatten a list of page-annotation dicts into a DataFrame with one row per box.

    Pages with no boxes still produce one row (box columns set to NA), and the
    page number is derived from each entry's image filename.
    '''
    core_box_columns = ["xmin", "xmax", "ymin", "ymax", "text"]

    if not all_annotations:
        # Preserve the expected schema even for empty input.
        return pd.DataFrame(columns=["image", "page"] + core_box_columns)

    # One row per page entry; coerce missing/malformed 'boxes' to an empty list.
    frame = pd.DataFrame({
        "image": [entry.get("image") for entry in all_annotations],
        "boxes": [entry.get("boxes") if isinstance(entry.get("boxes"), list) else [] for entry in all_annotations],
    })

    # Derive the 1-based page number from the image filename.
    frame["page"] = frame["image"].apply(_extract_page_number)

    # explode() drops rows whose list is empty, which would lose box-less pages.
    # Substitute a single all-NA placeholder box so each such page keeps one row.
    placeholder = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA}
    frame["boxes"] = frame["boxes"].apply(lambda boxes: boxes if boxes else [placeholder])

    # One row per box.
    exploded = frame.explode("boxes", ignore_index=True)

    # Expand each box dict into columns, guarding against stray non-dict entries.
    dict_rows = exploded["boxes"].notna() & exploded["boxes"].apply(isinstance, args=(dict,))
    box_frame = pd.json_normalize(exploded.loc[dict_rows, "boxes"])

    # Re-attach image/page; positional join is safe after resetting the index.
    combined = exploded.loc[dict_rows, ["image", "page"]].reset_index(drop=True).join(box_frame)

    # Guarantee the core box columns exist even if no box supplied them.
    for column in core_box_columns:
        if column not in combined.columns:
            combined[column] = pd.NA

    # Fixed leading columns, then any extra box attributes in sorted order.
    leading = ["image", "page"]
    extras = sorted(col for col in combined.columns if col not in leading and col not in core_box_columns)
    return combined.reindex(columns=leading + core_box_columns + extras, fill_value=pd.NA)
1115
+
1116
+
1117
+ # def convert_annotation_data_to_dataframe(all_annotations:List[dict]):
1118
+ # '''
1119
+ # Convert an annotation list of dictionaries to a dataframe with all boxes on a separate row
1120
+ # '''
1121
+ # # Flatten the data
1122
+ # flattened_annotation_data = []
1123
+
1124
+ # for annotation in all_annotations:
1125
+ # image_path = annotation["image"]
1126
+
1127
+ # if image_path:
1128
+ # match = re.search(r'_(\d+)\.png$', image_path)
1129
+ # if match:
1130
+ # number = match.group(1)
1131
+ # reported_number = int(number) + 1
1132
+ # else:
1133
+ # reported_number = 1
1134
+ # else:
1135
+ # reported_number = 1
1136
+
1137
+ # # Check if 'boxes' is in the annotation, if not, add an empty list
1138
+ # if 'boxes' not in annotation:
1139
+ # annotation['boxes'] = []
1140
+
1141
+ # # If boxes are empty, create a row with blank values for xmin, xmax, ymin, ymax
1142
+ # if not annotation["boxes"]:
1143
+ # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA}
1144
+ # flattened_annotation_data.append(data_to_add)
1145
+ # else:
1146
+ # for box in annotation["boxes"]:
1147
+ # if 'xmin' not in box:
1148
+ # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, 'xmax': pd.NA, 'ymin': pd.NA, 'ymax': pd.NA}
1149
+ # elif 'text' not in box:
1150
+ # data_to_add = {"image": image_path, "page": reported_number, **box}
1151
+ # else:
1152
+ # data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
1153
+ # flattened_annotation_data.append(data_to_add)
1154
+
1155
+ # # Convert to a DataFrame
1156
+ # review_file_df = pd.DataFrame(flattened_annotation_data)
1157
+
1158
+ # return review_file_df
1159
+
1160
+ # def create_annotation_dicts_from_annotation_df(all_image_annotations_df:pd.DataFrame, page_sizes:List[dict]):
1161
+ # '''
1162
+ # From an annotation object as a dataframe, convert back to a list of dictionaries that can be used in the Gradio Image Annotator component
1163
+ # '''
1164
+ # result = []
1165
+
1166
+ # # Ensure that every page has an entry in the resulting list of dicts
1167
+ # for image_path in page_sizes:
1168
+ # annotation = {}
1169
+ # annotation["image"] = image_path["image_path"]
1170
+ # annotation["boxes"] = []
1171
+
1172
+ # result.append(annotation)
1173
+
1174
+ # # Then add in all the filled in data
1175
+ # for image, group in all_image_annotations_df.groupby('image'):
1176
+ # boxes = group[['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']].to_dict(orient='records')
1177
+ # result.append({'image': image, 'boxes': boxes})
1178
 
1179
+ # return result
1180
 
1181
+ def create_annotation_dicts_from_annotation_df(
1182
+ all_image_annotations_df: pd.DataFrame,
1183
+ page_sizes: List[Dict[str, Any]]
1184
+ ) -> List[Dict[str, Any]]:
1185
+ '''
1186
+ Convert annotation DataFrame back to list of dicts using dictionary lookup.
1187
+ Ensures all images from page_sizes are present without duplicates.
1188
+ '''
1189
+ # 1. Create a dictionary keyed by image path for efficient lookup & update
1190
+ # Initialize with all images from page_sizes. Use .get for safety.
1191
+ image_dict: Dict[str, Dict[str, Any]] = {}
1192
+ for item in page_sizes:
1193
+ image_path = item.get("image_path")
1194
+ if image_path: # Only process if image_path exists and is not None/empty
1195
+ image_dict[image_path] = {"image": image_path, "boxes": []}
1196
+
1197
+ # Check if the DataFrame is empty or lacks necessary columns
1198
+ if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1199
+ print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1200
+ return list(image_dict.values()) # Return based on page_sizes only
1201
+
1202
+ # 2. Define columns to extract for boxes and check availability
1203
+ # Make sure these columns actually exist in the DataFrame
1204
+ box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
1205
+ available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1206
+
1207
+ if not available_cols:
1208
+ print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1209
+ return list(image_dict.values()) # Return based on page_sizes only
1210
+
1211
+ # 3. Group the DataFrame by image and update the dictionary
1212
+ # Drop rows where essential coordinates might be NA (adjust if NA is meaningful)
1213
+ coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
1214
+ valid_box_df = all_image_annotations_df.dropna(
1215
+ subset=[col for col in coord_cols if col in available_cols]
1216
+ ).copy() # Use .copy() to avoid SettingWithCopyWarning if modifying later
1217
+
1218
+
1219
+ # Check if any valid boxes remain after dropping NAs
1220
+ if valid_box_df.empty:
1221
+ print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
1222
+ return list(image_dict.values())
1223
+
1224
+
1225
+ # Process groups
1226
+ try:
1227
+ for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
1228
+ # Check if this image path exists in our target dictionary (from page_sizes)
1229
+ if image_path in image_dict:
1230
+ # Convert the relevant columns of the group to a list of dicts
1231
+ # Using only columns that are actually available
1232
+ boxes = group[available_cols].to_dict(orient='records')
1233
+ # Update the 'boxes' list in the dictionary
1234
+ image_dict[image_path]['boxes'] = boxes
1235
+ # Else: Image found in DataFrame but not required by page_sizes; ignore it.
1236
+ except KeyError:
1237
+ # This shouldn't happen due to the 'image' column check above, but handle defensively
1238
+ print("Error: Issue grouping DataFrame by 'image'.")
1239
+ return list(image_dict.values())
1240
+
1241
+
1242
+ # 4. Convert the dictionary values back into the final list format
1243
+ result = list(image_dict.values())
1244
+
1245
+ return result
1246
+
1247
+ # import pandas as pd
1248
+ # from typing import List, Dict, Any
1249
+
1250
+ # def create_annotation_dicts_from_annotation_df(
1251
+ # all_image_annotations_df: pd.DataFrame,
1252
+ # page_sizes: List[Dict[str, Any]]
1253
+ # ) -> List[Dict[str, Any]]:
1254
+ # '''
1255
+ # Convert annotation DataFrame back to list of dicts using Pandas merge.
1256
+ # Ensures all images from page_sizes are present without duplicates.
1257
+ # '''
1258
+ # # 1. Create a DataFrame containing all required image paths from page_sizes
1259
+ # if not page_sizes:
1260
+ # return []
1261
+ # all_image_paths = [item.get("image_path") for item in page_sizes if item.get("image_path")]
1262
+ # if not all_image_paths:
1263
+ # return []
1264
+ # # Use unique paths
1265
+ # pages_df = pd.DataFrame({'image': list(set(all_image_paths))})
1266
+
1267
+ # # Check if the DataFrame is empty or lacks necessary columns
1268
+ # if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1269
+ # print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1270
+ # # Add empty boxes column and return
1271
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1272
+ # return pages_df.to_dict(orient='records')
1273
+
1274
+ # # 2. Define columns to extract and check availability
1275
+ # box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
1276
+ # available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1277
+
1278
+ # if not available_cols:
1279
+ # print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1280
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1281
+ # return pages_df.to_dict(orient='records')
1282
+
1283
+ # # 3. Prepare the annotation data: drop invalid rows and aggregate boxes
1284
+ # coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
1285
+ # valid_box_df = all_image_annotations_df.dropna(
1286
+ # subset=[col for col in coord_cols if col in available_cols]
1287
+ # ).copy() # Use .copy()
1288
+
1289
+ # if valid_box_df.empty:
1290
+ # print("Warning: No valid annotation rows found after dropping NA coordinates.")
1291
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1292
+ # return pages_df.to_dict(orient='records')
1293
+
1294
+
1295
+ # # Aggregate boxes into lists of dictionaries per image
1296
+ # def aggregate_boxes(group):
1297
+ # return group[available_cols].to_dict(orient='records')
1298
+
1299
+ # # Group by image and apply the aggregation
1300
+ # grouped_boxes = valid_box_df.groupby('image', observed=True, sort=False).apply(aggregate_boxes).reset_index(name='boxes')
1301
+
1302
+ # # 4. Perform a left merge: keep all images from pages_df, add boxes where they exist
1303
+ # merged_df = pd.merge(pages_df, grouped_boxes, on='image', how='left')
1304
+
1305
+ # # 5. Fill NaN in 'boxes' column (for images with no annotations) with empty lists
1306
+ # # Ensure the column exists before trying to fillna
1307
+ # if 'boxes' in merged_df.columns:
1308
+ # # Use apply with a lambda for robust filling of NAs or potential None values
1309
+ # merged_df['boxes'] = merged_df['boxes'].apply(lambda x: [] if pd.isna(x) else x)
1310
+ # else:
1311
+ # # Should not happen with left merge, but handle defensively
1312
+ # merged_df['boxes'] = [[] for _ in range(len(merged_df))]
1313
+
1314
+
1315
+ # # 6. Convert the final DataFrame to the list of dictionaries format
1316
+ # result = merged_df.to_dict(orient='records')
1317
+
1318
+ # return result
1319
+
1320
+ def convert_annotation_json_to_review_df(all_annotations:List[dict],
1321
+ redaction_decision_output:pd.DataFrame=pd.DataFrame(),
1322
+ page_sizes:pd.DataFrame=pd.DataFrame(),
1323
+ do_proximity_match:bool=True) -> pd.DataFrame:
1324
+ '''
1325
+ Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (if option selected).
1326
+ '''
1327
+
1328
+ review_file_df = convert_annotation_data_to_dataframe(all_annotations)
1329
+
1330
+ if page_sizes:
1331
+ page_sizes_df = pd.DataFrame(page_sizes)
1332
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1333
+
1334
+ review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1335
 
1336
+ redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
1337
 
1338
+ # Join on additional text data from decision output results if included, if text not already there
1339
+ if not redaction_decision_output.empty and not review_file_df.empty and do_proximity_match == True:
1340
 
1341
+ # Match text to review file to match on text
1342
+ review_file_df = do_proximity_match_all_pages_for_text(df1 = review_file_df.copy(), df2 = redaction_decision_output.copy())
1343
 
1344
  # Ensure required columns exist, filling with blank if they don't
1345
+ check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
1346
+
1347
+ for col in check_columns:
1348
+ if col not in review_file_df.columns:
1349
+ review_file_df[col] = ''
1350
+
1351
+ if not review_file_df.empty:
1352
+ review_file_df = review_file_df[check_columns]
1353
+ else:
1354
+ review_file_df = pd.DataFrame(columns=check_columns)
1355
 
1356
+ # If colours are saved as list, convert to tuple
1357
+ review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1358
 
1359
+ review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
1360
 
1361
+ return review_file_df
1362
 
1363
+ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
1364
+ image_paths:List[Image.Image],
1365
+ page_sizes:List[dict]=[]) -> List[dict]:
1366
  '''
1367
+ Convert a review csv to a json file for use by the Gradio Annotation object.
1368
  '''
1369
+ # Make sure all relevant cols are float
1370
+ float_cols = ["page", "xmin", "xmax", "ymin", "ymax"]
1371
+ for col in float_cols:
1372
+ review_file_df.loc[:, col] = pd.to_numeric(review_file_df.loc[:, col], errors='coerce')
1373
+
1374
+ # Convert relative co-ordinates into image coordinates for the image annotation output object
1375
+ if page_sizes:
1376
+ page_sizes_df = pd.DataFrame(page_sizes)
1377
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1378
+
1379
+ review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1380
+
1381
  # Keep only necessary columns
1382
+ review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label"])
1383
+
1384
+ # If colours are saved as list, convert to tuple
1385
+ review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1386
 
1387
  # Group the DataFrame by the 'image' column
1388
  grouped_csv_pages = review_file_df.groupby('page')
 
1390
  # Create a list to hold the JSON data
1391
  json_data = []
1392
 
1393
+ for page_no, pdf_image_path in enumerate(page_sizes_df["image_path"]):
1394
+
1395
+ reported_page_number = int(page_no + 1)
1396
 
1397
  if reported_page_number in review_file_df["page"].values:
1398
 
1399
  # Convert each relevant group to a list of box dictionaries
1400
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
1401
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
1402
+
1403
  annotation = {
1404
  "image": pdf_image_path,
1405
  "boxes": annotation_boxes
 
1408
  else:
1409
  annotation = {}
1410
  annotation["image"] = pdf_image_path
1411
+ annotation["boxes"] = []
1412
 
1413
  # Append the structured data to the json_data list
1414
  json_data.append(annotation)
tools/file_redaction.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/find_duplicate_pages.py CHANGED
@@ -1,38 +1,34 @@
1
  import pandas as pd
2
- import argparse
3
- import glob
4
  import os
5
  import re
6
- from tools.helper_functions import output_folder
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- import nltk
10
- from nltk.corpus import stopwords
11
- from nltk.tokenize import word_tokenize
12
- from nltk.stem import PorterStemmer
 
13
  import numpy as np
14
  import random
15
  import string
16
  from typing import List
 
17
 
18
- nltk.download('punkt')
19
- nltk.download('stopwords')
20
- nltk.download('punkt_tab')
21
 
22
- similarity_threshold = 0.9
 
 
23
 
24
- stop_words = set(stopwords.words('english'))
25
- # List of words to remove from the stopword set
26
- #words_to_remove = ['no', 'nor', 'not', 'don', 'don't', 'wasn', 'wasn't', 'weren', 'weren't', "don't", "wasn't", "weren't"]
27
 
28
- # Remove the specified words from the stopwords set
29
- #for word in words_to_remove:
30
- # stop_words.discard(word.lower())
31
-
32
- stemmer = PorterStemmer()
33
- vectorizer = TfidfVectorizer()
34
 
35
- def combine_ocr_output_text(input_files):
36
  """
37
  Combines text from multiple CSV files containing page and text columns.
38
  Groups text by file and page number, concatenating text within these groups.
@@ -92,7 +88,7 @@ def combine_ocr_output_text(input_files):
92
 
93
  return combined_df, output_files
94
 
95
- def process_data(df, column:str):
96
  '''
97
  Clean and stem text columns in a data frame
98
  '''
@@ -100,118 +96,130 @@ def process_data(df, column:str):
100
  def _clean_text(raw_text):
101
  # Remove HTML tags
102
  clean = re.sub(r'<.*?>', '', raw_text)
103
- clean = re.sub(r'&nbsp;', ' ', clean)
104
- clean = re.sub(r'\r\n', ' ', clean)
105
- clean = re.sub(r'&lt;', ' ', clean)
106
- clean = re.sub(r'&gt;', ' ', clean)
107
- clean = re.sub(r'<strong>', ' ', clean)
108
- clean = re.sub(r'</strong>', ' ', clean)
109
 
110
  # Replace non-breaking space \xa0 with a space
111
- clean = clean.replace(u'\xa0', u' ')
112
  # Remove extra whitespace
113
  clean = ' '.join(clean.split())
114
 
115
- # Tokenize the text
116
- words = word_tokenize(clean.lower())
117
 
118
- # Remove punctuation and numbers
119
- words = [word for word in words if word.isalpha()]
120
 
121
- # Remove stopwords
122
- words = [word for word in words if word not in stop_words]
123
 
124
  # Join the cleaned words back into a string
125
- return ' '.join(words)
126
-
127
- # Function to apply stemming
128
- def _apply_stemming(text):
129
- # Tokenize the text
130
- words = word_tokenize(text.lower())
131
-
132
- # Apply stemming to each word
133
- stemmed_words = [stemmer.stem(word) for word in words]
134
-
135
- # Join the stemmed words back into a single string
136
- return ' '.join(stemmed_words)
137
-
138
-
139
-
140
-
141
  df['text_clean'] = df[column].apply(_clean_text)
142
- df['text_clean'] = df['text_clean'].apply(_apply_stemming)
 
143
 
144
  return df
145
 
146
- def identify_similar_pages(input_files:List[str]):
147
-
148
  output_paths = []
 
 
149
 
 
150
  df, output_files = combine_ocr_output_text(input_files)
151
-
152
  output_paths.extend(output_files)
 
153
 
154
- # Clean text
155
- df = process_data(df, 'text')
156
-
157
- # Vectorise text
158
  tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
159
 
160
- # Calculate cosine similarity
161
- similarity_matrix = cosine_similarity(tfidf_matrix)
162
 
163
- # Find the indices of the most similar pages
164
- np.fill_diagonal(similarity_matrix, 0) # Ignore self-comparisons
165
- similar_pages = np.argwhere(similarity_matrix > similarity_threshold) # Threshold of similarity
166
 
167
- #print(similar_pages)
 
 
168
 
169
- # Create a DataFrame for similar pairs and their scores
170
- similarity_df = pd.DataFrame({
171
- 'Page1_Index': similar_pages[:, 0],
172
- 'Page2_Index': similar_pages[:, 1],
173
- 'Page1_File': similar_pages[:, 0],
174
- 'Page2_File': similar_pages[:, 1],
175
- 'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
176
- })
177
 
178
- # Filter out duplicate pairs (keep only one direction)
 
 
 
179
  similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
180
 
181
- # Map the indices to their corresponding text and metadata
182
- similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
183
- similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])
 
 
 
 
 
 
184
 
185
- similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
186
- similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])
187
 
188
- similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
189
- similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])
 
190
 
 
 
 
 
 
 
 
 
 
 
 
191
  similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
192
  similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
193
 
194
- # Save detailed results to a CSV file
 
 
 
 
 
195
  similarity_file_output_path = output_folder + 'page_similarity_results.csv'
196
  similarity_df_out.to_csv(similarity_file_output_path, index=False)
197
-
198
  output_paths.append(similarity_file_output_path)
199
 
200
- if not similarity_df_out.empty:
201
- unique_files = similarity_df_out['Page2_File'].unique()
202
- for redact_file in unique_files:
203
- output_file_name = output_folder + redact_file + "_whole_page.csv"
204
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File']==redact_file,:][['Page2_Page']]
205
- whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)
206
-
207
- output_paths.append(output_file_name)
208
-
209
 
210
  return similarity_df_out, output_paths
211
 
212
  # Perturb text
213
  # Apply the perturbation function with a 10% error probability
214
- def perturb_text_with_errors(series):
215
 
216
  def _perturb_text(text, error_probability=0.1):
217
  words = text.split() # Split text into words
@@ -241,36 +249,3 @@ def perturb_text_with_errors(series):
241
  series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
242
 
243
  return series
244
-
245
- # Run through command line
246
- # def main():
247
- # parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
248
- # parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
249
- # parser.add_argument('--output', '-o', default='combined_text.csv',
250
- # help='Output CSV file path (default: combined_text.csv)')
251
-
252
- # args = parser.parse_args()
253
-
254
- # # Get list of input files
255
- # input_files = glob.glob(args.input_pattern)
256
-
257
- # if not input_files:
258
- # print(f"No files found matching pattern: {args.input_pattern}")
259
- # return
260
-
261
- # print(f"Processing {len(input_files)} files...")
262
-
263
- # try:
264
- # # Combine the text from all files
265
- # combined_df = combine_ocr_output_text(input_files)
266
-
267
- # # Save to CSV
268
- # combined_df.to_csv(args.output, index=False)
269
- # print(f"Successfully created combined output: {args.output}")
270
- # print(f"Total pages processed: {len(combined_df)}")
271
-
272
- # except Exception as e:
273
- # print(f"Error processing files: {str(e)}")
274
-
275
- # if __name__ == "__main__":
276
- # main()
 
1
  import pandas as pd
2
+ #import argparse
3
+ #import glob
4
  import os
5
  import re
6
+ from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
+ # import nltk
10
+ # from nltk.corpus import stopwords
11
+ # from nltk.tokenize import word_tokenize
12
+ # from nltk.stem import PorterStemmer
13
+ #import spacy
14
  import numpy as np
15
  import random
16
  import string
17
  from typing import List
18
+ from gradio import Progress
19
 
20
+ import en_core_web_lg #en_core_web_sm
21
+ nlp = en_core_web_lg.load()
22
+ #from tqdm import tqdm
23
 
24
+ # nltk.download('punkt')
25
+ # nltk.download('stopwords')
26
+ # nltk.download('punkt_tab')
27
 
28
+ similarity_threshold = 0.9
 
 
29
 
 
 
 
 
 
 
30
 
31
+ def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
33
  Combines text from multiple CSV files containing page and text columns.
34
  Groups text by file and page number, concatenating text within these groups.
 
88
 
89
  return combined_df, output_files
90
 
91
+ def process_data(df:pd.DataFrame, column:str):
92
  '''
93
  Clean and stem text columns in a data frame
94
  '''
 
96
  def _clean_text(raw_text):
97
  # Remove HTML tags
98
  clean = re.sub(r'<.*?>', '', raw_text)
99
+ # clean = re.sub(r'&nbsp;', ' ', clean)
100
+ # clean = re.sub(r'\r\n', ' ', clean)
101
+ # clean = re.sub(r'&lt;', ' ', clean)
102
+ # clean = re.sub(r'&gt;', ' ', clean)
103
+ # clean = re.sub(r'<strong>', ' ', clean)
104
+ # clean = re.sub(r'</strong>', ' ', clean)
105
 
106
  # Replace non-breaking space \xa0 with a space
107
+ # clean = clean.replace(u'\xa0', u' ')
108
  # Remove extra whitespace
109
  clean = ' '.join(clean.split())
110
 
111
+ # # Tokenize the text
112
+ # words = word_tokenize(clean.lower())
113
 
114
+ # # Remove punctuation and numbers
115
+ # words = [word for word in words if word.isalpha()]
116
 
117
+ # # Remove stopwords
118
+ # words = [word for word in words if word not in stop_words]
119
 
120
  # Join the cleaned words back into a string
121
+ return clean
122
+
123
+ # Function to apply lemmatization and remove stopwords
124
+ def _apply_lemmatization(text):
125
+ doc = nlp(text)
126
+ # Keep only alphabetic tokens and remove stopwords
127
+ lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
128
+ return ' '.join(lemmatized_words)
129
+
 
 
 
 
 
 
 
130
  df['text_clean'] = df[column].apply(_clean_text)
131
+
132
+ df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
133
 
134
  return df
135
 
136
+ def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
 
137
  output_paths = []
138
+
139
+ progress(0.1, desc="Cleaning input texts")
140
 
141
+ # Load and clean data
142
  df, output_files = combine_ocr_output_text(input_files)
 
143
  output_paths.extend(output_files)
144
+ df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
+ # Vectorize text
147
+ vectorizer = TfidfVectorizer()
 
 
148
  tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
+ progress(0.3, desc="Calculating text similarity")
 
151
 
152
+ # Compute sparse cosine similarity
153
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
 
154
 
155
+ # Extract indices of similar pages above threshold
156
+ coo_matrix = similarity_matrix.tocoo()
157
+ similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
 
159
+ if similar_pages.size == 0:
160
+ return pd.DataFrame(), output_paths # Return empty if no matches
161
+
162
+
 
 
 
 
163
 
164
+ # Create a DataFrame for similar pairs
165
+ similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
+
167
+ # Remove duplicate pairs (keep one direction)
168
  similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
+ progress(0.8, desc="Mapping back results")
171
+ # Map indices to metadata
172
+ # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
+ # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
+ # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
+ # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
+ # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
+ # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
+ # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
179
 
180
+ # Create a DataFrame with the metadata
181
+ metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
+ # Merge to get the metadata for Page1
184
+ similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
+ similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
186
 
187
+ # Merge to get the metadata for Page2
188
+ similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
+ similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
+
191
+ # Optionally, drop the index columns if not needed
192
+ #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
193
+
194
+
195
+ similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
196
+
197
+ # Sort results
198
  similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
  similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
+ similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
+ similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
203
+
204
+ progress(0.8, desc="Saving output files")
205
+
206
+ # Save results
207
  similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
  similarity_df_out.to_csv(similarity_file_output_path, index=False)
 
209
  output_paths.append(similarity_file_output_path)
210
 
211
+ # Save per-file redaction lists
212
+ for redact_file in similarity_df_out['Page2_File'].unique():
213
+ output_file_name = output_folder + redact_file + "_whole_page.csv"
214
+ whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
+ whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
+ output_paths.append(output_file_name)
 
 
 
217
 
218
  return similarity_df_out, output_paths
219
 
220
  # Perturb text
221
  # Apply the perturbation function with a 10% error probability
222
+ def perturb_text_with_errors(series:pd.Series):
223
 
224
  def _perturb_text(text, error_probability=0.1):
225
  words = text.split() # Split text into words
 
249
  series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
250
 
251
  return series
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/helper_functions.py CHANGED
@@ -7,47 +7,21 @@ import pandas as pd
7
  import numpy as np
8
  import unicodedata
9
  from typing import List
 
10
  from gradio_image_annotation import image_annotator
11
- from tools.auth import user_pool_id
12
-
13
-
14
- def get_or_create_env_var(var_name, default_value):
15
- # Get the environment variable if it exists
16
- value = os.environ.get(var_name)
17
-
18
- # If it doesn't exist, set it to the default value
19
- if value is None:
20
- os.environ[var_name] = default_value
21
- value = default_value
22
-
23
- return value
24
-
25
 
26
  # Names for options labels
27
  text_ocr_option = "Local model - selectable text"
28
  tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
29
  textract_option = "AWS Textract service - all PDF types"
30
 
 
31
  local_pii_detector = "Local"
32
  aws_pii_detector = "AWS Comprehend"
33
 
34
- output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
- print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
-
37
- input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
- print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
-
40
- # Retrieving or setting CUSTOM_HEADER
41
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
42
- print(f'CUSTOM_HEADER found')
43
-
44
- # Retrieving or setting CUSTOM_HEADER_VALUE
45
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
46
- print(f'CUSTOM_HEADER_VALUE found')
47
-
48
-
49
  def reset_state_vars():
50
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
51
  label="Modify redaction boxes",
52
  label_list=["Redaction"],
53
  label_colors=[(0, 0, 0)],
@@ -57,18 +31,39 @@ def reset_state_vars():
57
  show_share_button=False,
58
  show_remove_button=False,
59
  interactive=False
60
- ), [], [], [], pd.DataFrame(), pd.DataFrame()
61
-
62
- def reset_review_vars():
63
- return [], pd.DataFrame(), pd.DataFrame()
64
 
 
 
65
 
 
 
66
 
67
  def load_in_default_allow_list(allow_list_file_path):
68
  if isinstance(allow_list_file_path, str):
69
  allow_list_file_path = [allow_list_file_path]
70
  return allow_list_file_path
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def get_file_name_without_type(file_path):
74
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
@@ -113,24 +108,21 @@ def read_file(filename):
113
  elif file_type == 'parquet':
114
  return pd.read_parquet(filename)
115
 
116
- def ensure_output_folder_exists():
117
- """Checks if the 'output/' folder exists, creates it if not."""
118
-
119
- folder_name = "output/"
120
 
121
- if not os.path.exists(folder_name):
122
  # Create the folder if it doesn't exist
123
- os.makedirs(folder_name)
124
- print(f"Created the 'output/' folder.")
125
  else:
126
- print(f"The 'output/' folder already exists.")
127
 
128
- def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
129
  '''
130
  When file is loaded, update the column dropdown choices and write to relevant data states.
131
  '''
132
-
133
- custom_regex = pd.DataFrame()
134
 
135
  if in_file:
136
  file_list = [string.name for string in in_file]
@@ -138,22 +130,24 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
138
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
139
  if regex_file_names:
140
  regex_file_name = regex_file_names[0]
141
- custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
142
- #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
 
 
 
143
 
144
- custom_regex.columns = custom_regex.columns.astype(str)
145
 
146
  output_text = file_type + " file loaded."
147
-
148
  print(output_text)
149
  else:
150
  output_text = "No file provided."
151
  print(output_text)
152
- return output_text, custom_regex
153
 
154
- return output_text, custom_regex
155
 
156
- def put_columns_in_df(in_file):
157
  new_choices = []
158
  concat_choices = []
159
  all_sheet_names = []
@@ -197,6 +191,16 @@ def put_columns_in_df(in_file):
197
  else:
198
  return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
199
 
 
 
 
 
 
 
 
 
 
 
200
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
201
  def add_folder_to_path(folder_path: str):
202
  '''
@@ -223,7 +227,7 @@ def add_folder_to_path(folder_path: str):
223
  def reveal_feedback_buttons():
224
  return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
225
 
226
- def wipe_logs(feedback_logs_loc, usage_logs_loc):
227
  try:
228
  os.remove(feedback_logs_loc)
229
  except Exception as e:
@@ -233,7 +237,7 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
233
  except Exception as e:
234
  print("Could not remove usage logs file", e)
235
 
236
- def merge_csv_files(file_list):
237
 
238
  # Initialise an empty list to hold DataFrames
239
  dataframes = []
@@ -267,26 +271,9 @@ def merge_csv_files(file_list):
267
 
268
  return output_files
269
 
 
270
 
271
-
272
- async def get_connection_params(request: gr.Request):
273
- base_folder = ""
274
-
275
- #print("request user:", request.username)
276
-
277
- #request_data = await request.json() # Parse JSON body
278
- #print("All request data:", request_data)
279
- #context_value = request_data.get('context')
280
- #if 'context' in request_data:
281
- # print("Request context dictionary:", request_data['context'])
282
-
283
- # print("Request headers dictionary:", request.headers)
284
- # print("All host elements", request.client)
285
- # print("IP address:", request.client.host)
286
- # print("Query parameters:", dict(request.query_params))
287
- # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
288
- #print("Request dictionary to object:", request.request.body())
289
- print("Session hash:", request.session_hash)
290
 
291
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
292
  if CUSTOM_HEADER in request.headers:
@@ -304,23 +291,20 @@ async def get_connection_params(request: gr.Request):
304
 
305
  if request.username:
306
  out_session_hash = request.username
307
- base_folder = "user-files/"
308
- print("Request username found:", out_session_hash)
309
 
310
  elif 'x-cognito-id' in request.headers:
311
  out_session_hash = request.headers['x-cognito-id']
312
- base_folder = "user-files/"
313
- print("Cognito ID found:", out_session_hash)
314
 
315
  elif 'x-amzn-oidc-identity' in request.headers:
316
  out_session_hash = request.headers['x-amzn-oidc-identity']
317
- base_folder = "user-files/"
318
 
319
  # Fetch email address using Cognito client
320
  cognito_client = boto3.client('cognito-idp')
321
  try:
322
  response = cognito_client.admin_get_user(
323
- UserPoolId=user_pool_id, # Replace with your User Pool ID
324
  Username=out_session_hash
325
  )
326
  email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
@@ -331,23 +315,26 @@ async def get_connection_params(request: gr.Request):
331
  print("Error fetching user details:", e)
332
  email = None
333
 
334
-
335
  print("Cognito ID found:", out_session_hash)
336
 
337
  else:
338
  out_session_hash = request.session_hash
339
- base_folder = "temp-files/"
340
- # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
341
 
342
- output_folder = base_folder + out_session_hash + "/"
343
- #if bucket_name:
344
- # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
 
 
 
 
 
345
 
346
- return out_session_hash, output_folder, out_session_hash
347
-
348
 
349
- def clean_unicode_text(text):
350
- # Step 1: Normalize unicode characters to decompose any special forms
 
 
351
  normalized_text = unicodedata.normalize('NFKC', text)
352
 
353
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
@@ -365,4 +352,137 @@ def clean_unicode_text(text):
365
  # Comment this line if you want to keep all Unicode characters.
366
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
367
 
368
- return cleaned_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import numpy as np
8
  import unicodedata
9
  from typing import List
10
+ from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
16
  tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
17
  textract_option = "AWS Textract service - all PDF types"
18
 
19
+ no_redaction_option = "Only extract text (no redaction)"
20
  local_pii_detector = "Local"
21
  aws_pii_detector = "AWS Comprehend"
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def reset_state_vars():
24
+ return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
25
  label="Modify redaction boxes",
26
  label_list=["Redaction"],
27
  label_colors=[(0, 0, 0)],
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
 
 
 
35
 
36
+ def reset_ocr_results_state():
37
+ return pd.DataFrame(), pd.DataFrame(), []
38
 
39
+ def reset_review_vars():
40
+ return pd.DataFrame(), pd.DataFrame()
41
 
42
  def load_in_default_allow_list(allow_list_file_path):
43
  if isinstance(allow_list_file_path, str):
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
+ def load_in_default_cost_codes(cost_codes_path:str):
48
+ cost_codes_df = pd.read_csv(cost_codes_path)
49
+
50
+ dropdown_choices = cost_codes_df.iloc[:,0].to_list()
51
+ dropdown_choices.insert(0, "")
52
+
53
+
54
+ out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
+
56
+ return cost_codes_df, out_dropdown
57
+
58
+ def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59
+ if enforce_cost_code_textbox == "True":
60
+ if not cost_code_choice:
61
+ raise Exception("Please choose a cost code before continuing")
62
+ return
63
+
64
+ def update_dataframe(df:pd.DataFrame):
65
+ df_copy = df.copy()
66
+ return df_copy
67
 
68
  def get_file_name_without_type(file_path):
69
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
 
108
  elif file_type == 'parquet':
109
  return pd.read_parquet(filename)
110
 
111
+ def ensure_output_folder_exists(output_folder:str):
112
+ """Checks if the specified folder exists, creates it if not."""
 
 
113
 
114
+ if not os.path.exists(output_folder):
115
  # Create the folder if it doesn't exist
116
+ os.makedirs(output_folder)
117
+ print(f"Created the {output_folder} folder.")
118
  else:
119
+ print(f"The {output_folder} folder already exists.")
120
 
121
+ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
122
  '''
123
  When file is loaded, update the column dropdown choices and write to relevant data states.
124
  '''
125
+ custom_regex_df = pd.DataFrame()
 
126
 
127
  if in_file:
128
  file_list = [string.name for string in in_file]
 
130
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
131
  if regex_file_names:
132
  regex_file_name = regex_file_names[0]
133
+ custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
134
+
135
+ # Select just first columns
136
+ custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
137
+ custom_regex_df.rename(columns={0:file_type}, inplace=True)
138
 
139
+ custom_regex_df.columns = custom_regex_df.columns.astype(str)
140
 
141
  output_text = file_type + " file loaded."
 
142
  print(output_text)
143
  else:
144
  output_text = "No file provided."
145
  print(output_text)
146
+ return output_text, custom_regex_df
147
 
148
+ return output_text, custom_regex_df
149
 
150
+ def put_columns_in_df(in_file:List[str]):
151
  new_choices = []
152
  concat_choices = []
153
  all_sheet_names = []
 
191
  else:
192
  return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
193
 
194
+ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, output_folder:str=OUTPUT_FOLDER):
195
+ textract_output_path = os.path.join(output_folder, doc_file_name_no_extension_textbox + "_textract.json")
196
+
197
+ if os.path.exists(textract_output_path):
198
+ print("Existing Textract file found.")
199
+ return True
200
+
201
+ else:
202
+ return False
203
+
204
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
205
  def add_folder_to_path(folder_path: str):
206
  '''
 
227
  def reveal_feedback_buttons():
228
  return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
229
 
230
+ def wipe_logs(feedback_logs_loc:str, usage_logs_loc:str):
231
  try:
232
  os.remove(feedback_logs_loc)
233
  except Exception as e:
 
237
  except Exception as e:
238
  print("Could not remove usage logs file", e)
239
 
240
+ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
241
 
242
  # Initialise an empty list to hold DataFrames
243
  dataframes = []
 
271
 
272
  return output_files
273
 
274
+ async def get_connection_params(request: gr.Request, output_folder_textbox:str=OUTPUT_FOLDER, input_folder_textbox:str=INPUT_FOLDER, session_output_folder:str=SESSION_OUTPUT_FOLDER):
275
 
276
+ #print("Session hash:", request.session_hash)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
279
  if CUSTOM_HEADER in request.headers:
 
291
 
292
  if request.username:
293
  out_session_hash = request.username
294
+ #print("Request username found:", out_session_hash)
 
295
 
296
  elif 'x-cognito-id' in request.headers:
297
  out_session_hash = request.headers['x-cognito-id']
298
+ #print("Cognito ID found:", out_session_hash)
 
299
 
300
  elif 'x-amzn-oidc-identity' in request.headers:
301
  out_session_hash = request.headers['x-amzn-oidc-identity']
 
302
 
303
  # Fetch email address using Cognito client
304
  cognito_client = boto3.client('cognito-idp')
305
  try:
306
  response = cognito_client.admin_get_user(
307
+ UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
308
  Username=out_session_hash
309
  )
310
  email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
 
315
  print("Error fetching user details:", e)
316
  email = None
317
 
 
318
  print("Cognito ID found:", out_session_hash)
319
 
320
  else:
321
  out_session_hash = request.session_hash
 
 
322
 
323
+ if session_output_folder == 'True':
324
+ output_folder = output_folder_textbox + out_session_hash + "/"
325
+ input_folder = input_folder_textbox + out_session_hash + "/"
326
+ else:
327
+ output_folder = output_folder_textbox
328
+ input_folder = input_folder_textbox
329
+
330
+ if not os.path.exists(output_folder): os.mkdir(output_folder)
331
+ if not os.path.exists(input_folder): os.mkdir(input_folder)
332
 
 
 
333
 
334
+ return out_session_hash, output_folder, out_session_hash, input_folder
335
+
336
+ def clean_unicode_text(text:str):
337
+ # Step 1: Normalise unicode characters to decompose any special forms
338
  normalized_text = unicodedata.normalize('NFKC', text)
339
 
340
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
 
352
  # Comment this line if you want to keep all Unicode characters.
353
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
354
 
355
+ return cleaned_text
356
+
357
+ def load_all_output_files(folder_path:str=OUTPUT_FOLDER) -> List[str]:
358
+ """Get the file paths of all files in the given folder."""
359
+ file_paths = []
360
+
361
+ # List all files in the specified folder
362
+ for filename in os.listdir(folder_path):
363
+ # Construct full file path
364
+ full_path = os.path.join(folder_path, filename)
365
+ # Check if it's a file (not a directory)
366
+ if os.path.isfile(full_path):
367
+ file_paths.append(full_path)
368
+
369
+ return file_paths
370
+
371
+ def calculate_aws_costs(number_of_pages:str,
372
+ text_extract_method_radio:str,
373
+ handwrite_signature_checkbox:List[str],
374
+ pii_identification_method:str,
375
+ textract_output_found_checkbox:bool,
376
+ only_extract_text_radio:bool,
377
+ textract_page_cost:float=1.5/1000,
378
+ textract_signature_cost:float=2.0/1000,
379
+ comprehend_unit_cost:float=0.0001,
380
+ comprehend_size_unit_average:float=250,
381
+ average_characters_per_page:float=2000,
382
+ textract_option:str=textract_option,
383
+ no_redaction_option:str=no_redaction_option,
384
+ aws_pii_detector:str=aws_pii_detector):
385
+ '''
386
+ Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
387
+
388
+ - number_of_pages: The number of pages in the uploaded document(s).
389
+ - text_extract_method_radio: The method of text extraction.
390
+ - handwrite_signature_checkbox: Whether signatures are being extracted or not.
391
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
392
+ - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
394
+ - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395
+ - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396
+ - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
397
+ - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
398
+ - average_characters_per_page (float, optional): Average number of characters on an A4 page.
399
+ - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
400
+ - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
401
+ - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
402
+ '''
403
+ text_extraction_cost = 0
404
+ pii_identification_cost = 0
405
+ calculated_aws_cost = 0
406
+ number_of_pages = int(number_of_pages)
407
+
408
+ if textract_output_found_checkbox != True:
409
+ if text_extract_method_radio == textract_option:
410
+ text_extraction_cost = number_of_pages * textract_page_cost
411
+
412
+ if "Extract signatures" in handwrite_signature_checkbox:
413
+ text_extraction_cost += (textract_signature_cost * number_of_pages)
414
+
415
+ if pii_identification_method != no_redaction_option:
416
+ if pii_identification_method == aws_pii_detector:
417
+ comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
418
+ pii_identification_cost = comprehend_page_cost * number_of_pages
419
+
420
+ calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
+
422
+ return calculated_aws_cost
423
+
424
+ def calculate_time_taken(number_of_pages:str,
425
+ text_extract_method_radio:str,
426
+ pii_identification_method:str,
427
+ textract_output_found_checkbox:bool,
428
+ only_extract_text_radio:bool,
429
+ convert_page_time:float=0.5,
430
+ textract_page_time:float=1,
431
+ comprehend_page_time:float=1,
432
+ local_text_extraction_page_time:float=0.3,
433
+ local_pii_redaction_page_time:float=0.5,
434
+ local_ocr_extraction_page_time:float=1.5,
435
+ textract_option:str=textract_option,
436
+ text_ocr_option:str=text_ocr_option,
437
+ local_ocr_option:str=tesseract_ocr_option,
438
+ no_redaction_option:str=no_redaction_option,
439
+ aws_pii_detector:str=aws_pii_detector):
440
+ '''
441
+ Calculate the approximate time to redact a document.
442
+
443
+ - number_of_pages: The number of pages in the uploaded document(s).
444
+ - text_extract_method_radio: The method of text extraction.
445
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
446
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
447
+ - textract_page_time (float, optional): Approximate time to query AWS Textract.
448
+ - comprehend_page_time (float, optional): Approximate time to query text on a page with AWS Comprehend.
449
+ - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
450
+ - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
451
+ - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
452
+ - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
453
+ - text_ocr_option (str, optional): String label for text_extract_method_radio for text extraction.
454
+ - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
455
+ - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
456
+ - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
457
+ '''
458
+ calculated_time_taken = 0
459
+ page_conversion_time_taken = 0
460
+ page_extraction_time_taken = 0
461
+ page_redaction_time_taken = 0
462
+
463
+ number_of_pages = int(number_of_pages)
464
+
465
+ # Page preparation/conversion to image time
466
+ if (text_extract_method_radio != text_ocr_option) and (textract_output_found_checkbox != True):
467
+ page_conversion_time_taken = number_of_pages * convert_page_time
468
+
469
+ # Page text extraction time
470
+ if text_extract_method_radio == textract_option:
471
+ if textract_output_found_checkbox != True:
472
+ page_extraction_time_taken = number_of_pages * textract_page_time
473
+ elif text_extract_method_radio == local_ocr_option:
474
+ page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
475
+ elif text_extract_method_radio == text_ocr_option:
476
+ page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
477
+
478
+ # Page redaction time
479
+ if pii_identification_method != no_redaction_option:
480
+ if pii_identification_method == aws_pii_detector:
481
+ page_redaction_time_taken = number_of_pages * comprehend_page_time
482
+ else:
483
+ page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
484
+
485
+ calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
486
+
487
+ return calculated_time_taken
488
+
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,4 +1,3 @@
1
- # %%
2
  from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
@@ -11,14 +10,14 @@ import Levenshtein
11
  import re
12
  import gradio as gr
13
 
14
- model_name = "en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
16
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
17
 
18
  #Load spacy model
19
  try:
20
- import en_core_web_sm
21
- nlp = en_core_web_sm.load()
22
  print("Successfully imported spaCy model")
23
 
24
  except:
 
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 
10
  import re
11
  import gradio as gr
12
 
13
+ model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
14
  score_threshold = 0.001
15
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
16
 
17
  #Load spacy model
18
  try:
19
+ import en_core_web_lg #en_core_web_sm
20
+ nlp = en_core_web_lg.load() #en_core_web_sm.load()
21
  print("Successfully imported spaCy model")
22
 
23
  except:
tools/presidio_analyzer_custom.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio as gr
2
  from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
- from tqdm import tqdm
4
 
5
- from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
 
1
  import gradio as gr
2
  from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+ #from tqdm import tqdm
4
 
5
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
tools/redaction_review.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
@@ -7,37 +9,36 @@ import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
- from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
- from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
- from tools.file_redaction import redact_page_with_pymupdf
13
- import json
14
- import os
15
  import pymupdf
16
- from fitz import Document
17
  from PIL import ImageDraw, Image
18
- from collections import defaultdict
19
 
20
- Image.MAX_IMAGE_PIXELS = None
 
 
 
 
 
21
 
22
  def decrease_page(number:int):
23
  '''
24
  Decrease page number for review redactions page.
25
  '''
26
- #print("number:", str(number))
27
  if number > 1:
28
  return number - 1, number - 1
29
  else:
30
  return 1, 1
31
 
32
- def increase_page(number:int, image_annotator_object:AnnotatedImageData):
33
  '''
34
  Increase page number for review redactions page.
35
  '''
36
 
37
- if not image_annotator_object:
38
  return 1, 1
39
 
40
- max_pages = len(image_annotator_object)
41
 
42
  if number < max_pages:
43
  return number + 1, number + 1
@@ -54,137 +55,294 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
54
 
55
  return current_zoom_level, annotate_current_page
56
 
57
- def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
 
58
  '''
59
- Remove items from the annotator object where the same page exists twice.
60
  '''
61
- # Group items by 'image'
62
- image_groups = defaultdict(list)
63
- for item in data:
64
- image_groups[item['image']].append(item)
65
-
66
- # Process each group to prioritize items with non-empty boxes
67
- result = []
68
- for image, items in image_groups.items():
69
- # Filter items with non-empty boxes
70
- non_empty_boxes = [item for item in items if item.get('boxes')]
71
-
72
- # Remove 'text' elements from boxes
73
- for item in non_empty_boxes:
74
- if 'boxes' in item:
75
- item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
76
-
77
- if non_empty_boxes:
78
- # Keep the first entry with non-empty boxes
79
- result.append(non_empty_boxes[0])
80
  else:
81
- # If all items have empty or missing boxes, keep the first item
82
- result.append(items[0])
83
-
84
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr):
87
  recogniser_entities_list = ["Redaction"]
88
- recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
89
- recogniser_dataframe_out = recogniser_dataframe_gr
 
90
 
91
  try:
92
- review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
93
- recogniser_entities = review_dataframe["label"].unique().tolist()
94
- recogniser_entities.append("ALL")
95
- recogniser_entities_for_drop = sorted(recogniser_entities)
 
 
 
 
96
 
 
 
97
 
98
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
99
- recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
100
 
101
- recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
102
- recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
 
103
 
104
  except Exception as e:
105
  print("Could not extract recogniser information:", e)
106
- recogniser_dataframe_out = recogniser_dataframe_gr
107
- recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
 
 
 
 
 
108
  recogniser_entities_list = ["Redaction"]
 
 
109
 
110
- return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list
111
 
112
- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
 
 
113
  '''
114
- Update a gradio_image_annotation object with new annotation data
115
- '''
116
  recogniser_entities_list = ["Redaction"]
117
  recogniser_dataframe_out = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- if recogniser_dataframe_gr.empty:
120
- recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
121
- elif recogniser_dataframe_gr.iloc[0,0] == "":
122
- recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
123
- else:
124
- review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
125
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
126
- recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
127
 
128
- recogniser_entities_list = sorted(recogniser_entities_list)
129
- recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
130
- recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
 
 
131
 
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  zoom_str = str(zoom) + '%'
134
- recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
135
 
136
- if not image_annotator_object:
 
 
 
 
137
  page_num_reported = 1
138
 
139
- out_image_annotator = image_annotator(
140
- None,
141
- boxes_alpha=0.1,
142
- box_thickness=1,
143
- label_list=recogniser_entities_list,
144
- label_colors=recogniser_colour_list,
145
- show_label=False,
146
- height=zoom_str,
147
- width=zoom_str,
148
- box_min_size=1,
149
- box_selected_thickness=2,
150
- handle_size=4,
151
- sources=None,#["upload"],
152
- show_clear_button=False,
153
- show_share_button=False,
154
- show_remove_button=False,
155
- handles_cursor=True,
156
- interactive=True
157
- )
158
- number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
159
 
160
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
 
 
 
161
 
162
- #print("page_num at start of update_annotator function:", page_num)
163
 
164
- if page_num is None:
165
- page_num = 0
166
 
167
- # Check bounding values for current page and page max
168
- if page_num > 0:
169
- page_num_reported = page_num
170
 
171
- elif page_num == 0: page_num_reported = 1
172
 
173
- else:
174
- page_num = 0
175
- page_num_reported = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- page_max_reported = len(image_annotator_object)
178
 
179
- if page_num_reported > page_max_reported:
180
- page_num_reported = page_max_reported
181
 
182
- image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
 
183
 
 
184
 
 
 
 
 
 
 
 
185
 
186
- out_image_annotator = image_annotator(
187
- value = image_annotator_object[page_num_reported - 1],
188
  boxes_alpha=0.1,
189
  box_thickness=1,
190
  label_list=recogniser_entities_list,
@@ -201,77 +359,125 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
201
  show_remove_button=False,
202
  handles_cursor=True,
203
  interactive=True
204
- )
205
 
206
- number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
 
209
 
210
- def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True),recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
211
- '''
212
- Overwrite current image annotations with modifications
213
- '''
214
 
215
- if not current_page:
216
- current_page = 1
 
 
 
217
 
218
- #If no previous page or is 0, i.e. first time run, then rewrite current page
219
- #if not previous_page:
220
- # previous_page = current_page
221
 
222
- #print("image_annotated:", image_annotated)
223
-
224
- image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
 
225
 
226
- if clear_all == False:
227
- all_image_annotations[previous_page - 1] = image_annotated
228
- else:
229
- all_image_annotations[previous_page - 1]["boxes"] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- #print("all_image_annotations:", all_image_annotations)
 
 
232
 
233
- # Rewrite all_image_annotations search dataframe with latest updates
234
- try:
235
- review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
236
- #print("review_dataframe['label']", review_dataframe["label"])
237
- recogniser_entities = review_dataframe["label"].unique().tolist()
238
- recogniser_entities.append("ALL")
239
- recogniser_entities = sorted(recogniser_entities)
240
-
241
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
242
- #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
243
- recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
244
- except Exception as e:
245
- print("Could not extract recogniser information:", e)
246
- recogniser_dataframe_out = recogniser_dataframe
247
 
248
- return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
 
249
 
250
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
251
  '''
252
  Apply modified redactions to a pymupdf and export review files
253
  '''
254
- #print("all_image_annotations:", all_image_annotations)
255
 
256
  output_files = []
257
  output_log_files = []
258
  pdf_doc = []
 
259
 
260
- #print("File paths in apply_redactions:", file_paths)
261
-
262
- image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
263
 
264
- all_image_annotations[current_page - 1] = image_annotated
 
 
265
 
266
- if not image_annotated:
267
- print("No image annotations found")
268
- return doc, all_image_annotations
269
 
270
  if isinstance(file_paths, str):
271
  file_paths = [file_paths]
272
 
273
  for file_path in file_paths:
274
- #print("file_path:", file_path)
275
  file_name_without_ext = get_file_name_without_type(file_path)
276
  file_name_with_ext = os.path.basename(file_path)
277
 
@@ -282,11 +488,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
282
  if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
283
  image = Image.open(file_paths[-1])
284
 
285
- #image = pdf_doc
286
-
287
  draw = ImageDraw.Draw(image)
288
 
289
- for img_annotation_box in image_annotated['boxes']:
290
  coords = [img_annotation_box["xmin"],
291
  img_annotation_box["ymin"],
292
  img_annotation_box["xmax"],
@@ -294,6 +498,25 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
294
 
295
  fill = img_annotation_box["color"]
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  draw.rectangle(coords, fill=fill)
298
 
299
  output_image_path = output_folder + file_name_without_ext + "_redacted.png"
@@ -301,12 +524,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
301
 
302
  output_files.append(output_image_path)
303
 
304
- print("Redactions saved to image file")
305
-
306
  doc = [image]
307
 
308
  elif file_extension in '.csv':
309
- print("This is a csv")
310
  pdf_doc = []
311
 
312
  # If working with pdfs
@@ -317,38 +538,40 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
317
  output_files.append(orig_pdf_file_path)
318
 
319
  number_of_pages = pdf_doc.page_count
 
320
 
321
- print("Saving pages to file.")
 
322
 
323
  for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
324
-
325
- #print("Saving page", str(i))
326
-
327
  image_loc = all_image_annotations[i]['image']
328
- #print("Image location:", image_loc)
329
 
330
  # Load in image object
331
  if isinstance(image_loc, np.ndarray):
332
  image = Image.fromarray(image_loc.astype('uint8'))
333
- #all_image_annotations[i]['image'] = image_loc.tolist()
334
  elif isinstance(image_loc, Image.Image):
335
  image = image_loc
336
- #image_out_folder = output_folder + file_name_without_ext + "_page_" + str(i) + ".png"
337
- #image_loc.save(image_out_folder)
338
- #all_image_annotations[i]['image'] = image_out_folder
339
  elif isinstance(image_loc, str):
340
- image = Image.open(image_loc)
 
 
 
 
 
341
 
342
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
343
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
 
344
 
 
345
  else:
346
  print("File type not recognised.")
347
 
348
  #try:
349
  if pdf_doc:
350
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
351
- pdf_doc.save(out_pdf_file_path)
352
  output_files.append(out_pdf_file_path)
353
 
354
  else:
@@ -361,40 +584,238 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
361
  output_files.append(orig_pdf_file_path)
362
 
363
  try:
364
- #print("Saving annotations to JSON")
365
-
366
- out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
367
- with open(out_annotation_file_path, 'w') as f:
368
- json.dump(all_image_annotations, f)
369
- output_log_files.append(out_annotation_file_path)
370
-
371
- #print("Saving annotations to CSV review file")
372
-
373
- #print("review_file_state:", review_file_state)
374
-
375
- # Convert json to csv and also save this
376
- review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
377
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
 
378
  review_df.to_csv(out_review_file_file_path, index=None)
379
  output_files.append(out_review_file_file_path)
380
 
381
  except Exception as e:
382
- print("Could not save annotations to json or csv file:", e)
383
 
384
- return doc, all_image_annotations, output_files, output_log_files
385
 
386
  def get_boxes_json(annotations:AnnotatedImageData):
387
  return annotations["boxes"]
388
 
389
- def update_entities_df(choice:str, df:pd.DataFrame):
390
- if choice=="ALL":
391
- return df
392
- else:
393
- return df.loc[df["label"]==choice,:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
 
396
  row_value_page = evt.row_value[0] # This is the page number value
397
- return row_value_page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
400
  '''
@@ -406,10 +827,13 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
406
  - image_width: Width of the source image
407
  - image_height: Height of the source image
408
  - x1, y1, x2, y2: Coordinates in image space
 
409
 
410
  Returns:
411
  - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
412
  '''
 
 
413
 
414
  # Calculate scaling factors
415
  scale_width = pdf_page_width / image_width
@@ -430,12 +854,30 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
430
 
431
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
432
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
 
 
 
 
 
 
435
  '''
436
  Create an xfdf file from a review csv file and a pdf
437
  '''
438
-
 
439
  # Create root element
440
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
441
 
@@ -445,23 +887,93 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
445
 
446
  # Add annots
447
  annots = SubElement(xfdf, 'annots')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
- for _, row in df.iterrows():
 
 
450
  page_python_format = int(row["page"])-1
451
 
452
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
453
 
454
- pdf_page_height = pymupdf_page.rect.height
455
- pdf_page_width = pymupdf_page.rect.width
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
- image = image_paths[page_python_format]
 
458
 
459
- #print("image:", image)
 
460
 
461
- if isinstance(image, str):
462
- image = Image.open(image)
463
 
464
- image_page_width, image_page_height = image.size
 
 
 
 
 
 
 
 
 
 
465
 
466
  # Create redaction annotation
467
  redact_annot = SubElement(annots, 'redact')
@@ -473,17 +985,23 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
473
  # Set page number (subtract 1 as PDF pages are 0-based)
474
  redact_annot.set('page', str(int(row['page']) - 1))
475
 
476
- # Convert coordinates
477
- x1, y1, x2, y2 = convert_image_coords_to_adobe(
478
- pdf_page_width,
479
- pdf_page_height,
480
- image_page_width,
481
- image_page_height,
482
- row['xmin'],
 
 
 
 
 
 
 
483
  row['ymin'],
484
  row['xmax'],
485
- row['ymax']
486
- )
487
 
488
  if CUSTOM_BOX_COLOUR == "grey":
489
  colour_str = "0.5,0.5,0.5"
@@ -535,12 +1053,13 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
535
 
536
  return xml_str
537
 
538
- def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
542
  output_paths = []
543
  pdf_name = ""
 
544
 
545
  if isinstance(input_files, str):
546
  file_paths_list = [input_files]
@@ -557,36 +1076,36 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
557
  else:
558
  file_path = file.name
559
 
560
- file_path_name = get_file_name_without_type(file_path)
561
- file_path_end = detect_file_type(file_path)
562
 
563
- if file_path_end == "pdf":
564
- pdf_name = os.path.basename(file_path)
565
 
566
- if file_path_end == "csv":
567
- # If no pdf name, just get the name of the file path
568
- if not pdf_name:
569
- pdf_name = file_path_name
570
- # Read CSV file
571
- df = pd.read_csv(file_path)
572
 
573
- df.fillna('', inplace=True) # Replace NaN with an empty string
574
 
575
- xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
576
 
577
- output_path = output_folder + file_path_name + "_adobe.xfdf"
578
-
579
- with open(output_path, 'w', encoding='utf-8') as f:
580
- f.write(xfdf_content)
581
 
582
- output_paths.append(output_path)
583
 
584
  return output_paths
585
 
586
 
587
  ### Convert xfdf coordinates back to image for app
588
 
589
- def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
590
  '''
591
  Converts coordinates from Adobe PDF space to image space.
592
 
@@ -620,7 +1139,7 @@ def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width,
620
 
621
  return image_x1, image_y1, image_x2, image_y2
622
 
623
- def parse_xfdf(xfdf_path):
624
  '''
625
  Parse the XFDF file and extract redaction annotations.
626
 
@@ -641,8 +1160,6 @@ def parse_xfdf(xfdf_path):
641
  # Find all redact elements using the namespace
642
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
643
 
644
- #print("redact:", redact)
645
-
646
  redaction_info = {
647
  'image': '', # Image will be filled in later
648
  'page': int(redact.get('page')) + 1, # Convert to 1-based index
@@ -655,12 +1172,10 @@ def parse_xfdf(xfdf_path):
655
  'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
656
  }
657
  redactions.append(redaction_info)
658
-
659
- print("redactions:", redactions)
660
 
661
  return redactions
662
 
663
- def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
664
  '''
665
  Convert redaction annotations from XFDF and associated images into a DataFrame.
666
 
@@ -676,8 +1191,6 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
676
  xfdf_paths = []
677
  df = pd.DataFrame()
678
 
679
- #print("Image paths:", image_paths)
680
-
681
  # Sort the file paths so that the pdfs come first
682
  file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
683
 
@@ -693,7 +1206,6 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
693
 
694
  if file_path_end == "pdf":
695
  pdf_name = os.path.basename(file_path)
696
- #print("pymupdf_doc:", pymupdf_doc)
697
 
698
  # Add pdf to outputs
699
  output_paths.append(file_path)
@@ -704,18 +1216,10 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
704
  message = "Original PDF needed to convert from .xfdf format"
705
  print(message)
706
  raise ValueError(message)
707
-
708
  xfdf_path = file
709
 
710
- # if isinstance(xfdf_paths, str):
711
- # xfdf_path = xfdf_paths.name
712
- # else:
713
- # xfdf_path = xfdf_paths[0].name
714
-
715
  file_path_name = get_file_name_without_type(xfdf_path)
716
 
717
- #print("file_path_name:", file_path_name)
718
-
719
  # Parse the XFDF file
720
  redactions = parse_xfdf(xfdf_path)
721
 
@@ -734,8 +1238,6 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
734
 
735
  image_path = image_paths[page_python_format]
736
 
737
- #print("image_path:", image_path)
738
-
739
  if isinstance(image_path, str):
740
  image = Image.open(image_path)
741
 
@@ -747,7 +1249,6 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
747
  df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
748
 
749
  # Optionally, you can add the image path or other relevant information
750
- #print("Image path:", image_path)
751
  df.loc[_, 'image'] = image_path
752
 
753
  #print('row:', row)
 
1
+ import os
2
+ import re
3
  import gradio as gr
4
  import pandas as pd
5
  import numpy as np
 
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
+ from pymupdf import Document, Rect
 
 
 
 
13
  import pymupdf
14
+ #from fitz
15
  from PIL import ImageDraw, Image
 
16
 
17
+ from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
18
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
19
+ from tools.helper_functions import get_file_name_without_type, detect_file_type
20
+ from tools.file_redaction import redact_page_with_pymupdf
21
+
22
+ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
23
 
24
  def decrease_page(number:int):
25
  '''
26
  Decrease page number for review redactions page.
27
  '''
 
28
  if number > 1:
29
  return number - 1, number - 1
30
  else:
31
  return 1, 1
32
 
33
+ def increase_page(number:int, page_image_annotator_object:AnnotatedImageData):
34
  '''
35
  Increase page number for review redactions page.
36
  '''
37
 
38
+ if not page_image_annotator_object:
39
  return 1, 1
40
 
41
+ max_pages = len(page_image_annotator_object)
42
 
43
  if number < max_pages:
44
  return number + 1, number + 1
 
55
 
56
  return current_zoom_level, annotate_current_page
57
 
58
+
59
+ def update_dropdown_list_based_on_dataframe(df:pd.DataFrame, column:str) -> List["str"]:
60
  '''
61
+ Gather unique elements from a string pandas Series, then append 'ALL' to the start and return the list.
62
  '''
63
+ if isinstance(df, pd.DataFrame):
64
+ # Check if the Series is empty or all NaN
65
+ if column not in df.columns or df[column].empty or df[column].isna().all():
66
+ return ["ALL"]
67
+ elif column != "page":
68
+ entities = df[column].astype(str).unique().tolist()
69
+ entities_for_drop = sorted(entities)
70
+ entities_for_drop.insert(0, "ALL")
 
 
 
 
 
 
 
 
 
 
 
71
  else:
72
+ # Ensure the column can be converted to int - assumes it is the page column
73
+ try:
74
+ entities = df[column].astype(int).unique()
75
+ entities_for_drop = sorted(entities)
76
+ entities_for_drop = [str(e) for e in entities_for_drop] # Convert back to string
77
+ entities_for_drop.insert(0, "ALL")
78
+ except ValueError:
79
+ return ["ALL"] # Handle case where conversion fails
80
+
81
+ return entities_for_drop # Ensure to return the list
82
+ else:
83
+ return ["ALL"]
84
+
85
+ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:AnnotatedImageData,
86
+ recogniser_dataframe_base:pd.DataFrame,
87
+ recogniser_dropdown_value:str,
88
+ text_dropdown_value:str,
89
+ page_dropdown_value:str,
90
+ review_df:pd.DataFrame=[],
91
+ page_sizes:List[str]=[]):
92
+ '''
93
+ Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
94
+ '''
95
 
 
96
  recogniser_entities_list = ["Redaction"]
97
+ recogniser_dataframe_out = recogniser_dataframe_base
98
+ recogniser_dataframe_out_gr = gr.Dataframe()
99
+ review_dataframe = review_df
100
 
101
  try:
102
+ review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
103
+
104
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
105
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
106
+
107
+ # This is the choice list for entities when creating a new redaction box
108
+ recogniser_entities_list = [entity for entity in recogniser_entities_for_drop.copy() if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
109
+ recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
110
 
111
+ text_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "text")
112
+ text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=text_entities_for_drop, allow_custom_value=True, interactive=True)
113
 
114
+ page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
115
+ page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
116
 
117
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
118
+
119
+ recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
120
 
121
  except Exception as e:
122
  print("Could not extract recogniser information:", e)
123
+ recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text"]]
124
+
125
+ label_choices = review_dataframe["label"].astype(str).unique().tolist()
126
+ text_choices = review_dataframe["text"].astype(str).unique().tolist()
127
+ page_choices = review_dataframe["page"].astype(str).unique().tolist()
128
+
129
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=label_choices, allow_custom_value=True, interactive=True)
130
  recogniser_entities_list = ["Redaction"]
131
+ text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=text_choices, allow_custom_value=True, interactive=True)
132
+ page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_choices, allow_custom_value=True, interactive=True)
133
 
134
+ return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
135
 
136
+ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
137
+ '''
138
+ Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
139
  '''
 
 
140
  recogniser_entities_list = ["Redaction"]
141
  recogniser_dataframe_out = pd.DataFrame()
142
+ recogniser_dataframe_out_gr = gr.Dataframe()
143
+
144
+ # If base recogniser dataframe is empy, need to create it.
145
+ if recogniser_dataframe_base.empty:
146
+ recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
147
+ elif recogniser_dataframe_base.iloc[0,0] == "":
148
+ recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
149
+ else:
150
+ recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_dropdown, recogniser_entities_list, text_dropdown, page_dropdown = get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
151
+
152
+ review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
153
+
154
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
155
+
156
+ recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
157
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
158
+
159
+ recogniser_entities_list_base = recogniser_dataframe_out["label"].astype(str).unique().tolist()
160
+
161
+ # Recogniser entities list is the list of choices that appear when you make a new redaction box
162
+ recogniser_entities_list = [entity for entity in recogniser_entities_list_base if entity != 'Redaction']
163
+ recogniser_entities_list.insert(0, 'Redaction')
164
+
165
+ return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
166
+
167
+ def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
168
+ return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
169
+
170
+ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
171
+ image_file_paths:List[str],
172
+ page_sizes:List[dict],
173
+ current_page:int,
174
+ previous_page:int,
175
+ current_image_annotations_state:List[str],
176
+ current_page_annotator:object):
177
+ '''
178
+ Update the visible annotation object with the latest review file information
179
+ '''
180
+ out_image_annotations_state = current_image_annotations_state
181
+ out_current_page_annotator = current_page_annotator
182
+
183
+ print("page_sizes:", page_sizes)
184
+
185
+ review_df.to_csv(OUTPUT_FOLDER + "review_df_in_update_annotator.csv")
186
+
187
+ if not review_df.empty:
188
+
189
+ out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
190
+
191
+ print("out_image_annotations_state[current_page-1]:", out_image_annotations_state[current_page-1])
192
+
193
+ if previous_page == current_page:
194
+ out_current_page_annotator = out_image_annotations_state[current_page-1]
195
+
196
+ return out_current_page_annotator, out_image_annotations_state
197
+
198
+
199
+
200
+
201
+ def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
202
+ selected_rows_df: pd.DataFrame,
203
+ image_file_paths:List[str],
204
+ page_sizes:List[dict],
205
+ image_annotations_state:dict,
206
+ recogniser_entity_dataframe_base:pd.DataFrame):
207
+ '''
208
+ Remove selected items from the review dataframe from the annotation object and review dataframe.
209
+ '''
210
+
211
+ backup_review_state = review_df
212
+ backup_image_annotations_state = image_annotations_state
213
+ backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
214
 
215
+ if not selected_rows_df.empty and not review_df.empty:
216
+ # Ensure selected_rows_df has the same relevant columns
217
+ selected_subset = selected_rows_df[['label', 'page', 'text']].drop_duplicates(subset=['label', 'page', 'text'])
 
 
 
 
 
218
 
219
+ # Perform anti-join using merge with an indicator column
220
+ merged_df = review_df.merge(selected_subset, on=['label', 'page', 'text'], how='left', indicator=True)
221
+
222
+ # Keep only the rows that do not have a match in selected_rows_df
223
+ out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
224
 
225
+ out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
226
 
227
+ out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
228
+
229
+ # Either there is nothing left in the selection dataframe, or the review dataframe
230
+ else:
231
+ out_review_df = review_df
232
+ out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
233
+
234
+ out_image_annotations_state = image_annotations_state
235
+
236
+ return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
237
+
238
+ def update_annotator_object_and_filter_df(
239
+ all_image_annotations:List[AnnotatedImageData],
240
+ gradio_annotator_current_page_number:int,
241
+ recogniser_entities_dropdown_value:str="ALL",
242
+ page_dropdown_value:str="ALL",
243
+ text_dropdown_value:str="ALL",
244
+ recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True),
245
+ zoom:int=100,
246
+ review_df:pd.DataFrame=[],
247
+ page_sizes:List[dict]=[],
248
+ doc_full_file_name_textbox:str='',
249
+ input_folder:str=INPUT_FOLDER):
250
+ '''
251
+ Update a gradio_image_annotation object with new annotation data.
252
+ '''
253
  zoom_str = str(zoom) + '%'
254
+
255
+ if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
256
 
257
+ # Check bounding values for current page and page max
258
+ if gradio_annotator_current_page_number > 0: page_num_reported = gradio_annotator_current_page_number
259
+ elif gradio_annotator_current_page_number == 0: page_num_reported = 1 # minimum possible reported page is 1
260
+ else:
261
+ gradio_annotator_current_page_number = 0
262
  page_num_reported = 1
263
 
264
+ # Ensure page displayed can't exceed number of pages in document
265
+ page_max_reported = len(all_image_annotations)
266
+ if page_num_reported > page_max_reported: page_num_reported = page_max_reported
267
+
268
+ page_num_reported_zero_indexed = page_num_reported - 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
+ # First, check that the image on the current page is valid, replace with what exists in page_sizes object if not
271
+ page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(all_image_annotations, all_image_annotations[page_num_reported_zero_indexed], page_sizes, page_num_reported)
272
+
273
+ all_image_annotations[page_num_reported_zero_indexed] = page_image_annotator_object
274
 
275
+ current_image_path = all_image_annotations[page_num_reported_zero_indexed]['image']
276
 
277
+ # If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.
278
+ page_sizes_df = pd.DataFrame(page_sizes)
279
 
280
+ if not os.path.exists(current_image_path):
 
 
281
 
282
+ page_num, replaced_image_path, width, height = process_single_page_for_image_conversion(doc_full_file_name_textbox, page_num_reported_zero_indexed, input_folder=input_folder)
283
 
284
+ # Overwrite page_sizes values
285
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"] = width
286
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"] = height
287
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_path"] = replaced_image_path
288
+
289
+ else:
290
+ if not page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].isnull().all():
291
+ width = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].max()
292
+ height = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"].max()
293
+ else:
294
+ image = Image.open(current_image_path)
295
+ width = image.width
296
+ height = image.height
297
+
298
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"] = width
299
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"] = height
300
+
301
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_path"] = current_image_path
302
+
303
+ replaced_image_path = current_image_path
304
+
305
+ if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"])
306
+
307
+ ##
308
+
309
+ review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
310
+
311
+ # Update dropdowns and review selection dataframe with the updated annotator object
312
+ recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_entities_drop, page_entities_drop = update_recogniser_dataframes(all_image_annotations, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df.copy(), page_sizes)
313
+
314
+ recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
315
+
316
+ # page_sizes_df has been changed - save back to page_sizes_object
317
+ page_sizes = page_sizes_df.to_dict(orient='records')
318
+
319
+ images_list = list(page_sizes_df["image_path"])
320
+ images_list[page_num_reported_zero_indexed] = replaced_image_path
321
+
322
+ all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
323
+
324
+ # Multiply out image_annotation coordinates from relative to absolute if necessary
325
+ all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
326
 
327
+ all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
328
 
329
+ all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
 
330
 
331
+ # Remove blank duplicate entries
332
+ all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
333
 
334
+ current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
335
 
336
+ page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
337
+
338
+ ###
339
+ # If no data, present a blank page
340
+ if not all_image_annotations:
341
+ print("No all_image_annotation object found")
342
+ page_num_reported = 1
343
 
344
+ out_image_annotator = image_annotator(
345
+ value = None,
346
  boxes_alpha=0.1,
347
  box_thickness=1,
348
  label_list=recogniser_entities_list,
 
359
  show_remove_button=False,
360
  handles_cursor=True,
361
  interactive=True
362
+ )
363
 
364
+ return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
365
+
366
+ else:
367
+ ### Present image_annotator outputs
368
+ out_image_annotator = image_annotator(
369
+ value = current_page_image_annotator_object,
370
+ boxes_alpha=0.1,
371
+ box_thickness=1,
372
+ label_list=recogniser_entities_list,
373
+ label_colors=recogniser_colour_list,
374
+ show_label=False,
375
+ height=zoom_str,
376
+ width=zoom_str,
377
+ box_min_size=1,
378
+ box_selected_thickness=2,
379
+ handle_size=4,
380
+ sources=None,#["upload"],
381
+ show_clear_button=False,
382
+ show_share_button=False,
383
+ show_remove_button=False,
384
+ handles_cursor=True,
385
+ interactive=True
386
+ )
387
 
388
+ #print("all_image_annotations at end of update_annotator...:", all_image_annotations)
389
+ #print("review_df at end of update_annotator_object:", review_df)
390
 
391
+ return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
 
 
 
392
 
393
def replace_images_in_image_annotation_object(
    all_image_annotations:List[dict],
    page_image_annotator_object:AnnotatedImageData,
    page_sizes:List[dict],
    page:int):
    '''
    Check if the image value in an AnnotatedImageData dict is a placeholder or np.array.
    If either of these, replace the value with the file path of the image that is hopefully
    already loaded into the app related to this page.

    Args:
        all_image_annotations: Per-page annotation dicts, each with an "image" entry.
        page_image_annotator_object: Annotation object for the page being processed.
        page_sizes: List of dicts of page metadata including "page" and "image_path".
        page: 1-based page number to process.

    Returns:
        Tuple of (page_image_annotator_object, all_image_annotations), with "image"
        values replaced by the stored file path where a replacement was needed.
    '''
    page_zero_index = page - 1
    stored_image = all_image_annotations[page_zero_index]["image"]

    # Fix: guard the substring check with isinstance(str) so non-string, non-ndarray
    # image values (e.g. None or a PIL image) no longer raise TypeError. Previously
    # only np.ndarray was short-circuited before the "in" test.
    needs_replacement = (
        isinstance(stored_image, np.ndarray)
        or (isinstance(stored_image, str) and "placeholder_image" in stored_image)
        or isinstance(page_image_annotator_object['image'], np.ndarray)
    )

    if needs_replacement:
        page_sizes_df = pd.DataFrame(page_sizes)
        page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")

        # Check for matching pages
        matching_paths = page_sizes_df.loc[page_sizes_df['page'] == page, "image_path"].unique()

        if matching_paths.size > 0:
            image_path = matching_paths[0]
            page_image_annotator_object['image'] = image_path
            all_image_annotations[page_zero_index]["image"] = image_path
        else:
            print(f"No image path found for page {page}.")

    return page_image_annotator_object, all_image_annotations
420
+
421
def update_all_page_annotation_object_based_on_previous_page(
    page_image_annotator_object:AnnotatedImageData,
    current_page:int,
    previous_page:int,
    all_image_annotations:List[AnnotatedImageData],
    page_sizes:List[dict]=[],
    clear_all:bool=False
    ):
    '''
    Overwrite image annotations on the page we are moving from with modifications.

    Args:
        page_image_annotator_object: Annotation object for the page being left.
        current_page: 1-based page number being moved to (falsy -> treated as 1).
        previous_page: 1-based page number being left.
        all_image_annotations: All per-page annotation objects.
        page_sizes: Page metadata used to swap array/placeholder images for file paths.
        clear_all: When True, wipe the boxes on the previous page instead of saving them.

    Returns:
        Tuple of (all_image_annotations, current_page, current_page).
    '''
    prev_idx = previous_page - 1

    # A falsy current page (None / 0) means we default to the first page.
    if not current_page:
        current_page = 1

    # Swap any numpy-array or placeholder image values for the stored file path.
    page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(
        all_image_annotations, page_image_annotator_object, page_sizes, previous_page)

    if clear_all:
        # Discard every box on the page being left.
        all_image_annotations[prev_idx]["boxes"] = []
    else:
        # Persist the (possibly edited) annotation object for the page being left.
        all_image_annotations[prev_idx] = page_image_annotator_object

    return all_image_annotations, current_page, current_page
447
+
448
+ def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
449
+ file_paths:List[str],
450
+ doc:Document,
451
+ all_image_annotations:List[AnnotatedImageData],
452
+ current_page:int,
453
+ review_file_state:pd.DataFrame,
454
+ output_folder:str = OUTPUT_FOLDER,
455
+ save_pdf:bool=True,
456
+ page_sizes:List[dict]=[],
457
+ progress=gr.Progress(track_tqdm=True)):
458
  '''
459
  Apply modified redactions to a pymupdf and export review files
460
  '''
 
461
 
462
  output_files = []
463
  output_log_files = []
464
  pdf_doc = []
465
+ review_df = review_file_state
466
 
467
+ page_image_annotator_object = all_image_annotations[current_page - 1]
 
 
468
 
469
+ # This replaces the numpy array image object with the image file path
470
+ page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(all_image_annotations, page_image_annotator_object, page_sizes, current_page)
471
+ page_image_annotator_object['image'] = all_image_annotations[current_page - 1]["image"]
472
 
473
+ if not page_image_annotator_object:
474
+ print("No image annotations object found for page")
475
+ return doc, all_image_annotations, output_files, output_log_files, review_df
476
 
477
  if isinstance(file_paths, str):
478
  file_paths = [file_paths]
479
 
480
  for file_path in file_paths:
 
481
  file_name_without_ext = get_file_name_without_type(file_path)
482
  file_name_with_ext = os.path.basename(file_path)
483
 
 
488
  if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
489
  image = Image.open(file_paths[-1])
490
 
 
 
491
  draw = ImageDraw.Draw(image)
492
 
493
+ for img_annotation_box in page_image_annotator_object['boxes']:
494
  coords = [img_annotation_box["xmin"],
495
  img_annotation_box["ymin"],
496
  img_annotation_box["xmax"],
 
498
 
499
  fill = img_annotation_box["color"]
500
 
501
+ # Ensure fill is a valid RGB tuple
502
+ if isinstance(fill, tuple) and len(fill) == 3:
503
+ # Check if all elements are integers in the range 0-255
504
+ if all(isinstance(c, int) and 0 <= c <= 255 for c in fill):
505
+ pass
506
+ #print("fill:", fill)
507
+ else:
508
+ print(f"Invalid color values: {fill}. Defaulting to black.")
509
+ fill = (0, 0, 0) # Default to black if invalid
510
+ else:
511
+ print(f"Invalid fill format: {fill}. Defaulting to black.")
512
+ fill = (0, 0, 0) # Default to black if not a valid tuple
513
+
514
+ # Ensure the image is in RGB mode
515
+ if image.mode not in ("RGB", "RGBA"):
516
+ image = image.convert("RGB")
517
+
518
+ draw = ImageDraw.Draw(image)
519
+
520
  draw.rectangle(coords, fill=fill)
521
 
522
  output_image_path = output_folder + file_name_without_ext + "_redacted.png"
 
524
 
525
  output_files.append(output_image_path)
526
 
 
 
527
  doc = [image]
528
 
529
  elif file_extension in '.csv':
530
+ #print("This is a csv")
531
  pdf_doc = []
532
 
533
  # If working with pdfs
 
538
  output_files.append(orig_pdf_file_path)
539
 
540
  number_of_pages = pdf_doc.page_count
541
+ original_cropboxes = []
542
 
543
+ page_sizes_df = pd.DataFrame(page_sizes)
544
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
545
 
546
  for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
547
+
 
 
548
  image_loc = all_image_annotations[i]['image']
 
549
 
550
  # Load in image object
551
  if isinstance(image_loc, np.ndarray):
552
  image = Image.fromarray(image_loc.astype('uint8'))
 
553
  elif isinstance(image_loc, Image.Image):
554
  image = image_loc
 
 
 
555
  elif isinstance(image_loc, str):
556
+ if not os.path.exists(image_loc):
557
+ image=page_sizes_df.loc[page_sizes_df['page']==i, "image_path"]
558
+ try:
559
+ image = Image.open(image_loc)
560
+ except Exception as e:
561
+ image = None
562
 
563
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
564
+ original_cropboxes.append(pymupdf_page.cropbox)
565
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox)
566
 
567
+ pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
568
  else:
569
  print("File type not recognised.")
570
 
571
  #try:
572
  if pdf_doc:
573
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
574
+ pdf_doc.save(out_pdf_file_path, garbage=4, deflate=True, clean=True)
575
  output_files.append(out_pdf_file_path)
576
 
577
  else:
 
584
  output_files.append(orig_pdf_file_path)
585
 
586
  try:
587
+ review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
 
 
 
 
 
 
 
 
 
 
 
 
588
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
589
+
590
  review_df.to_csv(out_review_file_file_path, index=None)
591
  output_files.append(out_review_file_file_path)
592
 
593
  except Exception as e:
594
+ print("In apply redactions function, could not save annotations to csv file:", e)
595
 
596
+ return doc, all_image_annotations, output_files, output_log_files, review_df
597
 
598
def get_boxes_json(annotations:AnnotatedImageData):
    '''Return only the "boxes" list from an annotation object.'''
    boxes = annotations["boxes"]
    return boxes
600
 
601
def update_all_entity_df_dropdowns(df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str, text_dropdown_value:str):
    '''
    Refresh the label, text, and page dropdowns so their choice lists reflect the rows
    currently in the dataframe, while keeping each dropdown's current selection.
    '''
    # Gradio may hand over a bare string; normalise everything to lists.
    if isinstance(label_dropdown_value, str): label_dropdown_value = [label_dropdown_value]
    if isinstance(page_dropdown_value, str): page_dropdown_value = [page_dropdown_value]
    if isinstance(text_dropdown_value, str): text_dropdown_value = [text_dropdown_value]

    filtered_df = df.copy()

    # Rebuild each dropdown's choices from its column, retaining the first selected value.
    recogniser_entities_drop = gr.Dropdown(
        value=label_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "label"),
        allow_custom_value=True, interactive=True)

    text_entities_drop = gr.Dropdown(
        value=text_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "text"),
        allow_custom_value=True, interactive=True)

    page_entities_drop = gr.Dropdown(
        value=page_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "page"),
        allow_custom_value=True, interactive=True)

    return recogniser_entities_drop, text_entities_drop, page_entities_drop
635
+
636
def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dropdown_value:str, text_dropdown_value:str):
    '''
    Filter the entity dataframe by the chosen recogniser label(s), combined with any
    active page/text selections, and refresh the text and page dropdown choices to match.
    '''
    # Normalise scalar selections to lists.
    if isinstance(choice, str): choice = [choice]
    if isinstance(page_dropdown_value, str): page_dropdown_value = [page_dropdown_value]
    if isinstance(text_dropdown_value, str): text_dropdown_value = [text_dropdown_value]

    filtered_df = df.copy()

    # "ALL" in a selection disables that particular filter.
    for column, selected in (("page", page_dropdown_value),
                             ("text", text_dropdown_value),
                             ("label", choice)):
        if "ALL" not in selected:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected)]

    recogniser_entities_drop = gr.Dropdown(
        value=choice[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "label"),
        allow_custom_value=True, interactive=True)

    text_entities_drop = gr.Dropdown(
        value=text_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "text"),
        allow_custom_value=True, interactive=True)

    page_entities_drop = gr.Dropdown(
        value=page_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "page"),
        allow_custom_value=True, interactive=True)

    return filtered_df, text_entities_drop, page_entities_drop
670
+
671
def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:str, text_dropdown_value:str):
    '''
    Filter the entity dataframe by the chosen page(s), combined with any active
    label/text selections, and refresh the label and text dropdown choices to match.
    '''
    # Normalise scalar selections to lists.
    if isinstance(choice, str): choice = [choice]
    if isinstance(label_dropdown_value, str): label_dropdown_value = [label_dropdown_value]
    if isinstance(text_dropdown_value, str): text_dropdown_value = [text_dropdown_value]

    filtered_df = df.copy()

    # "ALL" in a selection disables that particular filter.
    for column, selected in (("text", text_dropdown_value),
                             ("label", label_dropdown_value),
                             ("page", choice)):
        if "ALL" not in selected:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected)]

    recogniser_entities_drop = gr.Dropdown(
        value=label_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "label"),
        allow_custom_value=True, interactive=True)

    text_entities_drop = gr.Dropdown(
        value=text_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "text"),
        allow_custom_value=True, interactive=True)

    page_entities_drop = gr.Dropdown(
        value=choice[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "page"),
        allow_custom_value=True, interactive=True)

    return filtered_df, recogniser_entities_drop, text_entities_drop
704
+
705
def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
    '''
    Filter the entity dataframe by the chosen text value(s), combined with any active
    label/page selections, and refresh the label and page dropdown choices to match.
    '''
    # Normalise scalar selections to lists.
    if isinstance(choice, str): choice = [choice]
    if isinstance(label_dropdown_value, str): label_dropdown_value = [label_dropdown_value]
    if isinstance(page_dropdown_value, str): page_dropdown_value = [page_dropdown_value]

    filtered_df = df.copy()

    # "ALL" in a selection disables that particular filter.
    for column, selected in (("page", page_dropdown_value),
                             ("label", label_dropdown_value),
                             ("text", choice)):
        if "ALL" not in selected:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected)]

    recogniser_entities_drop = gr.Dropdown(
        value=label_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "label"),
        allow_custom_value=True, interactive=True)

    text_entities_drop = gr.Dropdown(
        value=choice[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "text"),
        allow_custom_value=True, interactive=True)

    page_entities_drop = gr.Dropdown(
        value=page_dropdown_value[0],
        choices=update_dropdown_list_based_on_dataframe(filtered_df, "page"),
        allow_custom_value=True, interactive=True)

    return filtered_df, recogniser_entities_drop, page_entities_drop
738
+
739
def reset_dropdowns(df:pd.DataFrame):
    '''
    Return Gradio dropdown objects with value 'ALL', with choice lists rebuilt
    from the label, text, and page columns of the given dataframe.
    '''
    dropdowns = []
    # Build one dropdown per filterable column, all reset to the 'ALL' wildcard.
    for column in ("label", "text", "page"):
        choices = update_dropdown_list_based_on_dataframe(df, column)
        dropdowns.append(gr.Dropdown(value="ALL", choices=choices,
                                     allow_custom_value=True, interactive=True))

    return tuple(dropdowns)
754
 
755
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Handle a row-selection event on the entity table: return the selected row's
    page number plus a one-row dataframe of its page/label/text values.
    '''
    # Row layout: column 0 = page, column 1 = label, column 2 = text.
    page, label, text = evt.row_value[0], evt.row_value[1], evt.row_value[2]

    selected_row_df = pd.DataFrame(data={"page": [page], "label": [label], "text": [text]})

    return page, selected_row_df
764
+
765
def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Handle a row-selection event on the cost-code table.

    Args:
        df: The dataframe backing the table (unused; required by the Gradio callback signature).
        evt: Gradio selection event whose row_value holds the selected row.

    Returns:
        The selected cost code (first column of the selected row).
    '''
    # Removed the unused read of evt.row_value[1] (label) from the original.
    row_value_code = evt.row_value[0]  # First column holds the cost code

    return row_value_code
773
+
774
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
    '''
    Highlight the redaction box(es) matching the selected row by setting their colour,
    after resetting any previously-highlighted boxes back to black.

    Args:
        redaction_row_selection: One-row dataframe with "page", "label", "text" of the selection.
        review_df: The full review dataframe; a "color" column is added if missing.
        colour: RGB tuple used for the highlight (default blue, matching prior behaviour).

    Returns:
        The review dataframe with updated "color" values.
    '''
    colour_tuple = str(tuple(colour))

    if "color" not in review_df.columns: review_df["color"] = None

    # Reset existing highlight colours back to black.
    review_df.loc[review_df["color"] == colour_tuple, "color"] = '(0, 0, 0)'

    # Mark rows matching the selection on page/label/text via a left merge.
    review_df = review_df.merge(redaction_row_selection, on=["page", "label", "text"], indicator=True, how="left")
    # Fix: use the caller-supplied colour instead of a hard-coded '(0, 0, 255)',
    # so the `colour` parameter actually takes effect (the default preserves the
    # original blue highlight).
    review_df.loc[review_df["_merge"] == "both", "color"] = colour_tuple

    review_df.drop("_merge", axis=1, inplace=True)

    # Debug/audit output of the updated review dataframe.
    review_df.to_csv(OUTPUT_FOLDER + "review_df_in_update_selected_review.csv")

    return review_df
793
+
794
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
    """
    Update the color of bounding boxes in the images list based on redaction_row_selection.

    Parameters:
    - images (list): List of dictionaries containing image paths and box metadata.
    - redaction_row_selection (pd.DataFrame): DataFrame with 'page' and 'label' columns.
    - colour (tuple): RGB tuple for the new color.

    Returns:
    - The same list, with matching boxes recoloured in place.
    """
    # Build a set of (page, label) pairs for O(1) membership tests.
    targets = set(zip(redaction_row_selection["page"], redaction_row_selection["label"]))

    # NOTE(review): enumerate() yields a 0-based index here, while "page" values
    # elsewhere in this file appear to be 1-based — confirm whether
    # (page_idx + 1) was intended before relying on this matching.
    for page_idx, image_obj in enumerate(images):
        if "boxes" in image_obj:
            for box in image_obj["boxes"]:
                if (page_idx, box["label"]) in targets:
                    box["color"] = colour

    return images
816
+
817
def update_other_annotator_number_from_current(page_number_first_counter:int):
    '''Mirror the first annotator's page counter so other annotator components stay in sync.'''
    synced_page_number = page_number_first_counter
    return synced_page_number
819
 
820
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
821
  '''
 
827
  - image_width: Width of the source image
828
  - image_height: Height of the source image
829
  - x1, y1, x2, y2: Coordinates in image space
830
+ - page_sizes: List of dicts containing sizes of page as pymupdf page or PIL image
831
 
832
  Returns:
833
  - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
834
  '''
835
+
836
+
837
 
838
  # Calculate scaling factors
839
  scale_width = pdf_page_width / image_width
 
854
 
855
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
856
 
857
def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float, pdf_page_height: float):
    """
    Converts coordinates from PyMuPDF (fitz) space to Adobe PDF space.

    Parameters:
    - x1, y1, x2, y2: Coordinates in PyMuPDF space
    - pdf_page_height: Total height of the PDF page

    Returns:
    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
    """
    # The two coordinate systems place the y-origin on opposite edges of the
    # page, so the vertical coordinates are mirrored about the page height;
    # the x coordinates are unchanged.
    converted_top = pdf_page_height - y2
    converted_bottom = pdf_page_height - y1

    return x1, converted_top, x2, converted_bottom
874
+
875
+ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
876
  '''
877
  Create an xfdf file from a review csv file and a pdf
878
  '''
879
+ pages_are_images = True
880
+
881
  # Create root element
882
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
883
 
 
887
 
888
  # Add annots
889
  annots = SubElement(xfdf, 'annots')
890
+
891
+ # Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
892
+ if page_sizes:
893
+ page_sizes_df = pd.DataFrame(page_sizes)
894
+
895
+ # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
896
+ #if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
897
+ print("Using pymupdf coordinates for conversion.")
898
+
899
+ pages_are_images = False
900
+
901
+ if "mediabox_width" not in review_file_df.columns:
902
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
903
+
904
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
905
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
906
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
907
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
908
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
909
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
910
+
911
+ # If all nulls, then can do image coordinate conversion
912
+ if len(page_sizes_df.loc[page_sizes_df["mediabox_width"].isnull(),"mediabox_width"]) == len(page_sizes_df["mediabox_width"]):
913
+
914
+ pages_are_images = True
915
+
916
+ review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
917
+
918
+ # if "image_width" not in review_file_df.columns:
919
+ # review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
920
+
921
+ # # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
922
+ # if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
923
+ # review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
924
+ # review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
925
+ # review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
926
+ # review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
927
+
928
+
929
 
930
+ # Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
931
+ for _, row in review_file_df.iterrows():
932
+ page_num_reported = row["page"]
933
  page_python_format = int(row["page"])-1
934
 
935
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
936
 
937
+ # Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
938
+ if document_cropboxes:
939
+
940
+ # Extract numbers safely using regex
941
+ match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
942
+
943
+ if match and len(match) == 4:
944
+ rect_values = list(map(float, match)) # Convert extracted strings to floats
945
+ pymupdf_page.set_cropbox(Rect(*rect_values))
946
+ else:
947
+ raise ValueError(f"Invalid cropbox format: {document_cropboxes[page_python_format]}")
948
+ else:
949
+ print("Document cropboxes not found.")
950
+
951
+
952
+ pdf_page_height = pymupdf_page.mediabox.height
953
+ pdf_page_width = pymupdf_page.mediabox.width
954
+
955
+ # Check if image dimensions for page exist in page_sizes_df
956
+ # image_dimensions = {}
957
 
958
+ # image_dimensions['image_width'] = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].max()
959
+ # image_dimensions['image_height'] = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"].max()
960
 
961
+ # if pd.isna(image_dimensions['image_width']):
962
+ # image_dimensions = {}
963
 
964
+ # image = image_paths[page_python_format]
 
965
 
966
+ # if image_dimensions:
967
+ # image_page_width, image_page_height = image_dimensions["image_width"], image_dimensions["image_height"]
968
+ # if isinstance(image, str) and 'placeholder' not in image:
969
+ # image = Image.open(image)
970
+ # image_page_width, image_page_height = image.size
971
+ # else:
972
+ # try:
973
+ # image = Image.open(image)
974
+ # image_page_width, image_page_height = image.size
975
+ # except Exception as e:
976
+ # print("Could not get image sizes due to:", e)
977
 
978
  # Create redaction annotation
979
  redact_annot = SubElement(annots, 'redact')
 
985
  # Set page number (subtract 1 as PDF pages are 0-based)
986
  redact_annot.set('page', str(int(row['page']) - 1))
987
 
988
+ # # Convert coordinates
989
+ # if pages_are_images == True:
990
+ # x1, y1, x2, y2 = convert_image_coords_to_adobe(
991
+ # pdf_page_width,
992
+ # pdf_page_height,
993
+ # image_page_width,
994
+ # image_page_height,
995
+ # row['xmin'],
996
+ # row['ymin'],
997
+ # row['xmax'],
998
+ # row['ymax']
999
+ # )
1000
+ # else:
1001
+ x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
1002
  row['ymin'],
1003
  row['xmax'],
1004
+ row['ymax'], pdf_page_height)
 
1005
 
1006
  if CUSTOM_BOX_COLOUR == "grey":
1007
  colour_str = "0.5,0.5,0.5"
 
1053
 
1054
  return xml_str
1055
 
1056
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=[], page_sizes:List[dict]=[]):
1057
  '''
1058
  Load in files to convert a review file into an Adobe comment file format
1059
  '''
1060
  output_paths = []
1061
  pdf_name = ""
1062
+ file_path_name = ""
1063
 
1064
  if isinstance(input_files, str):
1065
  file_paths_list = [input_files]
 
1076
  else:
1077
  file_path = file.name
1078
 
1079
+ file_path_name = get_file_name_without_type(file_path)
1080
+ file_path_end = detect_file_type(file_path)
1081
 
1082
+ if file_path_end == "pdf":
1083
+ pdf_name = os.path.basename(file_path)
1084
 
1085
+ if file_path_end == "csv":
1086
+ # If no pdf name, just get the name of the file path
1087
+ if not pdf_name:
1088
+ pdf_name = file_path_name
1089
+ # Read CSV file
1090
+ review_file_df = pd.read_csv(file_path)
1091
 
1092
+ review_file_df.fillna('', inplace=True) # Replace NaN in review file with an empty string
1093
 
1094
+ xfdf_content = create_xfdf(review_file_df, pdf_name, pdf_doc, image_paths, document_cropboxes, page_sizes)
1095
 
1096
+ output_path = output_folder + file_path_name + "_adobe.xfdf"
1097
+
1098
+ with open(output_path, 'w', encoding='utf-8') as f:
1099
+ f.write(xfdf_content)
1100
 
1101
+ output_paths.append(output_path)
1102
 
1103
  return output_paths
1104
 
1105
 
1106
  ### Convert xfdf coordinates back to image for app
1107
 
1108
+ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
1109
  '''
1110
  Converts coordinates from Adobe PDF space to image space.
1111
 
 
1139
 
1140
  return image_x1, image_y1, image_x2, image_y2
1141
 
1142
+ def parse_xfdf(xfdf_path:str):
1143
  '''
1144
  Parse the XFDF file and extract redaction annotations.
1145
 
 
1160
  # Find all redact elements using the namespace
1161
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
1162
 
 
 
1163
  redaction_info = {
1164
  'image': '', # Image will be filled in later
1165
  'page': int(redact.get('page')) + 1, # Convert to 1-based index
 
1172
  'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
1173
  }
1174
  redactions.append(redaction_info)
 
 
1175
 
1176
  return redactions
1177
 
1178
+ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=OUTPUT_FOLDER):
1179
  '''
1180
  Convert redaction annotations from XFDF and associated images into a DataFrame.
1181
 
 
1191
  xfdf_paths = []
1192
  df = pd.DataFrame()
1193
 
 
 
1194
  # Sort the file paths so that the pdfs come first
1195
  file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
1196
 
 
1206
 
1207
  if file_path_end == "pdf":
1208
  pdf_name = os.path.basename(file_path)
 
1209
 
1210
  # Add pdf to outputs
1211
  output_paths.append(file_path)
 
1216
  message = "Original PDF needed to convert from .xfdf format"
1217
  print(message)
1218
  raise ValueError(message)
 
1219
  xfdf_path = file
1220
 
 
 
 
 
 
1221
  file_path_name = get_file_name_without_type(xfdf_path)
1222
 
 
 
1223
  # Parse the XFDF file
1224
  redactions = parse_xfdf(xfdf_path)
1225
 
 
1238
 
1239
  image_path = image_paths[page_python_format]
1240
 
 
 
1241
  if isinstance(image_path, str):
1242
  image = Image.open(image_path)
1243
 
 
1249
  df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
1250
 
1251
  # Optionally, you can add the image path or other relevant information
 
1252
  df.loc[_, 'image'] = image_path
1253
 
1254
  #print('row:', row)
tools/textract_batch_call.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import time
3
+ import os
4
+ import json
5
+ import logging
6
+ from urllib.parse import urlparse
7
+
8
+ # Configure logging
9
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
+
11
def analyze_pdf_with_textract(
    local_pdf_path: str,
    s3_bucket_name: str,
    s3_input_prefix: str,
    s3_output_prefix: str,
    local_output_dir: str,
    aws_region: str = None,  # Optional: specify region if not default
    poll_interval_seconds: int = 5,
    max_polling_attempts: int = 120  # ~10 minutes total wait time
):
    """
    Uploads a local PDF to S3, starts a Textract analysis job (detecting text,
    signatures, forms and tables), waits for completion, and downloads the
    output JSON from S3 to a local directory.

    Args:
        local_pdf_path (str): Path to the local PDF file.
        s3_bucket_name (str): Name of the S3 bucket to use.
        s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
        s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
        local_output_dir (str): Local directory to save the downloaded JSON results.
        aws_region (str, optional): AWS region name. Defaults to boto3 default region.
        poll_interval_seconds (int): Seconds to wait between polling Textract status.
        max_polling_attempts (int): Maximum number of times to poll Textract status.

    Returns:
        str: Path to the downloaded local JSON output file, or None if failed.

    Raises:
        FileNotFoundError: If the local_pdf_path does not exist, or if no
            output objects are found in the S3 output location.
        TimeoutError: If the job does not reach SUCCEEDED within the polling limit.
        Exception: For other AWS errors or job failures.
    """
    if not os.path.exists(local_pdf_path):
        raise FileNotFoundError(f"Input PDF not found: {local_pdf_path}")

    if not os.path.exists(local_output_dir):
        os.makedirs(local_output_dir)
        logging.info(f"Created local output directory: {local_output_dir}")

    # Initialize boto3 clients once and share them across the helper steps.
    session = boto3.Session(region_name=aws_region)
    s3_client = session.client('s3')
    textract_client = session.client('textract')

    pdf_filename = os.path.basename(local_pdf_path)

    s3_input_key = _upload_pdf_to_s3(
        s3_client, local_pdf_path, s3_bucket_name, s3_input_prefix, pdf_filename
    )
    job_id = _start_textract_analysis(
        textract_client, s3_bucket_name, s3_input_key, s3_output_prefix
    )
    _poll_textract_job(
        textract_client, job_id, poll_interval_seconds, max_polling_attempts
    )
    return _download_textract_json(
        s3_client, s3_bucket_name, s3_output_prefix, job_id,
        pdf_filename, local_output_dir
    )


def _upload_pdf_to_s3(s3_client, local_pdf_path, s3_bucket_name, s3_input_prefix, pdf_filename):
    """Upload the input PDF to S3 and return the S3 object key used."""
    # os.path.join would use backslashes on Windows; S3 keys need forward slashes.
    s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/")

    logging.info(f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'...")
    try:
        s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
        logging.info("Upload successful.")
    except Exception as e:
        logging.error(f"Failed to upload PDF to S3: {e}")
        raise
    return s3_input_key


def _start_textract_analysis(textract_client, s3_bucket_name, s3_input_key, s3_output_prefix):
    """Start an async Textract document-analysis job and return its JobId."""
    logging.info("Starting Textract document analysis job...")
    try:
        response = textract_client.start_document_analysis(
            DocumentLocation={
                'S3Object': {
                    'Bucket': s3_bucket_name,
                    'Name': s3_input_key
                }
            },
            FeatureTypes=['SIGNATURES', 'FORMS', 'TABLES'],  # Analyze for signatures, forms, and tables
            OutputConfig={
                'S3Bucket': s3_bucket_name,
                'S3Prefix': s3_output_prefix
            }
            # Optional: Add NotificationChannel for SNS topic notifications
            # NotificationChannel={
            #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
            #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
            # }
        )
    except Exception as e:
        logging.error(f"Failed to start Textract job: {e}")
        raise

    job_id = response['JobId']
    logging.info(f"Textract job started with JobId: {job_id}")
    return job_id


def _poll_textract_job(textract_client, job_id, poll_interval_seconds, max_polling_attempts):
    """
    Poll the Textract job until it reaches SUCCEEDED.

    Raises on FAILED/PARTIAL_SUCCESS/unexpected statuses, and raises
    TimeoutError if the job is still IN_PROGRESS after max_polling_attempts.
    """
    job_status = 'IN_PROGRESS'
    attempts = 0
    logging.info("Polling Textract for job completion status...")

    while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
        attempts += 1
        try:
            response = textract_client.get_document_analysis(JobId=job_id)
            job_status = response['JobStatus']
            logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")

            if job_status == 'IN_PROGRESS':
                time.sleep(poll_interval_seconds)
            elif job_status == 'SUCCEEDED':
                logging.info("Textract job succeeded.")
                break
            elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
                status_message = response.get('StatusMessage', 'No status message provided.')
                warnings = response.get('Warnings', [])
                logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
                if warnings:
                    logging.warning(f"Warnings: {warnings}")
                # PARTIAL_SUCCESS is treated as a failure here for simplicity;
                # relax this if partially-analyzed output is acceptable.
                raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
            else:
                # Should not happen based on documentation, but handle defensively.
                raise Exception(f"Unexpected Textract job status: {job_status}")

        except textract_client.exceptions.InvalidJobIdException:
            logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
            raise
        except Exception as e:
            logging.error(f"Error while polling Textract status for job {job_id}: {e}")
            raise

    if job_status != 'SUCCEEDED':
        raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")


def _download_textract_json(s3_client, s3_bucket_name, s3_output_prefix, job_id,
                            pdf_filename, local_output_dir):
    """
    Locate the Textract output JSON under s3_output_prefix/job_id/ and download
    the first JSON object found to local_output_dir. Returns the local path.
    """
    # Textract writes output under s3_output_prefix/job_id/. There may be
    # multiple JSON files for large documents; we download the first one and
    # warn if others exist.
    s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
    logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")

    try:
        output_files = _list_output_objects(s3_client, s3_bucket_name, s3_output_key_prefix)
        if not output_files:
            # Sometimes Textract takes a moment longer to write output after
            # SUCCEEDED status; wait briefly and retry the listing once.
            logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
            time.sleep(5)
            output_files = _list_output_objects(s3_client, s3_bucket_name, s3_output_key_prefix)

        if not output_files:
            logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
            # Results could alternatively be fetched via get_document_analysis
            # pagination, but this function is specified to use the S3 output path.
            raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")

        # Filter out the prefix marker / pseudo-directory keys.
        json_files_to_download = [
            f for f in output_files
            if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')
        ]

        if not json_files_to_download:
            logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
            raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")

        # Download the first JSON found; it is often the only or main one.
        s3_output_key = json_files_to_download[0]['Key']
        # splitext (not str.replace) so a '.pdf' substring elsewhere in the
        # filename is not stripped accidentally.
        output_filename_base = os.path.splitext(os.path.basename(pdf_filename))[0]
        local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
        local_output_path = os.path.join(local_output_dir, local_output_filename)

        logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
        s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
        logging.info("Download successful.")

        # Log if multiple files were found, as the caller might need the rest.
        if len(json_files_to_download) > 1:
            logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")

        return local_output_path

    except Exception as e:
        logging.error(f"Failed to download or process Textract output from S3: {e}")
        raise


def _list_output_objects(s3_client, s3_bucket_name, s3_output_key_prefix):
    """Return the list of S3 objects under the given output prefix (may be empty)."""
    list_response = s3_client.list_objects_v2(
        Bucket=s3_bucket_name,
        Prefix=s3_output_key_prefix
    )
    return list_response.get('Contents', [])
198
+
199
# --- Example Usage ---
if __name__ == '__main__':
    # --- Configuration --- (Replace with your actual values)
    MY_LOCAL_PDF = r"C:\path\to\your\document.pdf"  # Use raw string for Windows paths
    MY_S3_BUCKET = "your-textract-demo-bucket-name"  # MUST BE UNIQUE GLOBALLY
    MY_S3_INPUT_PREFIX = "textract-inputs"           # Folder in the bucket for uploads
    MY_S3_OUTPUT_PREFIX = "textract-outputs"         # Folder in the bucket for results
    MY_LOCAL_OUTPUT_DIR = "./textract_results"       # Local folder to save JSON
    MY_AWS_REGION = "us-east-1"                      # e.g., 'us-east-1', 'eu-west-1'

    # --- Create a dummy PDF for testing if you don't have one ---
    # Requires 'reportlab' library: pip install reportlab
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        if not os.path.exists(MY_LOCAL_PDF):
            print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
            pdf_canvas = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
            for y_coord, line_text in (
                (750, "This is a test document for AWS Textract."),
                (700, "It includes some text and a placeholder for a signature."),
                (650, "Signed:"),
            ):
                pdf_canvas.drawString(100, y_coord, line_text)
            # Simple line/scribble standing in for a signature.
            pdf_canvas.line(150, 630, 250, 645)
            pdf_canvas.line(250, 645, 300, 620)
            pdf_canvas.save()
            print("Dummy PDF created.")
    except ImportError:
        if not os.path.exists(MY_LOCAL_PDF):
            print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
            exit()  # Exit if no PDF available for the example
    except Exception as e:
        print(f"Error creating dummy PDF: {e}")
        exit()

    # --- Run the analysis ---
    try:
        output_json_path = analyze_pdf_with_textract(
            local_pdf_path=MY_LOCAL_PDF,
            s3_bucket_name=MY_S3_BUCKET,
            s3_input_prefix=MY_S3_INPUT_PREFIX,
            s3_output_prefix=MY_S3_OUTPUT_PREFIX,
            local_output_dir=MY_LOCAL_OUTPUT_DIR,
            aws_region=MY_AWS_REGION
        )

        if output_json_path:
            print(f"\n--- Analysis Complete ---")
            print(f"Textract output JSON saved to: {output_json_path}")

            # Optional: Load and print some info from the JSON
            with open(output_json_path, 'r') as f:
                results = json.load(f)
            print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
            # Basic signature lookup; real parsing may need more structure.
            signature_blocks = [
                block for block in results.get('Blocks', [])
                if block.get('BlockType') == 'SIGNATURE'
            ]
            print(f"Found {len(signature_blocks)} potential signature block(s).")
            if signature_blocks:
                print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")

    except FileNotFoundError as e:
        print(f"\nError: Input file not found. {e}")
    except Exception as e:
        print(f"\nAn error occurred during the process: {e}")
264
+
265
+ import boto3
266
+ import time
267
+ import os
268
+
269
def download_textract_output(job_id, output_bucket, output_prefix, local_folder,
                             poll_interval_seconds=10, max_polling_attempts=60):
    """
    Checks the status of a Textract job and downloads the output ZIP file if the job is complete.

    :param job_id: The Textract job ID.
    :param output_bucket: The S3 bucket where the output is stored.
    :param output_prefix: The prefix (folder path) in S3 where the output file is stored.
    :param local_folder: The local directory where the ZIP file should be saved.
    :param poll_interval_seconds: Seconds to wait between status checks.
    :param max_polling_attempts: Maximum number of status checks before giving up
        (replaces the previous unbounded wait, which could hang forever on a
        stuck or never-finishing job).
    """
    textract_client = boto3.client('textract')
    s3_client = boto3.client('s3')

    # Poll the job status, bailing out after max_polling_attempts.
    for _ in range(max_polling_attempts):
        response = textract_client.get_document_analysis(JobId=job_id)
        status = response['JobStatus']

        if status == 'SUCCEEDED':
            print("Job completed successfully.")
            break
        if status == 'FAILED':
            print("Job failed:", response.get("StatusMessage", "No error message provided."))
            return
        print(f"Job is still {status}, waiting...")
        time.sleep(poll_interval_seconds)  # Wait before checking again
    else:
        # Loop exhausted without SUCCEEDED/FAILED: treat as a timeout.
        print(f"Job {job_id} did not complete within the polling limit; giving up.")
        return

    # Ensure the destination folder exists so download_file does not fail.
    os.makedirs(local_folder, exist_ok=True)

    # NOTE(review): this assumes Textract wrote a single '<job_id>.zip' object.
    # The async Textract API normally writes JSON parts under
    # '<output_prefix>/<job_id>/' rather than a zip — confirm this key against
    # the job's actual OutputConfig before relying on it.
    output_file_key = f"{output_prefix}/{job_id}.zip"
    local_file_path = os.path.join(local_folder, f"{job_id}.zip")

    # Download file
    try:
        s3_client.download_file(output_bucket, output_file_key, local_file_path)
        print(f"Output file downloaded to: {local_file_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")
306
+
307
+ # Example usage:
308
+ # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")