seanpedrickcase committed on
Commit
f0f9378
·
1 Parent(s): aaf0acb

Added support for AWS Comprehend for PII identification. OCR and detection results now written to main output

Browse files
app.py CHANGED
@@ -7,7 +7,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
7
  from gradio_image_annotation import image_annotator
8
 
9
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
10
- from tools.aws_functions import upload_file_to_s3
11
  from tools.file_redaction import choose_and_run_redactor
12
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
13
  from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
@@ -25,8 +25,14 @@ add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
25
 
26
  ensure_output_folder_exists()
27
 
28
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 
 
 
 
 
29
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 
30
  language = 'en'
31
 
32
  host_name = socket.gethostname()
@@ -35,6 +41,21 @@ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
35
  access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
36
  usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # Create the gradio interface
39
  app = gr.Blocks(theme = gr.themes.Base())
40
 
@@ -109,7 +130,9 @@ with app:
109
  with gr.Tab("PDFs/images"):
110
  with gr.Accordion("Redact document", open = True):
111
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
112
- in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - docs with handwriting/signatures (AWS Textract)"])
 
 
113
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
114
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
115
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -201,21 +224,30 @@ with app:
201
  with gr.Row():
202
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
203
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
204
- with gr.Row():
205
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
206
- with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
207
- anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
208
 
209
  with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
210
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
211
  with gr.Row():
212
- in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
213
- # Upload 'Allow list' for terms not to be redacted
214
- with gr.Row():
215
  in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
216
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
217
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
218
- log_files_output = gr.File(label="Log file output", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  # If a custom allow list is uploaded
221
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
@@ -227,12 +259,12 @@ with app:
227
 
228
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
229
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
230
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
231
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
232
  #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
233
 
234
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
235
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
236
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
237
 
238
  # If a file has been completed, the function will continue onto the next document
@@ -318,9 +350,9 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
318
 
319
  if __name__ == "__main__":
320
  if os.environ['COGNITO_AUTH'] == "1":
321
- app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
322
  else:
323
- app.queue().launch(show_error=True, inbrowser=True, max_file_size='100mb')
324
 
325
 
326
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
 
7
  from gradio_image_annotation import image_annotator
8
 
9
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
10
+ from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
11
  from tools.file_redaction import choose_and_run_redactor
12
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
13
  from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
 
25
 
26
  ensure_output_folder_exists()
27
 
28
+ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
29
+
30
+ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
31
+
32
+ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
33
+
34
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
35
+
36
  language = 'en'
37
 
38
  host_name = socket.gethostname()
 
41
  access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
42
  usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
43
 
44
+
45
+ text_ocr_option = "Simple text analysis - PDFs with selectable text"
46
+ tesseract_ocr_option = "Quick image analysis - typed text"
47
+ textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
48
+
49
+ local_pii_detector = "Local"
50
+ aws_pii_detector = "AWS Comprehend"
51
+
52
+ if RUN_AWS_FUNCTIONS == "1":
53
+ default_ocr_val = textract_option
54
+ default_pii_detector = aws_pii_detector
55
+ else:
56
+ default_ocr_val = text_ocr_option
57
+ default_pii_detector = local_pii_detector
58
+
59
  # Create the gradio interface
60
  app = gr.Blocks(theme = gr.themes.Base())
61
 
 
130
  with gr.Tab("PDFs/images"):
131
  with gr.Accordion("Redact document", open = True):
132
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
133
+ in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = text_ocr_option, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
134
+ pii_identification_method_drop = gr.Radio(label = "Choose PII detection method", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
135
+
136
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
137
  document_redact_btn = gr.Button("Redact document(s)", variant="primary")
138
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
224
  with gr.Row():
225
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
226
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
227
+
228
+
229
+
230
+
231
 
232
  with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
 
233
  with gr.Row():
 
 
 
234
  in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
235
  gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
236
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
237
+
238
+ in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
239
+
240
+ in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
241
+
242
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
243
+ #with gr.Row():
244
+ in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
245
+
246
+
247
+ with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
248
+ anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
249
+
250
+ log_files_output = gr.File(label="Log file output", interactive=False)
251
 
252
  # If a custom allow list is uploaded
253
  in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
259
 
260
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
261
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
262
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
263
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
264
  #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
265
 
266
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
267
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop],
268
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
269
 
270
  # If a file has been completed, the function will continue onto the next document
 
350
 
351
  if __name__ == "__main__":
352
  if os.environ['COGNITO_AUTH'] == "1":
353
+ app.queue(max_size=5).launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
354
  else:
355
+ app.queue(max_size=5).launch(show_error=True, inbrowser=True, max_file_size='100mb')
356
 
357
 
358
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
tools/aws_functions.py CHANGED
@@ -7,24 +7,22 @@ from tools.helper_functions import get_or_create_env_var
7
 
8
  PandasDataFrame = Type[pd.DataFrame]
9
 
10
- # Get AWS credentials if required
11
  bucket_name=""
12
- aws_var = "RUN_AWS_FUNCTIONS"
13
- aws_var_default = "0"
14
- aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
- print(f'The value of {aws_var} is {aws_var_val}')
16
 
17
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
18
  print(f'The value of AWS_REGION is {AWS_REGION}')
19
 
20
- if aws_var_val == "1":
21
- try:
22
- bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
23
- session = boto3.Session() # profile_name="default"
24
- except Exception as e:
25
- print(e)
26
 
27
- def get_assumed_role_info():
28
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
29
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
30
  response = sts.get_caller_identity()
@@ -37,14 +35,28 @@ if aws_var_val == "1":
37
 
38
  return assumed_role_arn, assumed_role_name
39
 
 
 
 
 
 
 
 
 
 
 
40
  try:
41
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
42
 
43
  print("Assumed Role ARN:", assumed_role_arn)
44
  print("Assumed Role Name:", assumed_role_name)
 
45
  except Exception as e:
46
  print(e)
47
 
 
 
 
48
  # Download direct from S3 - requires login credentials
49
  def download_file_from_s3(bucket_name, key, local_file_path):
50
 
 
7
 
8
  PandasDataFrame = Type[pd.DataFrame]
9
 
10
+ # Get AWS credentials
11
  bucket_name=""
12
+
13
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
14
+ print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
15
 
16
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
17
  print(f'The value of AWS_REGION is {AWS_REGION}')
18
 
19
+ try:
20
+ comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
21
+ except Exception as e:
22
+ print(e)
23
+ comprehend_client = ""
 
24
 
25
+ def get_assumed_role_info():
26
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
27
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
28
  response = sts.get_caller_identity()
 
35
 
36
  return assumed_role_arn, assumed_role_name
37
 
38
+ if RUN_AWS_FUNCTIONS == "1":
39
+ try:
40
+ bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
41
+ session = boto3.Session()
42
+ # Initialize the Boto3 client for Comprehend
43
+
44
+
45
+ except Exception as e:
46
+ print(e)
47
+
48
  try:
49
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
50
 
51
  print("Assumed Role ARN:", assumed_role_arn)
52
  print("Assumed Role Name:", assumed_role_name)
53
+
54
  except Exception as e:
55
  print(e)
56
 
57
+
58
+
59
+
60
  # Download direct from S3 - requires login credentials
61
  def download_file_from_s3(bucket_name, key, local_file_path):
62
 
tools/custom_image_analyser_engine.py CHANGED
@@ -10,6 +10,8 @@ from PIL import ImageDraw, ImageFont, Image
10
  from typing import Optional, Tuple, Union
11
  from copy import deepcopy
12
  from tools.helper_functions import clean_unicode_text
 
 
13
  #import string # Import string to get a list of common punctuation characters
14
 
15
  @dataclass
@@ -459,6 +461,8 @@ class CustomImageAnalyzerEngine:
459
  self,
460
  line_level_ocr_results: List[OCRResult],
461
  ocr_results_with_children: Dict[str, Dict],
 
 
462
  **text_analyzer_kwargs
463
  ) -> List[CustomImageRecognizerResult]:
464
  # Define English as default language, if not specified
@@ -472,10 +476,34 @@ class CustomImageAnalyzerEngine:
472
 
473
  combined_results = []
474
  for i, line_level_ocr_result in enumerate(line_level_ocr_results):
 
 
 
475
  # Analyze each OCR result (line) individually
476
- analyzer_result = self.analyzer_engine.analyze(
477
- text=line_level_ocr_result.text, **text_analyzer_kwargs
478
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
 
480
  if i < len(ocr_results_with_children): # Check if i is a valid index
481
  child_level_key = list(ocr_results_with_children.keys())[i]
 
10
  from typing import Optional, Tuple, Union
11
  from copy import deepcopy
12
  from tools.helper_functions import clean_unicode_text
13
+ from tools.aws_functions import comprehend_client
14
+ from tools.presidio_analyzer_custom import recognizer_result_from_dict
15
  #import string # Import string to get a list of common punctuation characters
16
 
17
  @dataclass
 
461
  self,
462
  line_level_ocr_results: List[OCRResult],
463
  ocr_results_with_children: Dict[str, Dict],
464
+ chosen_redact_comprehend_entities:List[str],
465
+ pii_identification_method:str="Local",
466
  **text_analyzer_kwargs
467
  ) -> List[CustomImageRecognizerResult]:
468
  # Define English as default language, if not specified
 
476
 
477
  combined_results = []
478
  for i, line_level_ocr_result in enumerate(line_level_ocr_results):
479
+
480
+ analyzer_result = []
481
+
482
  # Analyze each OCR result (line) individually
483
+
484
+ if pii_identification_method == "Local":
485
+ analyzer_result = self.analyzer_engine.analyze(
486
+ text=line_level_ocr_result.text, **text_analyzer_kwargs
487
+ )
488
+
489
+ elif pii_identification_method == "AWS Comprehend":
490
+
491
+ # Call the detect_pii_entities method
492
+ response = comprehend_client.detect_pii_entities(
493
+ Text=line_level_ocr_result.text,
494
+ LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
495
+ )
496
+
497
+ for result in response["Entities"]:
498
+ result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
499
+
500
+ if result_text not in allow_list:
501
+
502
+ if result.get("Type") in chosen_redact_comprehend_entities:
503
+
504
+ recogniser_entity = recognizer_result_from_dict(result)
505
+ analyzer_result.append(recogniser_entity)
506
+
507
 
508
  if i < len(ocr_results_with_children): # Check if i is a valid index
509
  child_level_key = list(ocr_results_with_children.keys())[i]
tools/data_anonymise.py CHANGED
@@ -23,6 +23,27 @@ fake = Faker("en_UK")
23
  def fake_first_name(x):
24
  return fake.first_name()
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
27
  output = []
28
 
 
23
  def fake_first_name(x):
24
  return fake.first_name()
25
 
26
+ def initial_clean(text):
27
+ #### Some of my cleaning functions
28
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
29
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
30
+ non_ascii_pattern = r'[^\x00-\x7F]+'
31
+ multiple_spaces_regex = r'\s{2,}'
32
+
33
+ # Define a list of patterns and their replacements
34
+ patterns = [
35
+ (html_pattern_regex, ' '),
36
+ (html_start_pattern_end_dots_regex, ' '),
37
+ (non_ascii_pattern, ' '),
38
+ (multiple_spaces_regex, ' ')
39
+ ]
40
+
41
+ # Apply each regex replacement
42
+ for pattern, replacement in patterns:
43
+ text = re.sub(pattern, replacement, text)
44
+
45
+ return text
46
+
47
  def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
48
  output = []
49
 
tools/file_redaction.py CHANGED
@@ -24,13 +24,17 @@ import gradio as gr
24
  from gradio import Progress
25
  from collections import defaultdict # For efficient grouping
26
 
 
 
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file
29
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
30
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
31
- from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
32
- from tools.data_anonymise import generate_decision_process_output
33
- from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
 
 
34
 
35
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
36
 
@@ -62,12 +66,12 @@ def sum_numbers_before_seconds(string:str):
62
 
63
  return sum_of_numbers
64
 
65
-
66
  def choose_and_run_redactor(file_paths:List[str],
67
  prepared_pdf_file_paths:List[str],
68
  prepared_pdf_image_paths:List[str],
69
  language:str,
70
  chosen_redact_entities:List[str],
 
71
  in_redact_method:str,
72
  in_allow_list:List[List[str]]=None,
73
  latest_file_completed:int=0,
@@ -86,6 +90,7 @@ def choose_and_run_redactor(file_paths:List[str],
86
  pymupdf_doc=[],
87
  current_loop_page:int=0,
88
  page_break_return:bool=False,
 
89
  progress=gr.Progress(track_tqdm=True)):
90
  '''
91
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -94,7 +99,8 @@ def choose_and_run_redactor(file_paths:List[str],
94
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
95
  - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
96
  - language (str): The language of the text in the files.
97
- - chosen_redact_entities (List[str]): A list of entity types to redact from the files.
 
98
  - in_redact_method (str): The method to use for redaction.
99
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
100
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
@@ -113,6 +119,7 @@ def choose_and_run_redactor(file_paths:List[str],
113
  - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
114
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
115
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
 
116
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
117
 
118
  The function returns a redacted document along with processing logs.
@@ -121,12 +128,12 @@ def choose_and_run_redactor(file_paths:List[str],
121
  tic = time.perf_counter()
122
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
123
 
 
124
  # If this is the first time around, set variables to 0/blank
125
  if first_loop_state==True:
126
  print("First_loop_state is True")
127
  latest_file_completed = 0
128
  current_loop_page = 0
129
- #out_message = []
130
  out_file_paths = []
131
  estimate_total_processing_time = 0
132
  estimated_time_taken_state = 0
@@ -136,10 +143,6 @@ def choose_and_run_redactor(file_paths:List[str],
136
  current_loop_page = 0
137
 
138
 
139
- # If out message is string or out_file_paths are blank, change to a list so it can be appended to
140
- #if isinstance(out_message, str):
141
- # out_message = [out_message]
142
-
143
  if not out_file_paths:
144
  out_file_paths = []
145
 
@@ -152,11 +155,6 @@ def choose_and_run_redactor(file_paths:List[str],
152
  else:
153
  number_of_files = len(file_paths)
154
 
155
-
156
- print("\nIn choose_and_run_redactor function, latest_file_completed is:", latest_file_completed)
157
- print("current_loop_page is:", current_loop_page)
158
-
159
-
160
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
161
  if latest_file_completed >= number_of_files:
162
 
@@ -242,7 +240,26 @@ def choose_and_run_redactor(file_paths:List[str],
242
 
243
  print("Redacting file " + file_path_without_ext + " as an image-based file")
244
 
245
- pymupdf_doc, all_decision_process_table, logging_file_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox, "", current_loop_page, page_break_return, prepared_pdf_image_paths, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  # Save Textract request metadata (if exists)
248
  if new_request_metadata:
@@ -260,7 +277,21 @@ def choose_and_run_redactor(file_paths:List[str],
260
  # Analyse text-based pdf
261
  print('Redacting file as text-based PDF')
262
 
263
- pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text", current_loop_page, page_break_return, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  else:
266
  out_message = "No redaction method selected"
@@ -287,27 +318,37 @@ def choose_and_run_redactor(file_paths:List[str],
287
  pymupdf_doc.save(out_image_file_path)
288
 
289
  out_file_paths.append(out_image_file_path)
 
290
  if logging_file_paths:
291
  log_files_output_paths.extend(logging_file_paths)
292
 
293
- #if isinstance(out_message, list):
294
- # out_message.append("File '" + file_path_without_ext + "' successfully redacted")
295
-
296
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
297
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
298
- log_files_output_paths.append(logs_output_file_name)
 
299
 
300
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
301
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
302
- log_files_output_paths.append(all_text_output_file_name)
 
303
 
304
  # Make a combined message for the file
305
  if isinstance(out_message, list):
306
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
307
  else: combined_out_message = out_message
308
 
 
 
 
 
309
  out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
310
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
 
 
 
 
 
 
311
 
312
  # Increase latest file completed count unless we are at the last file
313
  # if latest_file_completed != len(file_paths):
@@ -348,15 +389,6 @@ def choose_and_run_redactor(file_paths:List[str],
348
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
349
  else: combined_out_message = out_message
350
 
351
- out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
352
- combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
353
-
354
- estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
355
- print("Estimated total processing time:", str(estimate_total_processing_time))
356
-
357
- toc = time.perf_counter()
358
- time_taken = toc - tic
359
- estimated_time_taken_state = estimated_time_taken_state + time_taken
360
 
361
  # If textract requests made, write to logging file
362
  if all_request_metadata:
@@ -392,10 +424,6 @@ def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
392
  rect_height = pymupdf_page.rect.height
393
  rect_width = pymupdf_page.rect.width
394
 
395
- # Calculate scaling factors
396
- #scale_height = rect_height / mediabox_height if mediabox_height else 1
397
- #scale_width = rect_width / mediabox_width if mediabox_width else 1
398
-
399
  # Adjust coordinates based on scaling factors
400
  page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
401
  page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
@@ -504,16 +532,13 @@ def move_page_info(file_path: str) -> str:
504
 
505
  return new_file_path
506
 
507
- def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, scale=(1,1)):
508
 
509
  mediabox_height = page.mediabox[3] - page.mediabox[1]
510
  mediabox_width = page.mediabox[2] - page.mediabox[0]
511
  rect_height = page.rect.height
512
  rect_width = page.rect.width
513
 
514
- #print("page_rect_height:", page.rect.height)
515
- #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
516
-
517
  out_annotation_boxes = {}
518
  all_image_annotation_boxes = []
519
  image_path = ""
@@ -525,16 +550,11 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
525
  image_path = image
526
  image = Image.open(image_path)
527
 
528
- #print("annotations_on_page:", annotations_on_page)
529
-
530
  # Check if this is an object used in the Gradio Annotation component
531
  if isinstance (annotations_on_page, dict):
532
  annotations_on_page = annotations_on_page["boxes"]
533
- #print("annotations on page:", annotations_on_page)
534
 
535
  for annot in annotations_on_page:
536
- #print("annot:", annot)
537
-
538
  # Check if an Image recogniser result, or a Gradio annotation object
539
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
540
 
@@ -600,7 +620,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
600
  rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
601
 
602
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
603
- #print("rect_single_pixel_height:", rect_single_pixel_height)
604
  page.add_redact_annot(rect_single_pixel_height)
605
 
606
  # Set up drawing a black box over the whole rect
@@ -614,14 +633,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
614
  "boxes": all_image_annotation_boxes
615
  }
616
 
617
- #print("out_annotation_boxes:", out_annotation_boxes)
618
-
619
  page.apply_redactions(images=0, graphics=0)
620
  page.clean_contents()
621
 
622
- #print("Everything is fine at end of redact_page_with_pymupdf")
623
- #print("\nout_annotation_boxes:", out_annotation_boxes)
624
-
625
  return page, out_annotation_boxes
626
 
627
  def bounding_boxes_overlap(box1, box2):
@@ -668,10 +682,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
668
  combined_text = " ".join(word['text'] for word in relevant_words)
669
 
670
  # Calculate new dimensions for the merged box
671
-
672
-
673
-
674
-
675
  reconstructed_bbox = CustomImageRecognizerResult(
676
  bbox.entity_type,
677
  bbox.start,
@@ -740,7 +750,29 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
740
 
741
  return merged_bboxes
742
 
743
- def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", current_loop_page:int=0, page_break_return:bool=False, images=[], annotations_all_pages:List=[], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), pymupdf_doc = [], page_break_val:int=int(page_break_value), logging_file_paths:List=[], max_time:int=int(max_time_value), progress=Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
 
745
  '''
746
  This function redacts sensitive information from a PDF document. It takes the following parameters:
@@ -749,6 +781,7 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
749
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
750
  - language (str): The language of the text in the PDF.
751
  - chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
 
752
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
753
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
754
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
@@ -756,13 +789,19 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
756
  - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
757
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
758
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
759
- - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
760
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
 
 
 
 
 
 
761
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
762
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
763
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
764
 
765
- The function returns a redacted PDF document.
766
  '''
767
  file_name = get_file_path_end(file_path)
768
  fill = (0, 0, 0) # Fill colour
@@ -901,20 +940,31 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
901
 
902
  # Step 2: Analyze text and identify PII
903
  if chosen_redact_entities:
 
 
 
904
  redaction_bboxes = image_analyser.analyze_text(
905
  line_level_ocr_results,
906
  line_level_ocr_results_with_children,
 
 
907
  language=language,
908
  entities=chosen_redact_entities,
909
  allow_list=allow_list,
910
- score_threshold=score_threshold,
911
- )
 
 
 
 
 
 
 
 
 
912
  else:
913
  redaction_bboxes = []
914
-
915
- #print("\nsignature_recogniser_boxes:", signature_recogniser_results)
916
- #print("\nhandwriting_recogniser_boxes:", handwriting_recogniser_results)
917
- #print("\nredaction_bboxes:", redaction_bboxes)
918
 
919
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
920
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
@@ -1049,53 +1099,6 @@ def get_text_container_characters(text_container:LTTextContainer):
1049
  return characters
1050
  return []
1051
 
1052
-
1053
- def initial_clean(text):
1054
- #### Some of my cleaning functions
1055
- html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
1056
- html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
1057
- non_ascii_pattern = r'[^\x00-\x7F]+'
1058
- multiple_spaces_regex = r'\s{2,}'
1059
-
1060
- # Define a list of patterns and their replacements
1061
- patterns = [
1062
- (html_pattern_regex, ' '),
1063
- (html_start_pattern_end_dots_regex, ' '),
1064
- (non_ascii_pattern, ' '),
1065
- (multiple_spaces_regex, ' ')
1066
- ]
1067
-
1068
- # Apply each regex replacement
1069
- for pattern, replacement in patterns:
1070
- text = re.sub(pattern, replacement, text)
1071
-
1072
- return text
1073
-
1074
-
1075
- def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
1076
- '''
1077
- Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
1078
- '''
1079
-
1080
- analyser_results = []
1081
-
1082
- #text_to_analyse = initial_clean(text_container.text).strip()
1083
-
1084
- text_to_analyse = initial_clean(text_container.text)
1085
-
1086
- if chosen_redact_entities:
1087
- #print("Running Presidio analyze method. text_to_analyse:", text_to_analyse)
1088
-
1089
- analyser_results = nlp_analyser.analyze(text=text_to_analyse,
1090
- language=language,
1091
- entities=chosen_redact_entities,
1092
- score_threshold=score_threshold,
1093
- return_decision_process=True,
1094
- allow_list=allow_list)
1095
-
1096
- return analyser_results
1097
-
1098
-
1099
  def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
1100
  '''
1101
  Create an OCRResult object based on a list of pdfminer LTChar objects.
@@ -1292,6 +1295,53 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1292
 
1293
  return analysed_bounding_boxes
1294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1296
  decision_process_table = pd.DataFrame()
1297
 
@@ -1335,6 +1385,7 @@ def redact_text_pdf(
1335
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
1336
  language: str, # Language of the PDF content
1337
  chosen_redact_entities: List[str], # List of entities to be redacted
 
1338
  allow_list: List[str] = None, # Optional list of allowed entities
1339
  page_min: int = 0, # Minimum page number to start redaction
1340
  page_max: int = 999, # Maximum page number to end redaction
@@ -1345,11 +1396,12 @@ def redact_text_pdf(
1345
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1346
  all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
1347
  pymupdf_doc: List = [], # List of PyMuPDF documents
 
1348
  page_break_val: int = int(page_break_value), # Value for page break
1349
- max_time: int = int(max_time_value),
1350
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
1351
  ):
1352
-
1353
  '''
1354
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
1355
 
@@ -1358,19 +1410,20 @@ def redact_text_pdf(
1358
  - prepared_pdf_image_path: Path to the prepared PDF image for redaction
1359
  - language: Language of the PDF content
1360
  - chosen_redact_entities: List of entities to be redacted
 
1361
  - allow_list: Optional list of allowed entities
1362
  - page_min: Minimum page number to start redaction
1363
  - page_max: Maximum page number to end redaction
1364
  - analysis_type: Type of analysis to perform
1365
  - current_loop_page: Current page being processed in the loop
1366
  - page_break_return: Flag to indicate if a page break should be returned
1367
- - images: List of images (not used in this function)
1368
  - annotations_all_pages: List of annotations across all pages
1369
  - all_line_level_ocr_results_df: DataFrame for OCR results
1370
  - all_decision_process_table: DataFrame for decision process table
1371
  - pymupdf_doc: List of PyMuPDF documents
 
1372
  - page_break_val: Value for page break
1373
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1374
  - progress: Progress tracking object
1375
  '''
1376
 
@@ -1393,7 +1446,6 @@ def redact_text_pdf(
1393
  if current_loop_page == 0: page_loop_start = 0
1394
  else: page_loop_start = current_loop_page
1395
 
1396
- #progress_bar = progress.tqdm(range(current_loop_page, number_of_pages), unit="pages", desc="Redacting pages")
1397
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1398
 
1399
  #for page_no in range(0, number_of_pages):
@@ -1414,14 +1466,8 @@ def redact_text_pdf(
1414
  image_annotations = {"image": image, "boxes": []}
1415
  pymupdf_page = pymupdf_doc.load_page(page_no)
1416
 
1417
- #print("pymupdf page loaded")
1418
-
1419
- #print("Page number is:", str(page_no + 1))
1420
-
1421
  if page_min <= page_no < page_max:
1422
 
1423
- #print("Page is in range of pages to redact")
1424
-
1425
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1426
 
1427
  page_analyser_results = []
@@ -1465,7 +1511,16 @@ def redact_text_pdf(
1465
  text_line_analyser_result = []
1466
  text_line_bounding_boxes = []
1467
 
1468
- text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
 
 
 
 
 
 
 
 
 
1469
 
1470
  # Merge bounding boxes for the line if multiple found close together
1471
  if text_line_analyser_result:
 
24
  from gradio import Progress
25
  from collections import defaultdict # For efficient grouping
26
 
27
+ from presidio_analyzer import RecognizerResult
28
+
29
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30
  from tools.file_conversion import process_file
31
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
32
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33
+ from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34
+ # from tools.data_anonymise import generate_decision_process_output
35
+ from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
36
+ from tools.aws_functions import comprehend_client
37
+ from tools.presidio_analyzer_custom import recognizer_result_from_dict
38
 
39
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
40
 
 
66
 
67
  return sum_of_numbers
68
 
 
69
  def choose_and_run_redactor(file_paths:List[str],
70
  prepared_pdf_file_paths:List[str],
71
  prepared_pdf_image_paths:List[str],
72
  language:str,
73
  chosen_redact_entities:List[str],
74
+ chosen_redact_comprehend_entities:List[str],
75
  in_redact_method:str,
76
  in_allow_list:List[List[str]]=None,
77
  latest_file_completed:int=0,
 
90
  pymupdf_doc=[],
91
  current_loop_page:int=0,
92
  page_break_return:bool=False,
93
+ pii_identification_method:str="Local",
94
  progress=gr.Progress(track_tqdm=True)):
95
  '''
96
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
99
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
100
  - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
101
  - language (str): The language of the text in the files.
102
+ - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
103
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
104
  - in_redact_method (str): The method to use for redaction.
105
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
106
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
 
119
  - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
120
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
121
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
122
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
123
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
124
 
125
  The function returns a redacted document along with processing logs.
 
128
  tic = time.perf_counter()
129
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
130
 
131
+
132
  # If this is the first time around, set variables to 0/blank
133
  if first_loop_state==True:
134
  print("First_loop_state is True")
135
  latest_file_completed = 0
136
  current_loop_page = 0
 
137
  out_file_paths = []
138
  estimate_total_processing_time = 0
139
  estimated_time_taken_state = 0
 
143
  current_loop_page = 0
144
 
145
 
 
 
 
 
146
  if not out_file_paths:
147
  out_file_paths = []
148
 
 
155
  else:
156
  number_of_files = len(file_paths)
157
 
 
 
 
 
 
158
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
159
  if latest_file_completed >= number_of_files:
160
 
 
240
 
241
  print("Redacting file " + file_path_without_ext + " as an image-based file")
242
 
243
+ pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path,
244
+ prepared_pdf_image_paths,
245
+ language,
246
+ chosen_redact_entities,
247
+ chosen_redact_comprehend_entities,
248
+ in_allow_list_flat,
249
+ is_a_pdf,
250
+ page_min,
251
+ page_max,
252
+ in_redact_method,
253
+ handwrite_signature_checkbox,
254
+ "",
255
+ current_loop_page,
256
+ page_break_return,
257
+ prepared_pdf_image_paths,
258
+ annotations_all_pages,
259
+ all_line_level_ocr_results_df,
260
+ all_decision_process_table,
261
+ pymupdf_doc,
262
+ pii_identification_method)
263
 
264
  # Save Textract request metadata (if exists)
265
  if new_request_metadata:
 
277
  # Analyse text-based pdf
278
  print('Redacting file as text-based PDF')
279
 
280
+ pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path,
281
+ prepared_pdf_image_paths,language,
282
+ chosen_redact_entities,
283
+ chosen_redact_comprehend_entities,
284
+ in_allow_list_flat,
285
+ page_min,
286
+ page_max,
287
+ "Simple text analysis - PDFs with selectable text",
288
+ current_loop_page,
289
+ page_break_return,
290
+ annotations_all_pages,
291
+ all_line_level_ocr_results_df,
292
+ all_decision_process_table,
293
+ pymupdf_doc,
294
+ pii_identification_method)
295
 
296
  else:
297
  out_message = "No redaction method selected"
 
318
  pymupdf_doc.save(out_image_file_path)
319
 
320
  out_file_paths.append(out_image_file_path)
321
+
322
  if logging_file_paths:
323
  log_files_output_paths.extend(logging_file_paths)
324
 
 
 
 
325
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
326
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
327
+ #log_files_output_paths.append(logs_output_file_name)
328
+ out_file_paths.append(logs_output_file_name)
329
 
330
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
331
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
332
+ #log_files_output_paths.append(all_text_output_file_name)
333
+ out_file_paths.append(all_text_output_file_name)
334
 
335
  # Make a combined message for the file
336
  if isinstance(out_message, list):
337
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
338
  else: combined_out_message = out_message
339
 
340
+ toc = time.perf_counter()
341
+ time_taken = toc - tic
342
+ estimated_time_taken_state = estimated_time_taken_state + time_taken
343
+
344
  out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
345
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
346
+
347
+ estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
348
+ print("Estimated total processing time:", str(estimate_total_processing_time))
349
+
350
+ #out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
351
+ #combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
352
 
353
  # Increase latest file completed count unless we are at the last file
354
  # if latest_file_completed != len(file_paths):
 
389
  combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
390
  else: combined_out_message = out_message
391
 
 
 
 
 
 
 
 
 
 
392
 
393
  # If textract requests made, write to logging file
394
  if all_request_metadata:
 
424
  rect_height = pymupdf_page.rect.height
425
  rect_width = pymupdf_page.rect.width
426
 
 
 
 
 
427
  # Adjust coordinates based on scaling factors
428
  page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
429
  page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
 
532
 
533
  return new_file_path
534
 
535
+ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
536
 
537
  mediabox_height = page.mediabox[3] - page.mediabox[1]
538
  mediabox_width = page.mediabox[2] - page.mediabox[0]
539
  rect_height = page.rect.height
540
  rect_width = page.rect.width
541
 
 
 
 
542
  out_annotation_boxes = {}
543
  all_image_annotation_boxes = []
544
  image_path = ""
 
550
  image_path = image
551
  image = Image.open(image_path)
552
 
 
 
553
  # Check if this is an object used in the Gradio Annotation component
554
  if isinstance (annotations_on_page, dict):
555
  annotations_on_page = annotations_on_page["boxes"]
 
556
 
557
  for annot in annotations_on_page:
 
 
558
  # Check if an Image recogniser result, or a Gradio annotation object
559
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
560
 
 
620
  rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
621
 
622
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
 
623
  page.add_redact_annot(rect_single_pixel_height)
624
 
625
  # Set up drawing a black box over the whole rect
 
633
  "boxes": all_image_annotation_boxes
634
  }
635
 
 
 
636
  page.apply_redactions(images=0, graphics=0)
637
  page.clean_contents()
638
 
 
 
 
639
  return page, out_annotation_boxes
640
 
641
  def bounding_boxes_overlap(box1, box2):
 
682
  combined_text = " ".join(word['text'] for word in relevant_words)
683
 
684
  # Calculate new dimensions for the merged box
 
 
 
 
685
  reconstructed_bbox = CustomImageRecognizerResult(
686
  bbox.entity_type,
687
  bbox.start,
 
750
 
751
  return merged_bboxes
752
 
753
+ def redact_image_pdf(file_path:str,
754
+ prepared_pdf_file_paths:List[str],
755
+ language:str,
756
+ chosen_redact_entities:List[str],
757
+ chosen_redact_comprehend_entities:List[str],
758
+ allow_list:List[str]=None,
759
+ is_a_pdf:bool=True,
760
+ page_min:int=0,
761
+ page_max:int=999,
762
+ analysis_type:str="Quick image analysis - typed text",
763
+ handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
764
+ request_metadata:str="", current_loop_page:int=0,
765
+ page_break_return:bool=False,
766
+ images=[],
767
+ annotations_all_pages:List=[],
768
+ all_line_level_ocr_results_df = pd.DataFrame(),
769
+ all_decision_process_table = pd.DataFrame(),
770
+ pymupdf_doc = [],
771
+ pii_identification_method:str="Local",
772
+ page_break_val:int=int(page_break_value),
773
+ logging_file_paths:List=[],
774
+ max_time:int=int(max_time_value),
775
+ progress=Progress(track_tqdm=True)):
776
 
777
  '''
778
  This function redacts sensitive information from a PDF document. It takes the following parameters:
 
781
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
782
  - language (str): The language of the text in the PDF.
783
  - chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
784
+ - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from the list allowed by the AWS Comprehend service.
785
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
786
  - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
787
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
 
789
  - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
790
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
791
  - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
 
792
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
793
+ - images (list, optional): List of image objects for each PDF page.
794
+ - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
795
+ - all_line_level_ocr_results_df (pd.DataFrame(), optional): All line level OCR results for the document as a Pandas dataframe,
796
+ - all_decision_process_table (pd.DataFrame(), optional): All redaction decisions for document as a Pandas dataframe.
797
+ - pymupdf_doc (List, optional): The document as a PyMupdf object.
798
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
799
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
800
+ - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
801
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
802
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
803
 
804
+ The function returns a fully or partially-redacted PDF document.
805
  '''
806
  file_name = get_file_path_end(file_path)
807
  fill = (0, 0, 0) # Fill colour
 
940
 
941
  # Step 2: Analyze text and identify PII
942
  if chosen_redact_entities:
943
+
944
+ pii_identification_method= "AWS Comprehend" #"Local"
945
+
946
  redaction_bboxes = image_analyser.analyze_text(
947
  line_level_ocr_results,
948
  line_level_ocr_results_with_children,
949
+ chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
950
+ pii_identification_method = pii_identification_method,
951
  language=language,
952
  entities=chosen_redact_entities,
953
  allow_list=allow_list,
954
+ score_threshold=score_threshold
955
+ )
956
+
957
+ # redaction_bboxes = choose_redaction_method_and_analyse_pii(line_level_ocr_results,
958
+ # line_level_ocr_results_with_children,
959
+ # language,
960
+ # chosen_redact_entities,
961
+ # allow_list,
962
+ # score_threshold,
963
+ # pii_identification_method)
964
+
965
  else:
966
  redaction_bboxes = []
967
+
 
 
 
968
 
969
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
970
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 
1099
  return characters
1100
  return []
1101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
  def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
1103
  '''
1104
  Create an OCRResult object based on a list of pdfminer LTChar objects.
 
1295
 
1296
  return analysed_bounding_boxes
1297
 
1298
+ def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
1299
+ '''
1300
+ Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
1301
+ '''
1302
+
1303
+ analyser_results = []
1304
+
1305
+ #text_to_analyse = initial_clean(text_container.text).strip()
1306
+
1307
+ text_to_analyse = text_container.text
1308
+
1309
+ if chosen_redact_entities:
1310
+ if pii_identification_method == "Local":
1311
+ analyser_results = nlp_analyser.analyze(text=text_to_analyse,
1312
+ language=language,
1313
+ entities=chosen_redact_entities,
1314
+ score_threshold=score_threshold,
1315
+ return_decision_process=True,
1316
+ allow_list=allow_list)
1317
+
1318
+ elif pii_identification_method == "AWS Comprehend":
1319
+
1320
+ # Call the detect_pii_entities method
1321
+ response = comprehend_client.detect_pii_entities(
1322
+ Text=text_to_analyse,
1323
+ LanguageCode=language # Specify the language of the text
1324
+ )
1325
+
1326
+ for result in response["Entities"]:
1327
+
1328
+ result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
1329
+
1330
+ if result_text not in allow_list:
1331
+ if result.get("Type") in chosen_redact_comprehend_entities:
1332
+
1333
+ recogniser_entity = recognizer_result_from_dict(result)
1334
+
1335
+ analyser_results.append(recogniser_entity)
1336
+
1337
+ else:
1338
+ analyser_results = []
1339
+ else:
1340
+ analyser_results = []
1341
+
1342
+
1343
+ return analyser_results
1344
+
1345
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1346
  decision_process_table = pd.DataFrame()
1347
 
 
1385
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
1386
  language: str, # Language of the PDF content
1387
  chosen_redact_entities: List[str], # List of entities to be redacted
1388
+ chosen_redact_comprehend_entities: List[str],
1389
  allow_list: List[str] = None, # Optional list of allowed entities
1390
  page_min: int = 0, # Minimum page number to start redaction
1391
  page_max: int = 999, # Maximum page number to end redaction
 
1396
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1397
  all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
1398
  pymupdf_doc: List = [], # List of PyMuPDF documents
1399
+ pii_identification_method: str = "Local",
1400
  page_break_val: int = int(page_break_value), # Value for page break
1401
+ max_time: int = int(max_time_value),
1402
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
1403
  ):
1404
+
1405
  '''
1406
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
1407
 
 
1410
  - prepared_pdf_image_path: Path to the prepared PDF image for redaction
1411
  - language: Language of the PDF content
1412
  - chosen_redact_entities: List of entities to be redacted
1413
+ - chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
1414
  - allow_list: Optional list of allowed entities
1415
  - page_min: Minimum page number to start redaction
1416
  - page_max: Maximum page number to end redaction
1417
  - analysis_type: Type of analysis to perform
1418
  - current_loop_page: Current page being processed in the loop
1419
  - page_break_return: Flag to indicate if a page break should be returned
 
1420
  - annotations_all_pages: List of annotations across all pages
1421
  - all_line_level_ocr_results_df: DataFrame for OCR results
1422
  - all_decision_process_table: DataFrame for decision process table
1423
  - pymupdf_doc: List of PyMuPDF documents
1424
+ - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1425
  - page_break_val: Value for page break
1426
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1427
  - progress: Progress tracking object
1428
  '''
1429
 
 
1446
  if current_loop_page == 0: page_loop_start = 0
1447
  else: page_loop_start = current_loop_page
1448
 
 
1449
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1450
 
1451
  #for page_no in range(0, number_of_pages):
 
1466
  image_annotations = {"image": image, "boxes": []}
1467
  pymupdf_page = pymupdf_doc.load_page(page_no)
1468
 
 
 
 
 
1469
  if page_min <= page_no < page_max:
1470
 
 
 
1471
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1472
 
1473
  page_analyser_results = []
 
1511
  text_line_analyser_result = []
1512
  text_line_bounding_boxes = []
1513
 
1514
+ # text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1515
+
1516
+ #pii_identification_method="AWS Comprehend"#"Local"
1517
+
1518
+ if chosen_redact_entities:
1519
+
1520
+ text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
1521
+
1522
+ else:
1523
+ text_line_analyser_result = []
1524
 
1525
  # Merge bounding boxes for the line if multiple found close together
1526
  if text_line_analyser_result:
tools/presidio_analyzer_custom.py CHANGED
@@ -5,7 +5,28 @@ from tqdm import tqdm
5
  from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def analyze_iterator_custom(
11
  self,
 
5
  from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
+ def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
9
+ """
10
+ Create RecognizerResult from a dictionary.
11
+
12
+ :param data: e.g. {
13
+ "entity_type": "NAME",
14
+ "start": 24,
15
+ "end": 32,
16
+ "score": 0.8,
17
+ "recognition_metadata": None
18
+ }
19
+ :return: RecognizerResult
20
+ """
21
+
22
+ entity_type = data.get("Type")
23
+ start = data.get("BeginOffset")
24
+ end = data.get("EndOffset")
25
+ score = data.get("Score")
26
+ analysis_explanation = None
27
+ recognition_metadata = None
28
+
29
+ return RecognizerResult(entity_type, start, end, score, analysis_explanation, recognition_metadata)
30
 
31
  def analyze_iterator_custom(
32
  self,