seanpedrickcase committed on
Commit
42180e4
·
1 Parent(s): dea568f

Fixed issues with log file list picking up logs from other file runs. Updated packages.

Browse files
Files changed (4) hide show
  1. app.py +26 -25
  2. requirements.txt +13 -13
  3. tools/file_redaction.py +21 -104
  4. tools/redaction_review.py +2 -2
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import socket
3
 
4
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
5
- os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
6
 
7
  import gradio as gr
8
  import pandas as pd
@@ -65,7 +65,8 @@ with app:
65
  ###
66
  # STATE VARIABLES
67
  ###
68
-
 
69
  pdf_doc_state = gr.State([])
70
  all_image_annotations_state = gr.State([])
71
 
@@ -73,12 +74,12 @@ with app:
73
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
75
 
76
- session_hash_state = gr.State()
77
- s3_output_folder_state = gr.State()
78
 
79
- first_loop_state = gr.State(True)
80
- second_loop_state = gr.State(False)
81
- do_not_save_pdf_state = gr.State(False)
82
 
83
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
84
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
@@ -86,18 +87,18 @@ with app:
86
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
87
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
88
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
89
- log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
90
 
91
 
92
  # Logging state
93
  log_file_name = 'log.csv'
94
 
95
- feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
96
- feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
97
- access_logs_state = gr.State(access_logs_folder + log_file_name)
98
- access_s3_logs_loc_state = gr.State(access_logs_folder)
99
- usage_logs_state = gr.State(usage_logs_folder + log_file_name)
100
- usage_s3_logs_loc_state = gr.State(usage_logs_folder)
101
 
102
  # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
103
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -121,11 +122,11 @@ with app:
121
 
122
  ## Annotator zoom value
123
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
124
- zoom_true_bool = gr.State(True)
125
- zoom_false_bool = gr.State(False)
126
 
127
- clear_all_page_redactions = gr.State(True)
128
- prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
129
 
130
  ## Settings page variables
131
  default_allow_list_file_name = "default_allow_list.csv"
@@ -148,11 +149,11 @@ with app:
148
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
149
 
150
  # Base dataframe for recognisers that is not modified subsequent to load
151
- recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
152
 
153
  # Duplicate page detection
154
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
155
- duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
156
 
157
 
158
 
@@ -178,8 +179,8 @@ with app:
178
  with gr.Accordion("Redact document", open = True):
179
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
180
  if RUN_AWS_FUNCTIONS == "1":
181
- in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
182
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
183
  else:
184
  in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
185
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
@@ -336,7 +337,7 @@ with app:
336
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
337
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
338
 
339
- with gr.Accordion("AWS Textract specific options", open = False):
340
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
341
  #with gr.Row():
342
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
@@ -542,8 +543,8 @@ print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
542
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
543
  print(f'The value of ROOT_PATH is {ROOT_PATH}')
544
 
545
- DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
546
- print(f'The value of ROOT_PATH is {DEFAULT_CONCURRENCY_LIMIT}')
547
 
548
  if __name__ == "__main__":
549
 
 
2
  import socket
3
 
4
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
5
+ #os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
6
 
7
  import gradio as gr
8
  import pandas as pd
 
65
  ###
66
  # STATE VARIABLES
67
  ###
68
+
69
+ # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
70
  pdf_doc_state = gr.State([])
71
  all_image_annotations_state = gr.State([])
72
 
 
74
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
75
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
76
 
77
+ session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
78
+ s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
79
 
80
+ first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
81
+ second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
82
+ do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
83
 
84
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
85
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
 
87
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
88
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
89
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
90
+ log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=True) #gr.State([])
91
 
92
 
93
  # Logging state
94
  log_file_name = 'log.csv'
95
 
96
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
97
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
98
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
99
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
100
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
101
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
102
 
103
  # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
104
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
 
122
 
123
  ## Annotator zoom value
124
  annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
+ zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
126
+ zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
127
 
128
+ clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
129
+ prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
130
 
131
  ## Settings page variables
132
  default_allow_list_file_name = "default_allow_list.csv"
 
149
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
150
 
151
  # Base dataframe for recognisers that is not modified subsequent to load
152
+ recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
153
 
154
  # Duplicate page detection
155
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
156
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
157
 
158
 
159
 
 
179
  with gr.Accordion("Redact document", open = True):
180
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
181
  if RUN_AWS_FUNCTIONS == "1":
182
+ in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
183
+ pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
184
  else:
185
  in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
186
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
 
337
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
338
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
339
 
340
+ with gr.Accordion("AWS Textract options", open = False):
341
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
342
  #with gr.Row():
343
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
543
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
544
  print(f'The value of ROOT_PATH is {ROOT_PATH}')
545
 
546
+ DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
547
+ print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
548
 
549
  if __name__ == "__main__":
550
 
requirements.txt CHANGED
@@ -1,22 +1,22 @@
1
- pdfminer.six==20231228
2
  pdf2image==1.17.0
3
- pymupdf==1.24.10
4
  opencv-python==4.10.0.84
5
- presidio_analyzer==2.2.355
6
- presidio_anonymizer==2.2.355
7
- presidio-image-redactor==0.0.53
8
- pikepdf==8.15.1
9
  pandas==2.2.3
10
  nltk==3.9.1
11
- scikit-learn==1.5.2
12
  spacy==3.8.3
13
  #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
14
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.16.0
16
- boto3==1.36.15
17
- pyarrow==18.1.0
18
- openpyxl==3.1.2
19
- Faker==22.2.0
20
  python-levenshtein==0.26.1
21
  spaczz==0.6.1
22
  #gradio_image_annotation==0.2.5
@@ -24,7 +24,7 @@ spaczz==0.6.1
24
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
25
  rapidfuzz==3.12.1
26
  numpy==1.26.4
27
- awslambdaric==3.0.0
28
 
29
 
30
 
 
1
+ pdfminer.six==20240706
2
  pdf2image==1.17.0
3
+ pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
+ presidio_analyzer==2.2.357
6
+ presidio_anonymizer==2.2.357
7
+ presidio-image-redactor==0.0.55
8
+ pikepdf==9.5.2
9
  pandas==2.2.3
10
  nltk==3.9.1
11
+ scikit-learn==1.6.1
12
  spacy==3.8.3
13
  #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
14
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
+ gradio==5.18.0
16
+ boto3==1.36.26
17
+ pyarrow==19.0.1
18
+ openpyxl==3.1.5
19
+ Faker==36.1.1
20
  python-levenshtein==0.26.1
21
  spaczz==0.6.1
22
  #gradio_image_annotation==0.2.5
 
24
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
25
  rapidfuzz==3.12.1
26
  numpy==1.26.4
27
+ awslambdaric==3.0.1
28
 
29
 
30
 
tools/file_redaction.py CHANGED
@@ -78,9 +78,9 @@ def choose_and_run_redactor(file_paths:List[str],
78
  custom_recogniser_word_list:List[str]=None,
79
  redact_whole_page_list:List[str]=None,
80
  latest_file_completed:int=0,
81
- out_message:list=[],
82
- out_file_paths:list=[],
83
- log_files_output_paths:list=[],
84
  first_loop_state:bool=False,
85
  page_min:int=0,
86
  page_max:int=999,
@@ -301,9 +301,6 @@ def choose_and_run_redactor(file_paths:List[str],
301
  file_paths_list = file_paths
302
  file_paths_loop = [file_paths_list[int(latest_file_completed)]]
303
 
304
- # print("file_paths_list in choose_redactor function:", file_paths_list)
305
-
306
-
307
  for file in file_paths_loop:
308
  if isinstance(file, str):
309
  file_path = file
@@ -313,7 +310,6 @@ def choose_and_run_redactor(file_paths:List[str],
313
  if file_path:
314
  pdf_file_name_without_ext = get_file_name_without_type(file_path)
315
  pdf_file_name_with_ext = os.path.basename(file_path)
316
- # print("Redacting file:", pdf_file_name_with_ext)
317
 
318
  is_a_pdf = is_pdf(file_path) == True
319
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -361,14 +357,11 @@ def choose_and_run_redactor(file_paths:List[str],
361
  custom_recogniser_word_list,
362
  redact_whole_page_list,
363
  max_fuzzy_spelling_mistakes_num,
364
- match_fuzzy_whole_phrase_bool)
365
-
366
-
367
- #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
368
-
369
  # Save Textract request metadata (if exists)
370
  if new_request_metadata:
371
- #print("Request metadata:", new_request_metadata)
372
  all_request_metadata.append(new_request_metadata)
373
 
374
  elif in_redact_method == text_ocr_option:
@@ -422,9 +415,6 @@ def choose_and_run_redactor(file_paths:List[str],
422
  # Save file
423
  if is_pdf(file_path) == False:
424
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
425
- #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
426
- #print("pymupdf_doc", pymupdf_doc)
427
- #print("pymupdf_doc[0]", pymupdf_doc[0])
428
  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
429
  out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
430
 
@@ -434,10 +424,6 @@ def choose_and_run_redactor(file_paths:List[str],
434
 
435
  out_file_paths.append(out_redacted_pdf_file_path)
436
 
437
- #if log_files_output_paths:
438
- # log_files_output_paths.extend(log_files_output_paths)
439
-
440
-
441
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
442
 
443
  logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
@@ -450,27 +436,20 @@ def choose_and_run_redactor(file_paths:List[str],
450
 
451
  # Save the gradio_annotation_boxes to a JSON file
452
  try:
453
-
454
- #print("Saving annotations to CSV")
455
-
456
- # Convert json to csv and also save this
457
- #print("annotations_all_pages:", annotations_all_pages)
458
- #print("all_decision_process_table:", all_decision_process_table)
459
-
460
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
461
 
462
  out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
463
  review_df.to_csv(out_review_file_path, index=None)
464
  out_file_paths.append(out_review_file_path)
465
 
466
- print("Saved review file to csv")
467
 
468
  out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
469
  with open(out_annotation_file_path, 'w') as f:
470
  json.dump(annotations_all_pages, f)
471
  log_files_output_paths.append(out_annotation_file_path)
472
 
473
- print("Saving annotations to JSON")
474
 
475
  except Exception as e:
476
  print("Could not save annotations to json or csv file:", e)
@@ -488,7 +467,6 @@ def choose_and_run_redactor(file_paths:List[str],
488
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
489
 
490
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
491
- #print("Estimated total processing time:", str(estimate_total_processing_time))
492
 
493
  else:
494
  toc = time.perf_counter()
@@ -511,18 +489,11 @@ def choose_and_run_redactor(file_paths:List[str],
511
 
512
  if combined_out_message: out_message = combined_out_message
513
 
514
- #print("\nout_message at choose_and_run_redactor end is:", out_message)
515
-
516
  # Ensure no duplicated output files
517
  log_files_output_paths = list(set(log_files_output_paths))
518
  out_file_paths = list(set(out_file_paths))
519
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
520
 
521
- #print("log_files_output_paths:", log_files_output_paths)
522
- #print("out_file_paths:", out_file_paths)
523
- #print("review_out_file_paths:", review_out_file_paths)
524
-
525
-
526
  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
527
 
528
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
@@ -646,9 +617,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="imag
646
  # Unpack coordinates
647
  x1, y1, x2, y2 = rect_coordinates
648
 
649
- #print("scale_width:", scale_width)
650
- #print("scale_height:", scale_height)
651
-
652
  x1 = (x1* scale_width)# + page_x_adjust
653
  new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
654
  x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
@@ -1005,12 +973,10 @@ def redact_image_pdf(file_path:str,
1005
  if custom_recogniser_word_list:
1006
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1007
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1008
- #print("new_custom_recogniser:", new_custom_recogniser)
1009
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1010
 
1011
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1012
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1013
- #print("new_custom_recogniser:", new_custom_recogniser)
1014
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1015
 
1016
 
@@ -1045,22 +1011,15 @@ def redact_image_pdf(file_path:str,
1045
  else: page_min = page_min - 1
1046
 
1047
  print("Page range:", str(page_min + 1), "to", str(page_max))
1048
- #print("Current_loop_page:", current_loop_page)
1049
 
1050
  # If running Textract, check if file already exists. If it does, load in existing data
1051
- # Import results from json and convert
1052
  if analysis_type == textract_option:
1053
 
1054
  json_file_path = output_folder + file_name + "_textract.json"
1055
 
1056
-
1057
  if not os.path.exists(json_file_path):
1058
  print("No existing Textract results file found.")
1059
  textract_data = {}
1060
- #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1061
- #log_files_output_paths.append(json_file_path)
1062
- #request_metadata = request_metadata + "\n" + new_request_metadata
1063
- #wrapped_text_blocks = {"pages":[text_blocks]}
1064
  else:
1065
  # Open the file and load the JSON data
1066
  no_textract_file = False
@@ -1073,7 +1032,6 @@ def redact_image_pdf(file_path:str,
1073
  textract_data = json.load(json_file)
1074
 
1075
  ###
1076
-
1077
  if current_loop_page == 0: page_loop_start = 0
1078
  else: page_loop_start = current_loop_page
1079
 
@@ -1087,7 +1045,6 @@ def redact_image_pdf(file_path:str,
1087
  page_break_return = False
1088
 
1089
  reported_page_number = str(page_no + 1)
1090
- #print("Redacting page:", reported_page_number)
1091
 
1092
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1093
  try:
@@ -1104,7 +1061,6 @@ def redact_image_pdf(file_path:str,
1104
 
1105
  #print("Image is in range of pages to redact")
1106
  if isinstance(image, str):
1107
- #print("image is a file path", image)
1108
  image = Image.open(image)
1109
 
1110
  # Need image size to convert textract OCR outputs to the correct sizes
@@ -1192,13 +1148,13 @@ def redact_image_pdf(file_path:str,
1192
  redaction_bboxes = []
1193
 
1194
 
1195
- if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1196
- elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1197
 
1198
- # Save decision making process
1199
- bboxes_str = str(redaction_bboxes)
1200
- with open(interim_results_file_path, "w") as f:
1201
- f.write(bboxes_str)
1202
 
1203
  # Merge close bounding boxes
1204
  merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
@@ -1210,7 +1166,6 @@ def redact_image_pdf(file_path:str,
1210
  all_image_annotations_boxes = []
1211
 
1212
  for box in merged_redaction_bboxes:
1213
- #print("box:", box)
1214
 
1215
  x0 = box.left
1216
  y0 = box.top
@@ -1238,8 +1193,6 @@ def redact_image_pdf(file_path:str,
1238
 
1239
  ## Apply annotations with pymupdf
1240
  else:
1241
- #print("merged_redaction_boxes:", merged_redaction_bboxes)
1242
- #print("redact_whole_page_list:", redact_whole_page_list)
1243
  if redact_whole_page_list:
1244
  int_reported_page_number = int(reported_page_number)
1245
  if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
@@ -1284,8 +1237,6 @@ def redact_image_pdf(file_path:str,
1284
 
1285
  time_taken = toc - tic
1286
 
1287
- #print("toc - tic:", time_taken)
1288
-
1289
  # Break if time taken is greater than max_time seconds
1290
  if time_taken > max_time:
1291
  print("Processing for", max_time, "seconds, breaking loop.")
@@ -1298,7 +1249,6 @@ def redact_image_pdf(file_path:str,
1298
  pymupdf_doc = images
1299
 
1300
  # Check if the image already exists in annotations_all_pages
1301
- #print("annotations_all_pages:", annotations_all_pages)
1302
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1303
  if existing_index is not None:
1304
  # Replace the existing annotation
@@ -1315,6 +1265,8 @@ def redact_image_pdf(file_path:str,
1315
  if json_file_path not in log_files_output_paths:
1316
  log_files_output_paths.append(json_file_path)
1317
 
 
 
1318
  current_loop_page += 1
1319
 
1320
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
@@ -1324,7 +1276,6 @@ def redact_image_pdf(file_path:str,
1324
  pymupdf_doc = images
1325
 
1326
  # Check if the image already exists in annotations_all_pages
1327
- #print("annotations_all_pages:", annotations_all_pages)
1328
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1329
  if existing_index is not None:
1330
  # Replace the existing annotation
@@ -1409,9 +1360,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1409
 
1410
  if isinstance(char, LTAnno):
1411
 
1412
- # print("Character line:", "".join(character_text_objects_out))
1413
- # print("Char is an annotation object:", char)
1414
-
1415
  added_text = char.get_text()
1416
 
1417
  # Handle double quotes
@@ -1427,7 +1375,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1427
 
1428
  # Check for line break (assuming a new line is indicated by a specific character)
1429
  if '\n' in added_text:
1430
- #print("char_anno:", char)
1431
  # Finalize the current line
1432
  if current_word:
1433
  word_bboxes.append((current_word, current_word_bbox))
@@ -1475,13 +1423,12 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1475
  word_bboxes.append((current_word, current_word_bbox))
1476
 
1477
  if full_text:
1478
- #print("full_text before:", full_text)
1479
  if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
1480
  # Convert special characters to a human-readable format
1481
- #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1482
  full_text = clean_unicode_text(full_text)
1483
  full_text = full_text.strip()
1484
- #print("full_text:", full_text)
1485
 
1486
  line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1487
 
@@ -1498,9 +1445,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1498
  analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
1499
 
1500
  # Remove brackets and split the string into four separate columns
1501
- #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
1502
- # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
1503
-
1504
  # Split the boundingBox list into four separate columns
1505
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1506
 
@@ -1512,8 +1456,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1512
  analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
1513
  analysed_bounding_boxes_df_new['page'] = page_num + 1
1514
  decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
1515
-
1516
- #print('\n\ndecision_process_table:\n\n', decision_process_table)
1517
 
1518
  return decision_process_table
1519
 
@@ -1607,7 +1549,6 @@ def redact_text_pdf(
1607
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1608
 
1609
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1610
- #print("custom_recogniser_word_list:", custom_recogniser_word_list)
1611
  if custom_recogniser_word_list:
1612
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1613
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
@@ -1617,16 +1558,6 @@ def redact_text_pdf(
1617
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1618
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1619
 
1620
- # List all elements currently in the nlp_analyser registry
1621
- #print("Current recognizers in nlp_analyser registry:")
1622
- #for recognizer_name in nlp_analyser.registry.recognizers:
1623
- #print(recognizer_name)
1624
- #print(recognizer_name.name)
1625
-
1626
- #print("Custom recogniser:", nlp_analyser.registry)
1627
-
1628
- #print("custom_recogniser_word_list:", custom_recogniser_word_list)
1629
-
1630
  tic = time.perf_counter()
1631
 
1632
  # Open with Pikepdf to get text lines
@@ -1641,7 +1572,6 @@ def redact_text_pdf(
1641
  else: page_min = page_min - 1
1642
 
1643
  print("Page range is",str(page_min + 1), "to", str(page_max))
1644
- print("Current_loop_page:", current_loop_page)
1645
 
1646
  if current_loop_page == 0: page_loop_start = 0
1647
  else: page_loop_start = current_loop_page
@@ -1716,8 +1646,6 @@ def redact_text_pdf(
1716
  ### REDACTION
1717
 
1718
  if chosen_redact_entities or chosen_redact_comprehend_entities:
1719
- #print("Identifying redactions on page.")
1720
-
1721
  page_analysed_bounding_boxes = run_page_text_redaction(
1722
  language,
1723
  chosen_redact_entities,
@@ -1735,24 +1663,18 @@ def redact_text_pdf(
1735
  comprehend_query_number
1736
  )
1737
 
1738
- #print("page_analyser_results:", page_analyser_results)
1739
- #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1740
- #print("image:", image)
1741
  else:
1742
  page_analysed_bounding_boxes = []
1743
 
1744
 
1745
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1746
 
1747
- #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
1748
 
1749
  # Annotate redactions on page
1750
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1751
 
1752
- # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1753
-
1754
  # Make pymupdf page redactions
1755
- #print("redact_whole_page_list:", redact_whole_page_list)
1756
  if redact_whole_page_list:
1757
  int_reported_page_number = int(reported_page_number)
1758
  if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
@@ -1761,9 +1683,6 @@ def redact_text_pdf(
1761
 
1762
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
1763
 
1764
- #print("image_annotations:", image_annotations)
1765
-
1766
- #print("Did redact_page_with_pymupdf function")
1767
  reported_page_no = page_no + 1
1768
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1769
 
@@ -1778,14 +1697,12 @@ def redact_text_pdf(
1778
 
1779
  if not decision_process_table_on_page.empty:
1780
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1781
- #print("all_decision_process_table:", all_decision_process_table)
1782
 
1783
  toc = time.perf_counter()
1784
 
1785
  time_taken = toc - tic
1786
 
1787
- #print("toc - tic:", time_taken)
1788
-
1789
  # Break if time taken is greater than max_time seconds
1790
  if time_taken > max_time:
1791
  print("Processing for", max_time, "seconds, breaking.")
 
78
  custom_recogniser_word_list:List[str]=None,
79
  redact_whole_page_list:List[str]=None,
80
  latest_file_completed:int=0,
81
+ out_message:List=[],
82
+ out_file_paths:List=[],
83
+ log_files_output_paths:List=[],
84
  first_loop_state:bool=False,
85
  page_min:int=0,
86
  page_max:int=999,
 
301
  file_paths_list = file_paths
302
  file_paths_loop = [file_paths_list[int(latest_file_completed)]]
303
 
 
 
 
304
  for file in file_paths_loop:
305
  if isinstance(file, str):
306
  file_path = file
 
310
  if file_path:
311
  pdf_file_name_without_ext = get_file_name_without_type(file_path)
312
  pdf_file_name_with_ext = os.path.basename(file_path)
 
313
 
314
  is_a_pdf = is_pdf(file_path) == True
315
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
357
  custom_recogniser_word_list,
358
  redact_whole_page_list,
359
  max_fuzzy_spelling_mistakes_num,
360
+ match_fuzzy_whole_phrase_bool,
361
+ log_files_output_paths=log_files_output_paths)
362
+
 
 
363
  # Save Textract request metadata (if exists)
364
  if new_request_metadata:
 
365
  all_request_metadata.append(new_request_metadata)
366
 
367
  elif in_redact_method == text_ocr_option:
 
415
  # Save file
416
  if is_pdf(file_path) == False:
417
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
 
 
 
418
  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
419
  out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
420
 
 
424
 
425
  out_file_paths.append(out_redacted_pdf_file_path)
426
 
 
 
 
 
427
  out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
428
 
429
  logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
 
436
 
437
  # Save the gradio_annotation_boxes to a JSON file
438
  try:
 
 
 
 
 
 
 
439
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
440
 
441
  out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
442
  review_df.to_csv(out_review_file_path, index=None)
443
  out_file_paths.append(out_review_file_path)
444
 
445
+ #print("Saved review file to csv")
446
 
447
  out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
448
  with open(out_annotation_file_path, 'w') as f:
449
  json.dump(annotations_all_pages, f)
450
  log_files_output_paths.append(out_annotation_file_path)
451
 
452
+ #print("Saving annotations to JSON")
453
 
454
  except Exception as e:
455
  print("Could not save annotations to json or csv file:", e)
 
467
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
468
 
469
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
 
470
 
471
  else:
472
  toc = time.perf_counter()
 
489
 
490
  if combined_out_message: out_message = combined_out_message
491
 
 
 
492
  # Ensure no duplicated output files
493
  log_files_output_paths = list(set(log_files_output_paths))
494
  out_file_paths = list(set(out_file_paths))
495
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
496
 
 
 
 
 
 
497
  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
498
 
499
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
 
617
  # Unpack coordinates
618
  x1, y1, x2, y2 = rect_coordinates
619
 
 
 
 
620
  x1 = (x1* scale_width)# + page_x_adjust
621
  new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
622
  x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
 
973
  if custom_recogniser_word_list:
974
  nlp_analyser.registry.remove_recognizer("CUSTOM")
975
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
976
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
977
 
978
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
979
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 
980
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
981
 
982
 
 
1011
  else: page_min = page_min - 1
1012
 
1013
  print("Page range:", str(page_min + 1), "to", str(page_max))
 
1014
 
1015
  # If running Textract, check if file already exists. If it does, load in existing data
 
1016
  if analysis_type == textract_option:
1017
 
1018
  json_file_path = output_folder + file_name + "_textract.json"
1019
 
 
1020
  if not os.path.exists(json_file_path):
1021
  print("No existing Textract results file found.")
1022
  textract_data = {}
 
 
 
 
1023
  else:
1024
  # Open the file and load the JSON data
1025
  no_textract_file = False
 
1032
  textract_data = json.load(json_file)
1033
 
1034
  ###
 
1035
  if current_loop_page == 0: page_loop_start = 0
1036
  else: page_loop_start = current_loop_page
1037
 
 
1045
  page_break_return = False
1046
 
1047
  reported_page_number = str(page_no + 1)
 
1048
 
1049
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1050
  try:
 
1061
 
1062
  #print("Image is in range of pages to redact")
1063
  if isinstance(image, str):
 
1064
  image = Image.open(image)
1065
 
1066
  # Need image size to convert textract OCR outputs to the correct sizes
 
1148
  redaction_bboxes = []
1149
 
1150
 
1151
+ # if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1152
+ # elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1153
 
1154
+ # # Save decision making process
1155
+ # bboxes_str = str(redaction_bboxes)
1156
+ # with open(interim_results_file_path, "w") as f:
1157
+ # f.write(bboxes_str)
1158
 
1159
  # Merge close bounding boxes
1160
  merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
 
1166
  all_image_annotations_boxes = []
1167
 
1168
  for box in merged_redaction_bboxes:
 
1169
 
1170
  x0 = box.left
1171
  y0 = box.top
 
1193
 
1194
  ## Apply annotations with pymupdf
1195
  else:
 
 
1196
  if redact_whole_page_list:
1197
  int_reported_page_number = int(reported_page_number)
1198
  if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
 
1237
 
1238
  time_taken = toc - tic
1239
 
 
 
1240
  # Break if time taken is greater than max_time seconds
1241
  if time_taken > max_time:
1242
  print("Processing for", max_time, "seconds, breaking loop.")
 
1249
  pymupdf_doc = images
1250
 
1251
  # Check if the image already exists in annotations_all_pages
 
1252
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1253
  if existing_index is not None:
1254
  # Replace the existing annotation
 
1265
  if json_file_path not in log_files_output_paths:
1266
  log_files_output_paths.append(json_file_path)
1267
 
1268
+ print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
1269
+
1270
  current_loop_page += 1
1271
 
1272
  return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
1276
  pymupdf_doc = images
1277
 
1278
  # Check if the image already exists in annotations_all_pages
 
1279
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1280
  if existing_index is not None:
1281
  # Replace the existing annotation
 
1360
 
1361
  if isinstance(char, LTAnno):
1362
 
 
 
 
1363
  added_text = char.get_text()
1364
 
1365
  # Handle double quotes
 
1375
 
1376
  # Check for line break (assuming a new line is indicated by a specific character)
1377
  if '\n' in added_text:
1378
+
1379
  # Finalize the current line
1380
  if current_word:
1381
  word_bboxes.append((current_word, current_word_bbox))
 
1423
  word_bboxes.append((current_word, current_word_bbox))
1424
 
1425
  if full_text:
 
1426
  if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
1427
  # Convert special characters to a human-readable format
1428
+
1429
  full_text = clean_unicode_text(full_text)
1430
  full_text = full_text.strip()
1431
+
1432
 
1433
  line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1434
 
 
1445
  analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
1446
 
1447
  # Remove brackets and split the string into four separate columns
 
 
 
1448
  # Split the boundingBox list into four separate columns
1449
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1450
 
 
1456
  analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
1457
  analysed_bounding_boxes_df_new['page'] = page_num + 1
1458
  decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
 
 
1459
 
1460
  return decision_process_table
1461
 
 
1549
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1550
 
1551
  # Update custom word list analyser object with any new words that have been added to the custom deny list
 
1552
  if custom_recogniser_word_list:
1553
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1554
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1558
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1559
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1560
 
 
 
 
 
 
 
 
 
 
 
1561
  tic = time.perf_counter()
1562
 
1563
  # Open with Pikepdf to get text lines
 
1572
  else: page_min = page_min - 1
1573
 
1574
  print("Page range is",str(page_min + 1), "to", str(page_max))
 
1575
 
1576
  if current_loop_page == 0: page_loop_start = 0
1577
  else: page_loop_start = current_loop_page
 
1646
  ### REDACTION
1647
 
1648
  if chosen_redact_entities or chosen_redact_comprehend_entities:
 
 
1649
  page_analysed_bounding_boxes = run_page_text_redaction(
1650
  language,
1651
  chosen_redact_entities,
 
1663
  comprehend_query_number
1664
  )
1665
 
1666
+
 
 
1667
  else:
1668
  page_analysed_bounding_boxes = []
1669
 
1670
 
1671
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1672
 
 
1673
 
1674
  # Annotate redactions on page
1675
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1676
 
 
 
1677
  # Make pymupdf page redactions
 
1678
  if redact_whole_page_list:
1679
  int_reported_page_number = int(reported_page_number)
1680
  if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
 
1683
 
1684
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
1685
 
 
 
 
1686
  reported_page_no = page_no + 1
1687
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1688
 
 
1697
 
1698
  if not decision_process_table_on_page.empty:
1699
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1700
+
1701
 
1702
  toc = time.perf_counter()
1703
 
1704
  time_taken = toc - tic
1705
 
 
 
1706
  # Break if time taken is greater than max_time seconds
1707
  if time_taken > max_time:
1708
  print("Processing for", max_time, "seconds, breaking.")
tools/redaction_review.py CHANGED
@@ -396,7 +396,7 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
396
  row_value_page = evt.row_value[0] # This is the page number value
397
  return row_value_page
398
 
399
- def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
400
  '''
401
  Converts coordinates from image space to Adobe PDF space.
402
 
@@ -431,7 +431,7 @@ def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width,
431
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
432
 
433
 
434
- def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
435
  '''
436
  Create an xfdf file from a review csv file and a pdf
437
  '''
 
396
  row_value_page = evt.row_value[0] # This is the page number value
397
  return row_value_page
398
 
399
+ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
400
  '''
401
  Converts coordinates from image space to Adobe PDF space.
402
 
 
431
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
432
 
433
 
434
+ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
435
  '''
436
  Create an xfdf file from a review csv file and a pdf
437
  '''