seanpedrickcase committed on
Commit
93ac94f
1 Parent(s): 8c33828

Updated decision making output files, log locations

Browse files
Files changed (4) hide show
  1. app.py +13 -7
  2. tools/data_anonymise.py +33 -27
  3. tools/file_conversion.py +1 -6
  4. tools/file_redaction.py +142 -117
app.py CHANGED
@@ -12,6 +12,9 @@ from tools.auth import authenticate_user
12
  #from tools.aws_functions import load_data_from_aws
13
  import gradio as gr
14
 
 
 
 
15
  add_folder_to_path("tesseract/")
16
  add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
17
 
@@ -21,6 +24,9 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
21
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
22
  language = 'en'
23
 
 
 
 
24
  # Create the gradio interface
25
  app = gr.Blocks(theme = gr.themes.Base())
26
 
@@ -35,10 +41,10 @@ with app:
35
 
36
  session_hash_state = gr.State()
37
  s3_output_folder_state = gr.State()
38
- feedback_logs_state = gr.State('feedback/log.csv')
39
- feedback_s3_logs_loc_state = gr.State('feedback/')
40
- usage_logs_state = gr.State('logs/log.csv')
41
- usage_s3_logs_loc_state = gr.State('logs/')
42
 
43
  gr.Markdown(
44
  """
@@ -162,18 +168,18 @@ with app:
162
 
163
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
164
  callback = gr.CSVLogger()
165
- callback.setup([session_hash_textbox], "logs")
166
  session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
167
 
168
  # User submitted feedback for pdf redactions
169
  pdf_callback = gr.CSVLogger()
170
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], "feedback")
171
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
172
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
173
 
174
  # User submitted feedback for data redactions
175
  data_callback = gr.CSVLogger()
176
- data_callback.setup([data_feedback_radio, data_further_details_text], "feedback")
177
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
178
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
179
 
 
12
  #from tools.aws_functions import load_data_from_aws
13
  import gradio as gr
14
 
15
+ from datetime import datetime
16
+ today_rev = datetime.now().strftime("%Y%m%d")
17
+
18
  add_folder_to_path("tesseract/")
19
  add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
20
 
 
24
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
25
  language = 'en'
26
 
27
+ feedback_data_folder = 'feedback/' + today_rev + '/'
28
+ logs_data_folder = 'logs/' + today_rev + '/'
29
+
30
  # Create the gradio interface
31
  app = gr.Blocks(theme = gr.themes.Base())
32
 
 
41
 
42
  session_hash_state = gr.State()
43
  s3_output_folder_state = gr.State()
44
+ feedback_logs_state = gr.State(feedback_data_folder + 'log.csv')
45
+ feedback_s3_logs_loc_state = gr.State(feedback_data_folder)
46
+ usage_logs_state = gr.State(logs_data_folder + 'log.csv')
47
+ usage_s3_logs_loc_state = gr.State(logs_data_folder)
48
 
49
  gr.Markdown(
50
  """
 
168
 
169
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
170
  callback = gr.CSVLogger()
171
+ callback.setup([session_hash_textbox], logs_data_folder)
172
  session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
173
 
174
  # User submitted feedback for pdf redactions
175
  pdf_callback = gr.CSVLogger()
176
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], feedback_data_folder)
177
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
178
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
179
 
180
  # User submitted feedback for data redactions
181
  data_callback = gr.CSVLogger()
182
+ data_callback.setup([data_feedback_radio, data_further_details_text], feedback_data_folder)
183
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
184
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
185
 
tools/data_anonymise.py CHANGED
@@ -23,27 +23,7 @@ fake = Faker("en_UK")
23
  def fake_first_name(x):
24
  return fake.first_name()
25
 
26
- # Writing decision making process to file
27
- def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
28
- """
29
- Generate a detailed output of the decision process for entity recognition.
30
-
31
- This function takes the results from the analyzer and the original data dictionary,
32
- and produces a string output detailing the decision process for each recognized entity.
33
- It includes information such as entity type, position, confidence score, and the context
34
- in which the entity was found.
35
-
36
- Args:
37
- analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
38
- df_dict (Dict[str, List[Any]]): The original data in dictionary format.
39
-
40
- Returns:
41
- str: A string containing the detailed decision process output.
42
- """
43
- decision_process_output = []
44
- keys_to_keep = ['entity_type', 'start', 'end']
45
-
46
- def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
47
  output = []
48
 
49
  if hasattr(result, 'value'):
@@ -66,29 +46,53 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
66
  output.append(str(analysis_explanation))
67
 
68
  return output
69
-
70
- #print("Analyser results:", analyzer_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # Run through each column to analyse for PII
73
  for i, result in enumerate(analyzer_results):
74
  print("Looking at result:", str(i))
 
75
 
76
  # If a single result
77
  if isinstance(result, RecognizerResult):
 
78
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
79
 
80
  # If a list of results
81
- elif isinstance(result, List):
82
- for x, recognizer_result in enumerate(result.recognizer_results):
 
83
  decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
84
 
85
  else:
86
  try:
 
87
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
88
  except Exception as e:
89
  print(e)
90
 
91
  decision_process_output_str = '\n'.join(decision_process_output)
 
 
92
 
93
 
94
  return decision_process_output_str
@@ -220,6 +224,8 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
220
  # Usage in the main function:
221
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
222
 
 
 
223
  analyse_toc = time.perf_counter()
224
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
225
  print(analyse_time_out)
@@ -325,12 +331,12 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
325
  # Write each DataFrame to a different worksheet.
326
  anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
327
 
328
- decision_process_log_output_file = anon_xlsx_export_file_name + "decision_process_output.txt"
329
  with open(decision_process_log_output_file, "w") as f:
330
  f.write(decision_process_output_str)
331
 
332
  else:
333
- anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
334
  anon_df_out.to_csv(anon_export_file_name, index = None)
335
 
336
  decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
 
23
  def fake_first_name(x):
24
  return fake.first_name()
25
 
26
+ def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  output = []
28
 
29
  if hasattr(result, 'value'):
 
46
  output.append(str(analysis_explanation))
47
 
48
  return output
49
+
50
+ # Writing decision making process to file
51
+ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult], df_dict: Dict[str, List[Any]]) -> str:
52
+ """
53
+ Generate a detailed output of the decision process for entity recognition.
54
+
55
+ This function takes the results from the analyzer and the original data dictionary,
56
+ and produces a string output detailing the decision process for each recognized entity.
57
+ It includes information such as entity type, position, confidence score, and the context
58
+ in which the entity was found.
59
+
60
+ Args:
61
+ analyzer_results (List[DictAnalyzerResult]): The results from the entity analyzer.
62
+ df_dict (Dict[str, List[Any]]): The original data in dictionary format.
63
+
64
+ Returns:
65
+ str: A string containing the detailed decision process output.
66
+ """
67
+ decision_process_output = []
68
+ keys_to_keep = ['entity_type', 'start', 'end']
69
 
70
  # Run through each column to analyse for PII
71
  for i, result in enumerate(analyzer_results):
72
  print("Looking at result:", str(i))
73
+ print("result:\n\n", result)
74
 
75
  # If a single result
76
  if isinstance(result, RecognizerResult):
77
+ print("Processing recogniser result as RecognizerResult:", str(i))
78
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
79
 
80
  # If a list of results
81
+ elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
82
+ for x, recognizer_result in enumerate(result.recognizer_results):
83
+ print("Processing recogniser result as List:", str(i))
84
  decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
85
 
86
  else:
87
  try:
88
+ print("Processing recogniser result in other:", str(i))
89
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
90
  except Exception as e:
91
  print(e)
92
 
93
  decision_process_output_str = '\n'.join(decision_process_output)
94
+
95
+ print("decision_process_output_str:\n\n", decision_process_output_str)
96
 
97
 
98
  return decision_process_output_str
 
224
  # Usage in the main function:
225
  decision_process_output_str = generate_decision_process_output(analyzer_results, df_dict)
226
 
227
+ #print("decision_process_output_str:\n\n", decision_process_output_str)
228
+
229
  analyse_toc = time.perf_counter()
230
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
231
  print(analyse_time_out)
 
331
  # Write each DataFrame to a different worksheet.
332
  anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
333
 
334
+ decision_process_log_output_file = anon_xlsx_export_file_name + "_" + excel_sheet_name + "_decision_process_output.txt"
335
  with open(decision_process_log_output_file, "w") as f:
336
  f.write(decision_process_output_str)
337
 
338
  else:
339
+ anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
340
  anon_df_out.to_csv(anon_export_file_name, index = None)
341
 
342
  decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
tools/file_conversion.py CHANGED
@@ -91,8 +91,6 @@ def process_file(file_path):
91
 
92
  return img_object
93
 
94
-
95
-
96
  def prepare_image_or_text_pdf(
97
  file_paths: List[str],
98
  in_redact_method: str,
@@ -123,9 +121,7 @@ def prepare_image_or_text_pdf(
123
 
124
  # If out message or out_file_paths are blank, change to a list so it can be appended to
125
  #if isinstance(out_message, str):
126
- # out_message = [out_message]
127
-
128
-
129
 
130
  # If this is the first time around, set variables to 0/blank
131
  if first_loop_state==True:
@@ -189,7 +185,6 @@ def prepare_image_or_text_pdf(
189
 
190
  return out_message, out_file_paths
191
 
192
-
193
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
194
  file_path_without_ext = get_file_path_end(in_file_path)
195
 
 
91
 
92
  return img_object
93
 
 
 
94
  def prepare_image_or_text_pdf(
95
  file_paths: List[str],
96
  in_redact_method: str,
 
121
 
122
  # If out message or out_file_paths are blank, change to a list so it can be appended to
123
  #if isinstance(out_message, str):
124
+ # out_message = [out_message]
 
 
125
 
126
  # If this is the first time around, set variables to 0/blank
127
  if first_loop_state==True:
 
185
 
186
  return out_message, out_file_paths
187
 
 
188
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
189
  file_path_without_ext = get_file_path_end(in_file_path)
190
 
tools/file_redaction.py CHANGED
@@ -247,142 +247,167 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
247
 
248
  return images, decision_process_output_str
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
251
  '''
252
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
253
  '''
254
-
255
- combined_analyzer_results = []
256
- analyser_explanations = []
257
  annotations_all_pages = []
258
- analyzed_bounding_boxes_df = pd.DataFrame()
259
-
260
- # Horizontal distance between PII bounding boxes under/equal they are combined into one
261
- combine_pixel_dist = 100
262
 
263
  pdf = Pdf.open(filename)
264
-
265
  page_num = 0
266
 
267
- #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
268
  for page in pdf.pages:
269
  print("Page number is:", page_num + 1)
270
 
271
  annotations_on_page = []
272
- analyzed_bounding_boxes = []
273
 
274
  for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
275
- analyzer_results = []
 
 
 
 
 
276
 
277
  for text_container in page_layout:
278
- if isinstance(text_container, LTTextContainer):
279
- text_to_analyze = text_container.get_text()
280
-
281
- analyzer_results = []
282
- characters = []
283
-
284
- analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
285
- language=language,
286
- entities=chosen_redact_entities,
287
- score_threshold=score_threshold,
288
- return_decision_process=True,
289
- allow_list=allow_list)
290
-
291
-
292
-
293
-
294
- characters = [char # This is what we want to include in the list
295
- for line in text_container # Loop through each line in text_container
296
- if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
297
- for char in line] # Loop through each character in the line
298
- #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
299
-
300
-
301
- # if len(analyzer_results) > 0 and len(characters) > 0:
302
- # analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
303
- # combined_analyzer_results.extend(analyzer_results)
304
-
305
- # Inside the loop where you process analyzer_results:
306
- if len(analyzer_results) > 0 and len(characters) > 0:
307
- merged_bounding_boxes = []
308
- current_box = None
309
- current_y = None
310
-
311
- for result in analyzer_results:
312
- for char in characters[result.start : result.end]:
313
- if isinstance(char, LTChar):
314
- char_box = list(char.bbox)
315
-
316
- # Fix: Check if either current_y or current_box are None
317
- if current_y is None or current_box is None:
318
- # This is the first character, so initialize current_box and current_y
319
- current_box = char_box
320
- current_y = char_box[1]
321
- else: # Now we have previous values to compare
322
- #print("Comparing values")
323
- vertical_diff_bboxes = abs(char_box[1] - current_y)
324
- horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
325
- #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
326
-
327
- if (
328
- vertical_diff_bboxes <= 5
329
- and horizontal_diff_bboxes <= combine_pixel_dist
330
- ):
331
- old_right_pos = current_box[2]
332
- current_box[2] = char_box[2]
333
- else:
334
- merged_bounding_boxes.append(
335
- {"boundingBox": current_box, "result": result})
336
-
337
- current_box = char_box
338
- current_y = char_box[1]
339
- # Add the last box
340
- if current_box:
341
- merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
342
-
343
- if not merged_bounding_boxes:
344
- analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
345
- else:
346
- analyzed_bounding_boxes.extend(merged_bounding_boxes)
347
-
348
- combined_analyzer_results.extend(analyzer_results)
349
-
350
- if len(analyzer_results) > 0:
351
- #decision_process_output_str = generate_decision_process_output(analyzer_results, {'text':text_to_analyze})
352
- #print("Decision process:", decision_process_output_str)
353
- # Create summary df of annotations to be made
354
- analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
355
- analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
356
- analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
357
- analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
358
- analyzed_bounding_boxes_df_new['page'] = page_num + 1
359
- analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
360
-
361
- print('analyzed_bounding_boxes_df:', analyzed_bounding_boxes_df)
362
-
363
- for analyzed_bounding_box in analyzed_bounding_boxes:
364
- bounding_box = analyzed_bounding_box["boundingBox"]
365
- annotation = Dictionary(
366
- Type=Name.Annot,
367
- Subtype=Name.Square, #Name.Highlight,
368
- QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
369
- Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
370
- C=[0, 0, 0],
371
- IC=[0, 0, 0],
372
- CA=1, # Transparency
373
- T=analyzed_bounding_box["result"].entity_type,
374
- BS=Dictionary(
375
- W=0, # Border width: 1 point
376
- S=Name.S # Border style: solid
377
- )
378
- )
379
- annotations_on_page.append(annotation)
380
 
381
- annotations_all_pages.extend([annotations_on_page])
382
-
383
- print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
 
 
 
 
 
 
 
 
 
 
384
  page.Annots = pdf.make_indirect(annotations_on_page)
385
 
 
 
 
 
 
386
  page_num += 1
387
 
388
- return pdf, analyzed_bounding_boxes_df
 
247
 
248
  return images, decision_process_output_str
249
 
250
+ def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
251
+ if isinstance(text_container, LTTextContainer):
252
+ text_to_analyze = text_container.get_text()
253
+
254
+ analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
255
+ language=language,
256
+ entities=chosen_redact_entities,
257
+ score_threshold=score_threshold,
258
+ return_decision_process=True,
259
+ allow_list=allow_list)
260
+ characters = [char
261
+ for line in text_container
262
+ if isinstance(line, LTTextLine)
263
+ for char in line]
264
+
265
+ return analyzer_results, characters
266
+ return [], []
267
+
268
+ # Inside the loop where you process analyzer_results, merge bounding boxes that are right next to each other:
269
+ def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist):
270
+ analyzed_bounding_boxes = []
271
+ if len(analyzer_results) > 0 and len(characters) > 0:
272
+ merged_bounding_boxes = []
273
+ current_box = None
274
+ current_y = None
275
+
276
+ for i, result in enumerate(analyzer_results):
277
+ print("Considering result", str(i))
278
+ for char in characters[result.start : result.end]:
279
+ if isinstance(char, LTChar):
280
+ char_box = list(char.bbox)
281
+
282
+ if current_y is None or current_box is None:
283
+ current_box = char_box
284
+ current_y = char_box[1]
285
+ else:
286
+ vertical_diff_bboxes = abs(char_box[1] - current_y)
287
+ horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
288
+
289
+ if (
290
+ vertical_diff_bboxes <= 5
291
+ and horizontal_diff_bboxes <= combine_pixel_dist
292
+ ):
293
+ current_box[2] = char_box[2] # Extend the current box horizontally
294
+ else:
295
+ merged_bounding_boxes.append(
296
+ {"boundingBox": current_box, "result": result})
297
+
298
+ # Reset current_box and current_y after appending
299
+ current_box = char_box
300
+ current_y = char_box[1]
301
+
302
+ # After finishing with the current result, add the last box for this result
303
+ if current_box:
304
+ merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
305
+ current_box = None
306
+ current_y = None # Reset for the next result
307
+
308
+ if not merged_bounding_boxes:
309
+ analyzed_bounding_boxes.extend(
310
+ {"boundingBox": char.bbox, "result": result}
311
+ for result in analyzer_results
312
+ for char in characters[result.start:result.end]
313
+ if isinstance(char, LTChar)
314
+ )
315
+ else:
316
+ analyzed_bounding_boxes.extend(merged_bounding_boxes)
317
+
318
+ print("analysed_bounding_boxes:\n\n", analyzed_bounding_boxes)
319
+
320
+ return analyzed_bounding_boxes
321
+
322
+ def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
323
+ decision_process_table = pd.DataFrame()
324
+
325
+ if len(analyzer_results) > 0:
326
+ # Create summary df of annotations to be made
327
+ analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
328
+ analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
329
+ analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
330
+ analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
331
+ analyzed_bounding_boxes_df_new['page'] = page_num + 1
332
+ decision_process_table = pd.concat([decision_process_table, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
333
+
334
+ print('\n\ndecision_process_table:\n\n', decision_process_table)
335
+
336
+ return decision_process_table
337
+
338
+ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
339
+ annotations_on_page = []
340
+ for analyzed_bounding_box in analyzed_bounding_boxes:
341
+ bounding_box = analyzed_bounding_box["boundingBox"]
342
+ annotation = Dictionary(
343
+ Type=Name.Annot,
344
+ Subtype=Name.Square, #Name.Highlight,
345
+ QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3],
346
+ bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
347
+ Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
348
+ C=[0, 0, 0],
349
+ IC=[0, 0, 0],
350
+ CA=1, # Transparency
351
+ T=analyzed_bounding_box["result"].entity_type,
352
+ BS=Dictionary(
353
+ W=0, # Border width: 1 point
354
+ S=Name.S # Border style: solid
355
+ )
356
+ )
357
+ annotations_on_page.append(annotation)
358
+ return annotations_on_page
359
+
360
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
361
  '''
362
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
363
  '''
 
 
 
364
  annotations_all_pages = []
365
+ decision_process_table_all_pages = []
366
+
367
+ combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one
 
368
 
369
  pdf = Pdf.open(filename)
 
370
  page_num = 0
371
 
 
372
  for page in pdf.pages:
373
  print("Page number is:", page_num + 1)
374
 
375
  annotations_on_page = []
376
+ decision_process_table_on_page = []
377
 
378
  for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
379
+
380
+ page_analyzer_results = []
381
+ page_analyzed_bounding_boxes = []
382
+ text_container_analyzer_results = []
383
+ text_container_analyzed_bounding_boxes = []
384
+ characters = []
385
 
386
  for text_container in page_layout:
387
+ text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)
388
+ # Merge bounding boxes if very close together
389
+ text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
+ print("\n\nanalyzed_bounding_boxes_in_loop:", text_container_analyzed_bounding_boxes)
392
+
393
+ page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
394
+ page_analyzer_results.extend(text_container_analyzer_results)
395
+
396
+ print("analyzed_bounding_boxes_out_loop:\n\n", page_analyzed_bounding_boxes)
397
+
398
+ decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
399
+
400
+ annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
401
+ #print('\n\nannotations_on_page:', annotations_on_page)
402
+
403
+ # Make page annotations
404
  page.Annots = pdf.make_indirect(annotations_on_page)
405
 
406
+ annotations_all_pages.extend([annotations_on_page])
407
+ decision_process_table_all_pages.extend([decision_process_table_on_page])
408
+
409
+ print("For page number:", page_num, "there are", len(annotations_all_pages[page_num]), "annotations")
410
+
411
  page_num += 1
412
 
413
+ return pdf, decision_process_table_all_pages