seanpedrickcase committed on
Commit
2807627
1 Parent(s): 19846ba

Fixed some more input bugs

Browse files
Files changed (3) hide show
  1. app.py +5 -4
  2. tools/file_conversion.py +3 -3
  3. tools/file_redaction.py +20 -32
app.py CHANGED
@@ -20,6 +20,7 @@ with block:
20
 
21
  prepared_pdf_state = gr.State([])
22
  output_image_files_state = gr.State([])
 
23
 
24
  gr.Markdown(
25
  """
@@ -61,13 +62,13 @@ with block:
61
  ### Loading AWS data ###
62
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
63
 
64
- redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
65
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
66
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
67
- outputs=[output_summary, output_file], api_name="redact")
68
 
69
- convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file],
70
- outputs=[output_summary, output_file], api_name="redact")
71
 
72
  # Simple run for HF spaces or local on your computer
73
  #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
 
20
 
21
  prepared_pdf_state = gr.State([])
22
  output_image_files_state = gr.State([])
23
+ output_file_list_state = gr.State([])
24
 
25
  gr.Markdown(
26
  """
 
62
  ### Loading AWS data ###
63
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
64
 
65
+ redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
66
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
67
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
68
+ outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
69
 
70
+ convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
71
+ outputs=[output_summary, output_file])
72
 
73
  # Simple run for HF spaces or local on your computer
74
  #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
tools/file_conversion.py CHANGED
@@ -86,7 +86,7 @@ def process_file(file_path):
86
 
87
  return out_path
88
 
89
- def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
90
 
91
  out_message = ''
92
  out_file_paths = []
@@ -119,11 +119,11 @@ def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str,
119
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
120
  file_path_without_ext = get_file_path_end(in_file_path)
121
 
122
- out_file_paths = []
123
 
124
  # Convert annotated text pdf back to image to give genuine redactions
125
  print("Creating image version of results")
126
- pdf_text_image_paths = process_file(out_text_file_path)
127
  out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
 
86
 
87
  return out_path
88
 
89
+ def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
90
 
91
  out_message = ''
92
  out_file_paths = []
 
119
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
120
  file_path_without_ext = get_file_path_end(in_file_path)
121
 
122
+ out_file_paths = out_text_file_path
123
 
124
  # Convert annotated text pdf back to image to give genuine redactions
125
  print("Creating image version of results")
126
+ pdf_text_image_paths = process_file(out_text_file_path[0])
127
  out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
tools/file_redaction.py CHANGED
@@ -21,7 +21,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
21
  out_message = ''
22
  out_file_paths = []
23
 
24
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
25
 
26
  if file_path:
27
  file_path_without_ext = get_file_path_end(file_path)
@@ -35,7 +36,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
35
  # if is_pdf_or_image(file_path) == False:
36
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
37
 
38
- pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
39
  out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
40
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
41
 
@@ -53,9 +54,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
53
 
54
  out_file_paths.append(out_text_file_path)
55
 
56
-
57
 
58
-
59
  else:
60
  out_message = "No redaction method selected"
61
  print(out_message)
@@ -67,19 +67,21 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
67
 
68
  out_message = out_message + "\n\n" + out_time
69
 
70
- return out_message, out_file_paths
71
 
72
 
73
- def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
  '''
75
  take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
76
  '''
77
 
78
- out_message = "Converting pages to image"
79
- print(out_message)
80
- progress(0, desc=out_message)
81
 
82
- image_paths = process_file(file_path)
 
 
 
 
83
 
84
  # Create a new PDF
85
  #pdf = pikepdf.new()
@@ -136,7 +138,10 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
136
 
137
  pdf = Pdf.open(filename)
138
 
139
- for page_num, page in progress.tqdm(enumerate(pdf.pages), total=len(pdf.pages), unit="pages", desc="Redacting pages"):
 
 
 
140
 
141
  print("Page number is: ", page_num)
142
 
@@ -169,25 +174,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
169
  if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
170
  for char in line] # Loop through each character in the line
171
  #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
172
-
173
-
174
- #print(characters)
175
-
176
- # Collect unique types
177
- # unique_types = set()
178
-
179
- # for line in text_container:
180
- # if isinstance(line, LTTextLine):
181
- # print("Line: ", line)
182
- # for char in line:
183
- # unique_types.add(type(char))
184
- # if isinstance(char, LTAnno):
185
- # print(char)
186
-
187
- # # Print the unique types
188
- # print("Unique types in text_container:")
189
- # for t in unique_types:
190
- # print(t)
191
 
192
  # If any results found
193
  print(analyzer_results)
@@ -216,13 +202,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
216
  CA=1, # Transparency
217
  T=analyzed_bounding_box["result"].entity_type
218
  )
219
- annotations_on_page.append(annotation)
220
 
221
  annotations_all_pages.extend([annotations_on_page])
222
-
223
  print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
224
  page.Annots = pdf.make_indirect(annotations_on_page)
225
 
 
 
226
  # Extracting data from dictionaries
227
  # extracted_data = []
228
  # for item in annotations_all_pages:
 
21
  out_message = ''
22
  out_file_paths = []
23
 
24
+ if in_allow_list:
25
+ in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
26
 
27
  if file_path:
28
  file_path_without_ext = get_file_path_end(file_path)
 
36
  # if is_pdf_or_image(file_path) == False:
37
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
38
 
39
+ pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
40
  out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
41
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
42
 
 
54
 
55
  out_file_paths.append(out_text_file_path)
56
 
57
+ out_message = "Text-based PDF successfully redacted and saved to file."
58
 
 
59
  else:
60
  out_message = "No redaction method selected"
61
  print(out_message)
 
67
 
68
  out_message = out_message + "\n\n" + out_time
69
 
70
+ return out_message, out_file_paths, out_file_paths
71
 
72
 
73
+ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
  '''
75
  take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
76
  '''
77
 
78
+ if not image_paths:
 
 
79
 
80
+ out_message = "PDF does not exist as images. Converting pages to image"
81
+ print(out_message)
82
+ progress(0, desc=out_message)
83
+
84
+ image_paths = process_file(file_path)
85
 
86
  # Create a new PDF
87
  #pdf = pikepdf.new()
 
138
 
139
  pdf = Pdf.open(filename)
140
 
141
+ page_num = 0
142
+
143
+ for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
144
+
145
 
146
  print("Page number is: ", page_num)
147
 
 
174
  if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
175
  for char in line] # Loop through each character in the line
176
  #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  # If any results found
179
  print(analyzer_results)
 
202
  CA=1, # Transparency
203
  T=analyzed_bounding_box["result"].entity_type
204
  )
205
+ annotations_on_page.append(annotation)
206
 
207
  annotations_all_pages.extend([annotations_on_page])
208
+
209
  print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
210
  page.Annots = pdf.make_indirect(annotations_on_page)
211
 
212
+ page_num += 1
213
+
214
  # Extracting data from dictionaries
215
  # extracted_data = []
216
  # for item in annotations_all_pages: