seanpedrickcase
committed on
Commit
•
2807627
1
Parent(s):
19846ba
Fixed some more input bugs
Browse files
- app.py +5 -4
- tools/file_conversion.py +3 -3
- tools/file_redaction.py +20 -32
app.py
CHANGED
@@ -20,6 +20,7 @@ with block:
|
|
20 |
|
21 |
prepared_pdf_state = gr.State([])
|
22 |
output_image_files_state = gr.State([])
|
|
|
23 |
|
24 |
gr.Markdown(
|
25 |
"""
|
@@ -61,13 +62,13 @@ with block:
|
|
61 |
### Loading AWS data ###
|
62 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
63 |
|
64 |
-
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file,
|
65 |
outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
66 |
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
|
67 |
-
outputs=[output_summary, output_file], api_name="redact")
|
68 |
|
69 |
-
convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file,
|
70 |
-
outputs=[output_summary, output_file]
|
71 |
|
72 |
# Simple run for HF spaces or local on your computer
|
73 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
|
|
20 |
|
21 |
prepared_pdf_state = gr.State([])
|
22 |
output_image_files_state = gr.State([])
|
23 |
+
output_file_list_state = gr.State([])
|
24 |
|
25 |
gr.Markdown(
|
26 |
"""
|
|
|
62 |
### Loading AWS data ###
|
63 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
64 |
|
65 |
+
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
|
66 |
outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
67 |
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
|
68 |
+
outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
|
69 |
|
70 |
+
convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
|
71 |
+
outputs=[output_summary, output_file])
|
72 |
|
73 |
# Simple run for HF spaces or local on your computer
|
74 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
tools/file_conversion.py
CHANGED
@@ -86,7 +86,7 @@ def process_file(file_path):
|
|
86 |
|
87 |
return out_path
|
88 |
|
89 |
-
def prepare_image_or_text_pdf(file_path:str,
|
90 |
|
91 |
out_message = ''
|
92 |
out_file_paths = []
|
@@ -119,11 +119,11 @@ def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str,
|
|
119 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
120 |
file_path_without_ext = get_file_path_end(in_file_path)
|
121 |
|
122 |
-
out_file_paths =
|
123 |
|
124 |
# Convert annotated text pdf back to image to give genuine redactions
|
125 |
print("Creating image version of results")
|
126 |
-
pdf_text_image_paths = process_file(out_text_file_path)
|
127 |
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
128 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
129 |
|
|
|
86 |
|
87 |
return out_path
|
88 |
|
89 |
+
def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
|
90 |
|
91 |
out_message = ''
|
92 |
out_file_paths = []
|
|
|
119 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
120 |
file_path_without_ext = get_file_path_end(in_file_path)
|
121 |
|
122 |
+
out_file_paths = out_text_file_path
|
123 |
|
124 |
# Convert annotated text pdf back to image to give genuine redactions
|
125 |
print("Creating image version of results")
|
126 |
+
pdf_text_image_paths = process_file(out_text_file_path[0])
|
127 |
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
128 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
129 |
|
tools/file_redaction.py
CHANGED
@@ -21,7 +21,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
21 |
out_message = ''
|
22 |
out_file_paths = []
|
23 |
|
24 |
-
|
|
|
25 |
|
26 |
if file_path:
|
27 |
file_path_without_ext = get_file_path_end(file_path)
|
@@ -35,7 +36,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
35 |
# if is_pdf_or_image(file_path) == False:
|
36 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
37 |
|
38 |
-
pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
39 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
40 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
41 |
|
@@ -53,9 +54,8 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
53 |
|
54 |
out_file_paths.append(out_text_file_path)
|
55 |
|
56 |
-
|
57 |
|
58 |
-
|
59 |
else:
|
60 |
out_message = "No redaction method selected"
|
61 |
print(out_message)
|
@@ -67,19 +67,21 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
|
|
67 |
|
68 |
out_message = out_message + "\n\n" + out_time
|
69 |
|
70 |
-
return out_message, out_file_paths
|
71 |
|
72 |
|
73 |
-
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
74 |
'''
|
75 |
take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
|
76 |
'''
|
77 |
|
78 |
-
|
79 |
-
print(out_message)
|
80 |
-
progress(0, desc=out_message)
|
81 |
|
82 |
-
|
|
|
|
|
|
|
|
|
83 |
|
84 |
# Create a new PDF
|
85 |
#pdf = pikepdf.new()
|
@@ -136,7 +138,10 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
136 |
|
137 |
pdf = Pdf.open(filename)
|
138 |
|
139 |
-
|
|
|
|
|
|
|
140 |
|
141 |
print("Page number is: ", page_num)
|
142 |
|
@@ -169,25 +174,6 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
169 |
if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
|
170 |
for char in line] # Loop through each character in the line
|
171 |
#if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
|
172 |
-
|
173 |
-
|
174 |
-
#print(characters)
|
175 |
-
|
176 |
-
# Collect unique types
|
177 |
-
# unique_types = set()
|
178 |
-
|
179 |
-
# for line in text_container:
|
180 |
-
# if isinstance(line, LTTextLine):
|
181 |
-
# print("Line: ", line)
|
182 |
-
# for char in line:
|
183 |
-
# unique_types.add(type(char))
|
184 |
-
# if isinstance(char, LTAnno):
|
185 |
-
# print(char)
|
186 |
-
|
187 |
-
# # Print the unique types
|
188 |
-
# print("Unique types in text_container:")
|
189 |
-
# for t in unique_types:
|
190 |
-
# print(t)
|
191 |
|
192 |
# If any results found
|
193 |
print(analyzer_results)
|
@@ -216,13 +202,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
216 |
CA=1, # Transparency
|
217 |
T=analyzed_bounding_box["result"].entity_type
|
218 |
)
|
219 |
-
annotations_on_page.append(annotation)
|
220 |
|
221 |
annotations_all_pages.extend([annotations_on_page])
|
222 |
-
|
223 |
print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
|
224 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
225 |
|
|
|
|
|
226 |
# Extracting data from dictionaries
|
227 |
# extracted_data = []
|
228 |
# for item in annotations_all_pages:
|
|
|
21 |
out_message = ''
|
22 |
out_file_paths = []
|
23 |
|
24 |
+
if in_allow_list:
|
25 |
+
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
26 |
|
27 |
if file_path:
|
28 |
file_path_without_ext = get_file_path_end(file_path)
|
|
|
36 |
# if is_pdf_or_image(file_path) == False:
|
37 |
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
38 |
|
39 |
+
pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
|
40 |
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
41 |
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
42 |
|
|
|
54 |
|
55 |
out_file_paths.append(out_text_file_path)
|
56 |
|
57 |
+
out_message = "Text-based PDF successfully redacted and saved to file."
|
58 |
|
|
|
59 |
else:
|
60 |
out_message = "No redaction method selected"
|
61 |
print(out_message)
|
|
|
67 |
|
68 |
out_message = out_message + "\n\n" + out_time
|
69 |
|
70 |
+
return out_message, out_file_paths, out_file_paths
|
71 |
|
72 |
|
73 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
74 |
'''
|
75 |
take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
|
76 |
'''
|
77 |
|
78 |
+
if not image_paths:
|
|
|
|
|
79 |
|
80 |
+
out_message = "PDF does not exist as images. Converting pages to image"
|
81 |
+
print(out_message)
|
82 |
+
progress(0, desc=out_message)
|
83 |
+
|
84 |
+
image_paths = process_file(file_path)
|
85 |
|
86 |
# Create a new PDF
|
87 |
#pdf = pikepdf.new()
|
|
|
138 |
|
139 |
pdf = Pdf.open(filename)
|
140 |
|
141 |
+
page_num = 0
|
142 |
+
|
143 |
+
for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
|
144 |
+
|
145 |
|
146 |
print("Page number is: ", page_num)
|
147 |
|
|
|
174 |
if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
|
175 |
for char in line] # Loop through each character in the line
|
176 |
#if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
# If any results found
|
179 |
print(analyzer_results)
|
|
|
202 |
CA=1, # Transparency
|
203 |
T=analyzed_bounding_box["result"].entity_type
|
204 |
)
|
205 |
+
annotations_on_page.append(annotation)
|
206 |
|
207 |
annotations_all_pages.extend([annotations_on_page])
|
208 |
+
|
209 |
print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
|
210 |
page.Annots = pdf.make_indirect(annotations_on_page)
|
211 |
|
212 |
+
page_num += 1
|
213 |
+
|
214 |
# Extracting data from dictionaries
|
215 |
# extracted_data = []
|
216 |
# for item in annotations_all_pages:
|