seanpedrickcase
committed on
Commit
•
04d80a1
1
Parent(s):
542c252
Improved time taken reporting and readme
Browse files- README.md +2 -6
- app.py +4 -8
- tools/file_conversion.py +0 -2
- tools/file_redaction.py +2 -11
README.md
CHANGED
@@ -12,15 +12,11 @@ license: agpl-3.0
|
|
12 |
# Document redaction
|
13 |
|
14 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally, you can choose the method for PII identification. 'Local' gives quick, lower quality results; AWS Comprehend gives better results but has a cost.
|
15 |
-
|
16 |
-
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
17 |
-
|
18 |
-
You can also review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app.
|
19 |
|
20 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
21 |
|
22 |
-
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
|
23 |
-
|
24 |
# USER GUIDE
|
25 |
|
26 |
Please refer to these example files to follow this guide:
|
|
|
12 |
# Document redaction
|
13 |
|
14 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally, you can choose the method for PII identification. 'Local' gives quick, lower quality results; AWS Comprehend gives better results but has a cost.
|
15 |
+
|
16 |
+
Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
|
|
|
|
|
17 |
|
18 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
19 |
|
|
|
|
|
20 |
# USER GUIDE
|
21 |
|
22 |
Please refer to these example files to follow this guide:
|
app.py
CHANGED
@@ -126,20 +126,16 @@ with app:
|
|
126 |
|
127 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
128 |
|
129 |
-
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction.
|
130 |
|
131 |
-
|
132 |
-
|
133 |
-
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
134 |
-
|
135 |
-
This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.""")
|
136 |
|
137 |
# PDF / IMAGES TAB
|
138 |
with gr.Tab("PDFs/images"):
|
139 |
with gr.Accordion("Redact document", open = True):
|
140 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
141 |
-
in_redaction_method = gr.Radio(label="Choose
|
142 |
-
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
143 |
|
144 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
145 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
|
|
126 |
|
127 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
128 |
|
129 |
+
Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
|
130 |
|
131 |
+
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
|
|
|
|
|
|
|
|
|
132 |
|
133 |
# PDF / IMAGES TAB
|
134 |
with gr.Tab("PDFs/images"):
|
135 |
with gr.Accordion("Redact document", open = True):
|
136 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
|
137 |
+
in_redaction_method = gr.Radio(label="Choose text extract method. AWS Textract has a cost per page.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
138 |
+
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost per 100 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
139 |
|
140 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
|
141 |
document_redact_btn = gr.Button("Redact document(s)", variant="primary")
|
tools/file_conversion.py
CHANGED
@@ -113,8 +113,6 @@ def process_file(file_path):
|
|
113 |
# Run your function for processing PDF files here
|
114 |
img_object = convert_pdf_to_images(file_path)
|
115 |
|
116 |
-
print("img_object has length", len(img_object), "and contains", img_object)
|
117 |
-
|
118 |
else:
|
119 |
print(f"{file_path} is not an image or PDF file.")
|
120 |
img_object = ['']
|
|
|
113 |
# Run your function for processing PDF files here
|
114 |
img_object = convert_pdf_to_images(file_path)
|
115 |
|
|
|
|
|
116 |
else:
|
117 |
print(f"{file_path} is not an image or PDF file.")
|
118 |
img_object = ['']
|
tools/file_redaction.py
CHANGED
@@ -309,7 +309,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
309 |
latest_file_completed += 1
|
310 |
current_loop_page = 999
|
311 |
|
312 |
-
if latest_file_completed != len(
|
313 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
314 |
|
315 |
# Save file
|
@@ -384,15 +384,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
384 |
# if isinstance(out_message, list):
|
385 |
# out_message.append(out_message_new) # Ensure out_message is a list of strings
|
386 |
|
387 |
-
if latest_file_completed != len(file_paths):
|
388 |
-
print("Completed file number:", str(latest_file_completed), " there are more files to do")
|
389 |
-
|
390 |
-
|
391 |
-
# Make a combined message for the file
|
392 |
-
if isinstance(out_message, list):
|
393 |
-
combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
|
394 |
-
else: combined_out_message = out_message
|
395 |
-
|
396 |
|
397 |
# If textract requests made, write to logging file
|
398 |
if all_request_metadata:
|
@@ -409,7 +400,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
409 |
|
410 |
if combined_out_message: out_message = combined_out_message
|
411 |
|
412 |
-
print("\nout_message at choose_and_run_redactor end is:", out_message)
|
413 |
|
414 |
# Ensure no duplicated output files
|
415 |
log_files_output_paths = list(set(log_files_output_paths))
|
|
|
309 |
latest_file_completed += 1
|
310 |
current_loop_page = 999
|
311 |
|
312 |
+
if latest_file_completed != len(file_paths_list):
|
313 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
314 |
|
315 |
# Save file
|
|
|
384 |
# if isinstance(out_message, list):
|
385 |
# out_message.append(out_message_new) # Ensure out_message is a list of strings
|
386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
|
388 |
# If textract requests made, write to logging file
|
389 |
if all_request_metadata:
|
|
|
400 |
|
401 |
if combined_out_message: out_message = combined_out_message
|
402 |
|
403 |
+
#print("\nout_message at choose_and_run_redactor end is:", out_message)
|
404 |
|
405 |
# Ensure no duplicated output files
|
406 |
log_files_output_paths = list(set(log_files_output_paths))
|