seanpedrickcase committed on
Commit
504104c
1 Parent(s): d32c12a

Minor changes to app and requirements files

Browse files
Files changed (2) hide show
  1. app.py +2 -14
  2. requirements.txt +6 -0
app.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
8
  import gradio as gr
9
  import time
10
 
11
- file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
12
 
13
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
14
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
@@ -75,24 +75,18 @@ def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:
75
 
76
  return out_message, out_file_paths
77
 
78
-
79
  # Create the gradio interface
80
 
81
  block = gr.Blocks(theme = gr.themes.Base())
82
 
83
  with block:
84
 
85
- data_state = gr.State(pd.DataFrame())
86
- ref_data_state = gr.State(pd.DataFrame())
87
- results_data_state = gr.State(pd.DataFrame())
88
- ref_results_data_state =gr.State(pd.DataFrame())
89
-
90
  gr.Markdown(
91
  """
92
  # Document redaction
93
  Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
94
 
95
- WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
96
 
97
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
98
  """)
@@ -120,16 +114,10 @@ with block:
120
  load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
121
 
122
  aws_log_box = gr.Textbox(label="AWS data load status")
123
-
124
 
125
  ### Loading AWS data ###
126
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
127
 
128
-
129
- # Updates to components
130
- #in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_redact_entities, in_existing, data_state, results_data_state])
131
- #in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])
132
-
133
  redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
134
  outputs=[output_summary, output_file], api_name="redact")
135
 
 
8
  import gradio as gr
9
  import time
10
 
11
+ #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
12
 
13
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
14
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 
75
 
76
  return out_message, out_file_paths
77
 
 
78
  # Create the gradio interface
79
 
80
  block = gr.Blocks(theme = gr.themes.Base())
81
 
82
  with block:
83
 
 
 
 
 
 
84
  gr.Markdown(
85
  """
86
  # Document redaction
87
  Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
88
 
89
+ WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed. Also, the output from the Text analysis ending 'as_text.pdf' is an annotated pdf, which is a layer on top of the text that can be removed. So the text has not truly been redacted. Use the '...as_img.pdf' versions instead for safer redaction.
90
 
91
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
92
  """)
 
114
  load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
115
 
116
  aws_log_box = gr.Textbox(label="AWS data load status")
 
117
 
118
  ### Loading AWS data ###
119
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
120
 
 
 
 
 
 
121
  redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
122
  outputs=[output_summary, output_file], api_name="redact")
123
 
requirements.txt CHANGED
@@ -10,3 +10,9 @@ spacy
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio
12
  boto3
 
 
 
 
 
 
 
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio
12
  boto3
13
+ #unstructured
14
+ #unstructured_inference # This is big! Only necessary if you want to use the high res strategy in pdf_partition
15
+ #unstructured_pytesseract
16
+ #pillow-heif
17
+ #python-docx
18
+ #python-pptx