seanpedrickcase
committed on
Commit
•
bc22fc4
1
Parent(s):
7810536
Added possibility to do authentication with AWS Cognito on load. Other minor changes.
Browse files- .dockerignore +2 -1
- .gitignore +2 -1
- DocRedactApp_0.2.spec +52 -0
- README.md +1 -3
- app.py +18 -6
- how_to_create_exe_dist.txt +2 -2
- tools/auth.py +48 -0
- tools/helper_functions.py +59 -0
.dockerignore
CHANGED
@@ -13,4 +13,5 @@ tesseract/*
|
|
13 |
poppler/*
|
14 |
build/*
|
15 |
dist/*
|
16 |
-
build_deps/*
|
|
|
|
13 |
poppler/*
|
14 |
build/*
|
15 |
dist/*
|
16 |
+
build_deps/*
|
17 |
+
doc_redaction_amplify_app/*
|
.gitignore
CHANGED
@@ -13,4 +13,5 @@ tesseract/*
|
|
13 |
poppler/*
|
14 |
build/*
|
15 |
dist/*
|
16 |
-
build_deps/*
|
|
|
|
13 |
poppler/*
|
14 |
build/*
|
15 |
dist/*
|
16 |
+
build_deps/*
|
17 |
+
doc_redaction_amplify_app/*
|
DocRedactApp_0.2.spec
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller build specification for the document redaction app.
from PyInstaller.utils.hooks import collect_data_files

# Used for both the executable and the collected output folder.
APP_NAME = 'DocRedactApp_0.2'

# (source, destination) pairs copied into the bundle as-is, followed by the
# data files shipped inside the gradio_client and gradio packages.
datas = [
    ('tesseract/', 'tesseract/'),
    ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/'),
    *collect_data_files('gradio_client'),
    *collect_data_files('gradio'),
]

a = Analysis(
    ['app.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    # Modules named explicitly because they are requested as hidden imports
    # in the matching pyi-makespec command.
    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
    hookspath=['build_deps'],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
    module_collection_mode={
        'gradio': 'py',  # Collect gradio package as source .py files
    },
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name=APP_NAME,
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name=APP_NAME,
)
|
README.md
CHANGED
@@ -10,9 +10,7 @@ license: mit
|
|
10 |
---
|
11 |
|
12 |
# Introduction
|
13 |
-
Redact
|
14 |
-
|
15 |
-
Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to images and then identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
|
16 |
|
17 |
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
18 |
|
|
|
10 |
---
|
11 |
|
12 |
# Introduction
|
13 |
+
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
|
|
|
|
|
14 |
|
15 |
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
16 |
|
app.py
CHANGED
@@ -3,15 +3,16 @@ import os
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
6 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df
|
7 |
from tools.file_redaction import choose_and_run_redactor
|
8 |
from tools.file_conversion import prepare_image_or_text_pdf
|
9 |
from tools.data_anonymise import do_anonymise
|
|
|
10 |
#from tools.aws_functions import load_data_from_aws
|
11 |
import gradio as gr
|
12 |
|
13 |
-
add_folder_to_path("
|
14 |
-
add_folder_to_path("
|
15 |
|
16 |
ensure_output_folder_exists()
|
17 |
|
@@ -20,14 +21,17 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
|
|
20 |
language = 'en'
|
21 |
|
22 |
# Create the gradio interface
|
23 |
-
|
24 |
|
25 |
-
with
|
26 |
|
27 |
prepared_pdf_state = gr.State([])
|
28 |
output_image_files_state = gr.State([])
|
29 |
output_file_list_state = gr.State([])
|
30 |
|
|
|
|
|
|
|
31 |
gr.Markdown(
|
32 |
"""
|
33 |
# Document redaction
|
@@ -116,7 +120,15 @@ with block:
|
|
116 |
in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
|
117 |
match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
|
118 |
|
|
|
119 |
|
120 |
# Launch the Gradio app
|
|
|
|
|
|
|
121 |
if __name__ == "__main__":
|
122 |
-
|
|
|
|
|
|
|
|
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
6 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
|
7 |
from tools.file_redaction import choose_and_run_redactor
|
8 |
from tools.file_conversion import prepare_image_or_text_pdf
|
9 |
from tools.data_anonymise import do_anonymise
|
10 |
+
from tools.auth import authenticate_user
|
11 |
#from tools.aws_functions import load_data_from_aws
|
12 |
import gradio as gr
|
13 |
|
14 |
+
add_folder_to_path("tesseract/")
|
15 |
+
add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
|
16 |
|
17 |
ensure_output_folder_exists()
|
18 |
|
|
|
21 |
language = 'en'
|
22 |
|
23 |
# Create the gradio interface
|
24 |
+
app = gr.Blocks(theme = gr.themes.Base())
|
25 |
|
26 |
+
with app:
|
27 |
|
28 |
prepared_pdf_state = gr.State([])
|
29 |
output_image_files_state = gr.State([])
|
30 |
output_file_list_state = gr.State([])
|
31 |
|
32 |
+
session_hash_state = gr.State()
|
33 |
+
s3_output_folder_state = gr.State()
|
34 |
+
|
35 |
gr.Markdown(
|
36 |
"""
|
37 |
# Document redaction
|
|
|
120 |
in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
|
121 |
match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
|
122 |
|
123 |
+
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
124 |
|
125 |
# Launch the Gradio app
|
126 |
+
# COGNITO_AUTH selects whether the app is launched behind a login prompt:
# "1" enables it, any other value (default "0") launches without authentication.
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

if __name__ == "__main__":

    # Branch on the value resolved above instead of indexing os.environ again:
    # the decision then always matches what was just logged, and cannot raise
    # KeyError if the variable is not present in the process environment.
    if COGNITO_AUTH == "1":
        # authenticate_user is passed as the login callback for the app.
        app.queue().launch(show_error=True, auth=authenticate_user)
    else:
        app.queue().launch(show_error=True, inbrowser=True)
|
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
14 |
|
15 |
9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,7 +28,7 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.
|
32 |
|
33 |
|
34 |
10. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\DocRedactApp_0.2').
|
|
|
14 |
|
15 |
9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.2 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.spec
|
32 |
|
33 |
|
34 |
10. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\DocRedactApp_0.2').
|
tools/auth.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import boto3
|
3 |
+
from tools.helper_functions import get_or_create_env_var
|
4 |
+
|
5 |
+
client_id = get_or_create_env_var('AWS_CLIENT_ID', 'l762du1rg94e1r2q0ii7ls0ef') # This client id is borrowed from async gradio app client
|
6 |
+
print(f'The value of AWS_CLIENT_ID is {client_id}')
|
7 |
+
|
8 |
+
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'eu-west-2_8fCzl8qej')
|
9 |
+
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
10 |
+
|
11 |
+
def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
    """Authenticates a user against an AWS Cognito user pool.

    Args:
        username (str): The username of the user.
        password (str): The password of the user.
        user_pool_id (str): The ID of the Cognito user pool. Kept for interface
            compatibility; the USER_PASSWORD_AUTH request below does not send it.
        client_id (str): The ID of the Cognito user pool client.

    Returns:
        bool: True if the response contains an AuthenticationResult, False
        otherwise. Every failure (bad credentials, unknown user, or any other
        error while creating the client or calling Cognito) yields False rather
        than an exception.
    """

    # Creating the client can fail by itself (for example when no AWS region is
    # configured). Treat that as a failed login instead of letting the error
    # escape from the login callback.
    try:
        client = boto3.client('cognito-idp')  # Cognito Identity Provider client
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    try:
        response = client.initiate_auth(
            AuthFlow='USER_PASSWORD_AUTH',
            AuthParameters={
                'USERNAME': username,
                'PASSWORD': password,
            },
            ClientId=client_id
        )
    except (client.exceptions.NotAuthorizedException,
            client.exceptions.UserNotFoundException):
        # Wrong password / unknown user: an ordinary rejected login, no logging.
        return False
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    # If successful, you'll receive an AuthenticationResult in the response
    return bool(response.get('AuthenticationResult'))
|
tools/helper_functions.py
CHANGED
@@ -109,3 +109,62 @@ def add_folder_to_path(folder_path: str):
|
|
109 |
print(f"Directory {folder_path} already exists in PATH.")
|
110 |
else:
|
111 |
print(f"Folder not found at {folder_path} - not added to PATH")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
print(f"Directory {folder_path} already exists in PATH.")
|
110 |
else:
|
111 |
print(f"Folder not found at {folder_path} - not added to PATH")
|
112 |
+
|
113 |
+
async def get_connection_params(request: gr.Request):
    """Work out the session identifier and output folder for a request.

    If both the CUSTOM_CLOUDFRONT_HEADER and CUSTOM_CLOUDFRONT_HEADER_VALUE
    environment variables are non-empty, the request must carry that header
    with exactly that value.

    The identifier is taken from the first available source:
      1. request.username (direct Cognito login)   -> folder "user-files/<id>/"
      2. the 'x-cognito-id' request header          -> folder "user-files/<id>/"
      3. request.session_hash                       -> folder "temp-files/<id>/"

    Args:
        request (gr.Request): The request object supplied by Gradio.

    Returns:
        tuple: (out_session_hash, output_folder). ("", "") when request is falsy.

    Raises:
        ValueError: If the CloudFront header check is configured and the header
            is missing from the request or its value does not match.
    """
    if not request:
        print("No session parameters found.")
        return "", ""

    print("Session hash:", request.session_hash)

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')

    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
        # A configured header that is absent must be rejected as well; otherwise
        # the check could be bypassed by simply not sending the header.
        if CUSTOM_CLOUDFRONT_HEADER_var not in request.headers:
            raise ValueError("Custom Cloudfront header not found in request.")

        supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
        if supplied_cloudfront_custom_value != CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
            # Must be raise ValueError(...): the previous raise(ValueError, "...")
            # form raises a tuple, which is itself a TypeError in Python 3.
            raise ValueError("Custom Cloudfront header value does not match expected value.")

        print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)

    # Get output save folder from 1 - username passed in from direct Cognito login,
    # 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
    if request.username:
        out_session_hash = request.username
        # Previously unset on this branch, which made the concatenation below
        # fail with UnboundLocalError for every logged-in user. Authenticated
        # users share the same base folder as the Cognito ID header case.
        base_folder = "user-files/"
        print("Request username found:", out_session_hash)

    elif 'x-cognito-id' in request.headers:
        out_session_hash = request.headers['x-cognito-id']
        base_folder = "user-files/"
        print("Cognito ID found:", out_session_hash)

    else:
        out_session_hash = request.session_hash
        base_folder = "temp-files/"

    output_folder = base_folder + out_session_hash + "/"

    return out_session_hash, output_folder
|