seanpedrickcase committed on
Commit
bc22fc4
1 Parent(s): 7810536

Added possibility to do authentication with AWS Cognito on load. Other minor changes.

Browse files
.dockerignore CHANGED
@@ -13,4 +13,5 @@ tesseract/*
13
  poppler/*
14
  build/*
15
  dist/*
16
- build_deps/*
 
 
13
  poppler/*
14
  build/*
15
  dist/*
16
+ build_deps/*
17
+ doc_redaction_amplify_app/*
.gitignore CHANGED
@@ -13,4 +13,5 @@ tesseract/*
13
  poppler/*
14
  build/*
15
  dist/*
16
- build_deps/*
 
 
13
  poppler/*
14
  build/*
15
  dist/*
16
+ build_deps/*
17
+ doc_redaction_amplify_app/*
DocRedactApp_0.2.spec ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller build spec for the document redaction app (one-folder build).
from PyInstaller.utils.hooks import collect_data_files

# Name shared by the executable and the output folder created by COLLECT.
APP_NAME = 'DocRedactApp_0.2'

# Data files copied into the bundle: the local tesseract and poppler folders,
# followed by the data files of gradio_client and gradio.
datas = [
    ('tesseract/', 'tesseract/'),
    ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/'),
    *collect_data_files('gradio_client'),
    *collect_data_files('gradio'),
]

a = Analysis(
    ['app.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
    hookspath=['build_deps'],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
    # Collect the gradio package as source .py files.
    module_collection_mode={'gradio': 'py'},
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name=APP_NAME,
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name=APP_NAME,
)
README.md CHANGED
@@ -10,9 +10,7 @@ license: mit
10
  ---
11
 
12
  # Introduction
13
- Redact PDF files using image-based OCR or direct text analysis from pdfminer.six. Personal information identification performed using Microsoft Presidio.
14
-
15
- Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
16
 
17
  WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
18
 
 
10
  ---
11
 
12
  # Introduction
13
+ Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
 
 
14
 
15
  WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
16
 
app.py CHANGED
@@ -3,15 +3,16 @@ import os
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df
7
  from tools.file_redaction import choose_and_run_redactor
8
  from tools.file_conversion import prepare_image_or_text_pdf
9
  from tools.data_anonymise import do_anonymise
 
10
  #from tools.aws_functions import load_data_from_aws
11
  import gradio as gr
12
 
13
- add_folder_to_path("_internal/tesseract/")
14
- add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")
15
 
16
  ensure_output_folder_exists()
17
 
@@ -20,14 +21,17 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
20
  language = 'en'
21
 
22
  # Create the gradio interface
23
- block = gr.Blocks(theme = gr.themes.Base())
24
 
25
- with block:
26
 
27
  prepared_pdf_state = gr.State([])
28
  output_image_files_state = gr.State([])
29
  output_file_list_state = gr.State([])
30
 
 
 
 
31
  gr.Markdown(
32
  """
33
  # Document redaction
@@ -116,7 +120,15 @@ with block:
116
  in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
117
  match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
118
 
 
119
 
120
  # Launch the Gradio app
 
 
 
121
  if __name__ == "__main__":
122
- block.queue().launch(show_error=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
 
 
 
 
 
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
7
  from tools.file_redaction import choose_and_run_redactor
8
  from tools.file_conversion import prepare_image_or_text_pdf
9
  from tools.data_anonymise import do_anonymise
10
+ from tools.auth import authenticate_user
11
  #from tools.aws_functions import load_data_from_aws
12
  import gradio as gr
13
 
14
+ add_folder_to_path("tesseract/")
15
+ add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
16
 
17
  ensure_output_folder_exists()
18
 
 
21
  language = 'en'
22
 
23
  # Create the gradio interface
24
+ app = gr.Blocks(theme = gr.themes.Base())
25
 
26
+ with app:
27
 
28
  prepared_pdf_state = gr.State([])
29
  output_image_files_state = gr.State([])
30
  output_file_list_state = gr.State([])
31
 
32
+ session_hash_state = gr.State()
33
+ s3_output_folder_state = gr.State()
34
+
35
  gr.Markdown(
36
  """
37
  # Document redaction
 
120
  in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
121
  match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
122
 
123
+ app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
124
 
125
  # Launch the Gradio app
126
# COGNITO_AUTH selects the launch mode: "1" puts the app behind a login form
# checked by authenticate_user; any other value launches it without a login
# and opens it in the browser.
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

if __name__ == "__main__":
    # Use the value resolved above rather than indexing os.environ again, so
    # the branch cannot raise KeyError and always agrees with the printed value.
    if COGNITO_AUTH == "1":
        app.queue().launch(show_error=True, auth=authenticate_user)
    else:
        app.queue().launch(show_error=True, inbrowser=True)
how_to_create_exe_dist.txt CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
14
 
15
  9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.1 app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
@@ -28,7 +28,7 @@ a = Analysis(
28
  }
29
  )
30
 
31
- c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.1.spec
32
 
33
 
34
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
 
14
 
15
  9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.2 app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
 
28
  }
29
  )
30
 
31
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.spec
32
 
33
 
34
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
tools/auth.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import boto3
from tools.helper_functions import get_or_create_env_var

# Cognito app client ID, resolved through get_or_create_env_var with a
# built-in default (the default is borrowed from the async gradio app client).
client_id = get_or_create_env_var('AWS_CLIENT_ID', 'l762du1rg94e1r2q0ii7ls0ef')
print(f'The value of AWS_CLIENT_ID is {client_id}')

# Cognito user pool ID, resolved the same way.
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'eu-west-2_8fCzl8qej')
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')


def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
    """Check a username/password pair against an AWS Cognito app client.

    Args:
        username (str): The username of the user.
        password (str): The password of the user.
        user_pool_id (str): The ID of the Cognito user pool. Accepted for
            interface compatibility; the body does not use it.
        client_id (str): The ID of the Cognito user pool client used for the
            USER_PASSWORD_AUTH flow.

    Returns:
        bool: True if Cognito returns an AuthenticationResult, False if the
        credentials are rejected, the user does not exist, or any other
        error occurs (other errors are printed before returning).
    """
    cognito = boto3.client('cognito-idp')  # Cognito Identity Provider client

    try:
        auth_response = cognito.initiate_auth(
            ClientId=client_id,
            AuthFlow='USER_PASSWORD_AUTH',
            AuthParameters={'USERNAME': username, 'PASSWORD': password},
        )
        # A successful sign-in carries an AuthenticationResult entry.
        return bool(auth_response.get('AuthenticationResult'))
    except (
        cognito.exceptions.NotAuthorizedException,
        cognito.exceptions.UserNotFoundException,
    ):
        # Wrong password or unknown user: a plain rejection, nothing to log.
        return False
    except Exception as e:
        print(f"An error occurred: {e}")
        return False
tools/helper_functions.py CHANGED
@@ -109,3 +109,62 @@ def add_folder_to_path(folder_path: str):
109
  print(f"Directory {folder_path} already exists in PATH.")
110
  else:
111
  print(f"Folder not found at {folder_path} - not added to PATH")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  print(f"Directory {folder_path} already exists in PATH.")
110
  else:
111
  print(f"Folder not found at {folder_path} - not added to PATH")
112
+
113
async def get_connection_params(request: gr.Request):
    """Work out the session identifier and output folder for a request.

    The identifier is chosen in this order of preference:
      1. ``request.username`` (direct Cognito login),
      2. the ``x-cognito-id`` request header (passed through a Lambda
         authenticator),
      3. ``request.session_hash``.

    The first two are stored under ``user-files/``; the session-hash
    fallback is stored under ``temp-files/``.

    If both the CUSTOM_CLOUDFRONT_HEADER and CUSTOM_CLOUDFRONT_HEADER_VALUE
    settings are non-empty and the request carries that header, its value
    must match the expected value.

    Args:
        request (gr.Request): The Gradio request for the current session.

    Returns:
        tuple[str, str]: ``(session identifier, output folder)``; two empty
        strings when no request object is supplied.

    Raises:
        ValueError: If the custom Cloudfront header is present but its value
            does not match the expected value.
    """
    if not request:
        print("No session parameters found.")
        return "", ""

    print("Session hash:", request.session_hash)

    # Retrieve (or create with an empty default) the header name and the
    # value expected in it.
    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')

    # NOTE(review): a request that omits the header entirely is still let
    # through, as in the original code - confirm whether it should be rejected.
    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
            if supplied_cloudfront_custom_value != CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
                # Raise an exception instance; `raise(ValueError, "...")`
                # raises a tuple, which is a TypeError in Python 3.
                raise ValueError("Custom Cloudfront header value does not match expected value.")
            print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)

    if request.username:
        out_session_hash = request.username
        # Previously unset in this branch, which caused UnboundLocalError
        # below for every directly logged-in user.
        base_folder = "user-files/"
        print("Request username found:", out_session_hash)
    elif 'x-cognito-id' in request.headers:
        out_session_hash = request.headers['x-cognito-id']
        base_folder = "user-files/"
        print("Cognito ID found:", out_session_hash)
    else:
        out_session_hash = request.session_hash
        base_folder = "temp-files/"

    output_folder = base_folder + out_session_hash + "/"

    return out_session_hash, output_folder