seanpedrickcase
committed on
Commit
•
2a4b347
1
Parent(s):
3810d26
Version 0.1. Adapted code for pyinstaller local executable conversion (Windows)
Browse files- .dockerignore +6 -1
- .gitignore +6 -1
- DocRedactApp_0.1.spec +52 -0
- app.py +7 -3
- how_to_create_exe_dist.txt +38 -0
- tools/helper_functions.py +35 -0
- tools/load_spacy_model_custom_recognisers.py +12 -2
.dockerignore
CHANGED
@@ -8,4 +8,9 @@ examples/*
|
|
8 |
processing/*
|
9 |
output/*
|
10 |
tools/__pycache__/*
|
11 |
-
old_code/*
|
|
|
|
|
|
|
|
|
|
|
|
8 |
processing/*
|
9 |
output/*
|
10 |
tools/__pycache__/*
|
11 |
+
old_code/*
|
12 |
+
tesseract/*
|
13 |
+
poppler/*
|
14 |
+
build/*
|
15 |
+
dist/*
|
16 |
+
build_deps/*
|
.gitignore
CHANGED
@@ -8,4 +8,9 @@ examples/*
|
|
8 |
processing/*
|
9 |
output/*
|
10 |
tools/__pycache__/*
|
11 |
-
old_code/*
|
|
|
|
|
|
|
|
|
|
|
|
8 |
processing/*
|
9 |
output/*
|
10 |
tools/__pycache__/*
|
11 |
+
old_code/*
|
12 |
+
tesseract/*
|
13 |
+
poppler/*
|
14 |
+
build/*
|
15 |
+
dist/*
|
16 |
+
build_deps/*
|
DocRedactApp_0.1.spec
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec file for the DocRedactApp_0.1 build of app.py.
# Analysis, PYZ, EXE and COLLECT are not imported here; they are expected to be
# provided by PyInstaller when it executes this spec.
from PyInstaller.utils.hooks import collect_data_files

# Each tuple pairs a local folder with the folder name it gets inside the bundle.
# app.py adds "_internal/tesseract/" and the poppler "Library/bin/" folder to PATH
# at start-up, so these two folders must be shipped with the app.
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
# Add the data files shipped inside the gradio_client and gradio packages.
datas += collect_data_files('gradio_client')
datas += collect_data_files('gradio')


a = Analysis(
    ['app.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    # Modules named explicitly for inclusion - presumably because PyInstaller's
    # import analysis does not find them on its own (TODO confirm).
    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
    # Folder holding the project's own hook- files (see how_to_create_exe_dist.txt).
    hookspath=['build_deps'],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
    module_collection_mode={
        'gradio': 'py',  # Collect gradio package as source .py files
    }
)
pyz = PYZ(a.pure)

# exclude_binaries=True keeps binaries out of the exe itself; they are gathered
# next to it by the COLLECT step below instead.
exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='DocRedactApp_0.1',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    # Console stays enabled: the how-to guide tells users to read the app's
    # address from the command prompt window.
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
# Gather the exe together with its binaries and data files under one output name.
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='DocRedactApp_0.1',
)
|
app.py
CHANGED
@@ -3,12 +3,16 @@ import os
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
|
|
6 |
from tools.file_redaction import choose_and_run_redactor
|
7 |
from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
|
8 |
from tools.aws_functions import load_data_from_aws
|
9 |
import gradio as gr
|
10 |
|
11 |
-
|
|
|
|
|
|
|
12 |
|
13 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
14 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
@@ -57,8 +61,8 @@ with block:
|
|
57 |
with gr.Accordion(label = "AWS data access", open = True):
|
58 |
aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
59 |
with gr.Row():
|
60 |
-
in_aws_file = gr.Dropdown(label="Choose
|
61 |
-
load_aws_data_button = gr.Button(value="Load
|
62 |
|
63 |
aws_log_box = gr.Textbox(label="AWS data load status")
|
64 |
|
|
|
3 |
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
|
4 |
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
|
5 |
|
6 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
|
7 |
from tools.file_redaction import choose_and_run_redactor
|
8 |
from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
|
9 |
from tools.aws_functions import load_data_from_aws
|
10 |
import gradio as gr
|
11 |
|
12 |
+
add_folder_to_path("_internal/tesseract/")
|
13 |
+
add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")
|
14 |
+
|
15 |
+
ensure_output_folder_exists()
|
16 |
|
17 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
18 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
|
|
61 |
with gr.Accordion(label = "AWS data access", open = True):
|
62 |
aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
63 |
with gr.Row():
|
64 |
+
in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
|
65 |
+
load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
|
66 |
|
67 |
aws_log_box = gr.Textbox(label="AWS data load status")
|
68 |
|
how_to_create_exe_dist.txt
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
1. Create minimal environment to run the app in conda. E.g. 'conda create --name new_env'
|
2 |
+
|
3 |
+
2. Activate the environment 'conda activate new_env'
|
4 |
+
|
5 |
+
3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'
|
6 |
+
|
7 |
+
NOTE: for ensuring that spaCy models are loaded into the program correctly in requirements.txt, follow this guide: https://spacy.io/usage/models#models-download
|
8 |
+
|
9 |
+
6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder
|
10 |
+
|
11 |
+
7. pip install pyinstaller
|
12 |
+
|
13 |
+
8. In command line, cd to the folder that contains app.py.
|
14 |
+
|
15 |
+
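9. Run the following. This creates a one-folder build (the .exe plus a folder of its dependencies); see the note under step a) if you want one single .exe file instead (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):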
|
16 |
+
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.1 app.py
|
18 |
+
|
19 |
+
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
+
|
21 |
+
|
22 |
+
b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
|
23 |
+
|
24 |
+
a = Analysis(
|
25 |
+
...
|
26 |
+
module_collection_mode={
|
27 |
+
'gradio': 'py', # Collect gradio package as source .py files
|
28 |
+
}
|
29 |
+
)
|
30 |
+
|
31 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.1.spec
|
32 |
+
|
33 |
+
|
34 |
+
9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\DocRedactApp_0.1').
|
35 |
+
|
36 |
+
10. In 'dist\DocRedactApp_0.1' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
|
37 |
+
|
38 |
+
11. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
|
tools/helper_functions.py
CHANGED
@@ -10,3 +10,38 @@ def get_file_path_end(file_path):
|
|
10 |
#print(filename_without_extension)
|
11 |
|
12 |
return filename_without_extension
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
#print(filename_without_extension)
|
11 |
|
12 |
return filename_without_extension
|
13 |
+
|
14 |
+
def ensure_output_folder_exists():
    """Make sure the 'output/' folder exists in the current working directory.

    Creates the folder when it is missing and prints which of the two cases
    applied. Returns nothing.

    Raises:
        FileExistsError: if something named 'output' exists but is not a
            directory, so the problem surfaces here rather than later when
            the app tries to write results into it.
    """
    folder_name = "output/"

    # Remember the state only for the status message; the creation itself does
    # not depend on this check.
    already_present = os.path.isdir(folder_name)

    # exist_ok=True makes the call safe even if the folder appears between the
    # check above and this line (e.g. two instances starting together).
    os.makedirs(folder_name, exist_ok=True)

    if already_present:
        print("The 'output/' folder already exists.")
    else:
        print("Created the 'output/' folder.")
|
25 |
+
|
26 |
+
|
27 |
+
# Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
|
28 |
+
def add_folder_to_path(folder_path: str):
    """Prepend a folder to this process's PATH environment variable.

    Used by the locally built executable: app.py calls this for the bundled
    tesseract and poppler folders so that those tools can be found at runtime.

    Args:
        folder_path: Relative or absolute path of the folder to add.

    Returns nothing. os.environ['PATH'] is updated in place when the folder
    exists and is not already listed; otherwise PATH is left untouched. A
    status message is printed in every case.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # Resolve relative path to absolute path
    absolute_path = os.path.abspath(folder_path)

    # PATH can be missing from the environment altogether; treat that as empty
    # instead of raising KeyError.
    current_path = os.environ.get('PATH', '')

    def _comparable(path: str) -> str:
        # Ignore trailing separators and (on Windows) letter case so the same
        # folder written two ways is recognised as a duplicate.
        return os.path.normcase(os.path.normpath(path))

    known_entries = {
        _comparable(entry)
        for entry in current_path.split(os.pathsep)
        if entry
    }

    if _comparable(absolute_path) in known_entries:
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Avoid a dangling separator when PATH was empty: on POSIX an empty PATH
    # entry means "current directory".
    if current_path:
        full_path_extension = absolute_path + os.pathsep + current_path
    else:
        full_path_extension = absolute_path

    os.environ['PATH'] = full_path_extension
    print("Updated PATH with: ", full_path_extension)
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -3,6 +3,8 @@ from typing import List
|
|
3 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
4 |
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
5 |
import spacy
|
|
|
|
|
6 |
import re
|
7 |
|
8 |
# %%
|
@@ -136,8 +138,16 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
|
136 |
self.nlp = {"en": loaded_spacy_model}
|
137 |
|
138 |
# %%
|
139 |
-
# Load
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
143 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
|
|
3 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
4 |
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
5 |
import spacy
|
6 |
+
spacy.prefer_gpu()
|
7 |
+
from spacy.cli.download import download
|
8 |
import re
|
9 |
|
10 |
# %%
|
|
|
138 |
self.nlp = {"en": loaded_spacy_model}
|
139 |
|
140 |
# %%
|
141 |
+
# Load spacy model
|
142 |
+
# Prefer the en_core_web_lg model that is already installed as a package; only
# fall back to downloading it when that fails.
try:
    import en_core_web_lg
    nlp = en_core_web_lg.load()
    print("Successfully imported spaCy model")

except Exception as load_error:
    # Catch Exception rather than using a bare except, so that Ctrl+C
    # (KeyboardInterrupt) and SystemExit still stop the program instead of
    # triggering a download.
    print(f"Could not load installed spaCy model ({load_error}). Trying to download it instead.")
    # NOTE(review): the download presumably needs an internet connection and a
    # working pip - confirm this fallback is acceptable for the packaged .exe.
    download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
    print("Successfully downloaded and imported spaCy model")
|
151 |
|
152 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
153 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|