seanpedrickcase committed on
Commit
dce6100
·
1 Parent(s): 72a4f68

Added TLDExtract cache files so that an internet connection is not required

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -0
  2. app.py +5 -2
  3. tld/.tld_set_snapshot +0 -0
Dockerfile CHANGED
@@ -26,6 +26,7 @@ RUN useradd -m -u 1000 user
26
 
27
  # Make output folder
28
  RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
 
29
 
30
  # Switch to the "user" user
31
  USER user
@@ -40,6 +41,7 @@ ENV HOME=/home/user \
40
  GRADIO_SERVER_NAME=0.0.0.0 \
41
  GRADIO_SERVER_PORT=7860 \
42
  GRADIO_THEME=huggingface \
 
43
  #GRADIO_TEMP_DIR=$HOME/tmp \
44
  #GRADIO_ROOT_PATH=/address-match \
45
  # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
 
26
 
27
  # Make output folder
28
  RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
29
+ RUN mkdir -p /home/user/app/tld && chown -R user:user /home/user/app/tld
30
 
31
  # Switch to the "user" user
32
  USER user
 
41
  GRADIO_SERVER_NAME=0.0.0.0 \
42
  GRADIO_SERVER_PORT=7860 \
43
  GRADIO_THEME=huggingface \
44
+ TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
45
  #GRADIO_TEMP_DIR=$HOME/tmp \
46
  #GRADIO_ROOT_PATH=/address-match \
47
  # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
app.py CHANGED
@@ -1,8 +1,11 @@
 
 
 
 
 
1
  from tools.file_redaction import choose_and_run_redactor
2
  from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
3
  from tools.aws_functions import load_data_from_aws
4
- from typing import List
5
- import pandas as pd
6
  import gradio as gr
7
 
8
  #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
 
1
+ import os
2
+
3
+ # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
+ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
+
6
  from tools.file_redaction import choose_and_run_redactor
7
  from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
8
  from tools.aws_functions import load_data_from_aws
 
 
9
  import gradio as gr
10
 
11
  #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
tld/.tld_set_snapshot ADDED
The diff for this file is too large to render. See raw diff