Spaces:

seanpedrickcase
/

document_redaction

Running

seanpedrickcase committed on May 20, 2024

Commit

dce6100

1 Parent(s): 72a4f68

Added TLDExtract cache files so that an internet connection is not required

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -26,6 +26,7 @@ RUN useradd -m -u 1000 user
 # Make output folder
 RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
 # Switch to the "user" user
 USER user
@@ -40,6 +41,7 @@ ENV HOME=/home/user \
 	GRADIO_SERVER_NAME=0.0.0.0 \
 	GRADIO_SERVER_PORT=7860 \
 	GRADIO_THEME=huggingface \
 	#GRADIO_TEMP_DIR=$HOME/tmp \
 	#GRADIO_ROOT_PATH=/address-match \
 	# gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout

 # Make output folder
 RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
+RUN mkdir -p /home/user/app/tld && chown -R user:user /home/user/app/tld
 # Switch to the "user" user
 USER user
 	GRADIO_SERVER_NAME=0.0.0.0 \
 	GRADIO_SERVER_PORT=7860 \
 	GRADIO_THEME=huggingface \
+	TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
 	#GRADIO_TEMP_DIR=$HOME/tmp \
 	#GRADIO_ROOT_PATH=/address-match \
 	# gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout

app.py CHANGED Viewed

@@ -1,8 +1,11 @@
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 from tools.aws_functions import load_data_from_aws
-from typing import List
-import pandas as pd
 import gradio as gr
 #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #

+import os
+# By default TLDExtract will try to pull files from the internet. The cache file has instead been saved locally (at the TLDEXTRACT_CACHE path set below) so that no internet connection is required.
+os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 from tools.aws_functions import load_data_from_aws
 import gradio as gr
 #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #

tld/.tld_set_snapshot ADDED Viewed

The diff for this file is too large to render. See raw diff