seanpedrickcase
committed on
Commit
·
43287c3
1
Parent(s):
452d304
PDF page conversion is now done with page-by-page calls, hopefully avoiding FastAPI timeouts on AWS. The gunicorn keep_alive parameter has also been extended to 60 seconds in case that helps too.
Browse files- Dockerfile +3 -1
- tools/file_conversion.py +24 -4
- tools/file_redaction.py +1 -1
Dockerfile
CHANGED
@@ -30,7 +30,7 @@ RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
|
|
30 |
# Switch to the "user" user
|
31 |
USER user
|
32 |
|
33 |
-
# Set
|
34 |
ENV HOME=/home/user \
|
35 |
PATH=/home/user/.local/bin:$PATH \
|
36 |
PYTHONPATH=$HOME/app \
|
@@ -42,6 +42,8 @@ ENV HOME=/home/user \
|
|
42 |
GRADIO_THEME=huggingface \
|
43 |
#GRADIO_TEMP_DIR=$HOME/tmp \
|
44 |
#GRADIO_ROOT_PATH=/address-match \
|
|
|
|
|
45 |
SYSTEM=spaces
|
46 |
|
47 |
# Set the working directory to the user's home directory
|
|
|
30 |
# Switch to the "user" user
|
31 |
USER user
|
32 |
|
33 |
+
# Set environmental variables
|
34 |
ENV HOME=/home/user \
|
35 |
PATH=/home/user/.local/bin:$PATH \
|
36 |
PYTHONPATH=$HOME/app \
|
|
|
42 |
GRADIO_THEME=huggingface \
|
43 |
#GRADIO_TEMP_DIR=$HOME/tmp \
|
44 |
#GRADIO_ROOT_PATH=/address-match \
|
45 |
+
# gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
|
46 |
+
KEEP_ALIVE=60 \
|
47 |
SYSTEM=spaces
|
48 |
|
49 |
# Set the working directory to the user's home directory
|
tools/file_conversion.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
-
from pdf2image import convert_from_path
|
2 |
from PIL import Image
|
3 |
import os
|
|
|
4 |
|
5 |
def is_pdf_or_image(filename):
|
6 |
"""
|
@@ -33,10 +34,29 @@ def is_pdf(filename):
|
|
33 |
# %%
|
34 |
## Convert pdf to image if necessary
|
35 |
|
36 |
-
def convert_pdf_to_images(pdf_path):
|
37 |
|
38 |
-
#
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
# Save each image as a separate file - deprecated
|
42 |
#image_paths = []
|
|
|
1 |
+
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
from PIL import Image
|
3 |
import os
|
4 |
+
from gradio import Progress
|
5 |
|
6 |
def is_pdf_or_image(filename):
|
7 |
"""
|
|
|
34 |
# %%
|
35 |
## Convert pdf to image if necessary
|
36 |
|
37 |
+
def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
|
38 |
|
39 |
+
# Get the number of pages in the PDF
|
40 |
+
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
41 |
+
|
42 |
+
images = []
|
43 |
+
|
44 |
+
# Open the PDF file
|
45 |
+
for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
46 |
+
|
47 |
+
# Convert one page to image
|
48 |
+
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
|
49 |
+
|
50 |
+
# If no images are returned, break the loop
|
51 |
+
if not image:
|
52 |
+
break
|
53 |
+
|
54 |
+
# # Convert PDF to a list of images
|
55 |
+
# images = convert_from_path(pdf_path)
|
56 |
+
|
57 |
+
# images = []
|
58 |
+
|
59 |
+
images.extend(image)
|
60 |
|
61 |
# Save each image as a separate file - deprecated
|
62 |
#image_paths = []
|
tools/file_redaction.py
CHANGED
@@ -79,7 +79,7 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
|
|
79 |
|
80 |
return images
|
81 |
|
82 |
-
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress()):
|
83 |
'''
|
84 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
85 |
'''
|
|
|
79 |
|
80 |
return images
|
81 |
|
82 |
+
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
83 |
'''
|
84 |
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
85 |
'''
|