Spaces:

seanpedrickcase
/

document_redaction

Running

seanpedrickcase committed on May 13, 2024

Commit

43287c3

1 Parent(s): 452d304

PDF page conversion is now done with page-by-page calls, hopefully avoiding FastAPI timeouts on AWS. The gunicorn keep_alive parameter has also been extended to 60 seconds, just in case that helps too.

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -30,7 +30,7 @@ RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
 # Switch to the "user" user
 USER user
-# Set home to the user's home directory
 ENV HOME=/home/user \
 	PATH=/home/user/.local/bin:$PATH \
     PYTHONPATH=$HOME/app \
@@ -42,6 +42,8 @@ ENV HOME=/home/user \
 	GRADIO_THEME=huggingface \
 	#GRADIO_TEMP_DIR=$HOME/tmp \
 	#GRADIO_ROOT_PATH=/address-match \
 	SYSTEM=spaces
 # Set the working directory to the user's home directory

 # Switch to the "user" user
 USER user
+# Set environment variables
 ENV HOME=/home/user \
 	PATH=/home/user/.local/bin:$PATH \
     PYTHONPATH=$HOME/app \
 	GRADIO_THEME=huggingface \
 	#GRADIO_TEMP_DIR=$HOME/tmp \
 	#GRADIO_ROOT_PATH=/address-match \
+	# gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
+	KEEP_ALIVE=60 \
 	SYSTEM=spaces
 # Set the working directory to the user's home directory

tools/file_conversion.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from pdf2image import convert_from_path
 from PIL import Image
 import os
 def is_pdf_or_image(filename):
     """
@@ -33,10 +34,29 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path):
-    # Convert PDF to a list of images
-    images = convert_from_path(pdf_path)
     # Save each image as a separate file - deprecated
     #image_paths = []

+from pdf2image import convert_from_path, pdfinfo_from_path
 from PIL import Image
 import os
+from gradio import Progress
 def is_pdf_or_image(filename):
     """
 # %%
 ## Convert pdf to image if necessary
+def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
+    # Get the number of pages in the PDF
+    page_count = pdfinfo_from_path(pdf_path)['Pages']
+    images = []
+    # Convert the PDF one page at a time, reporting progress as each page is processed
+    for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+        # Convert one page to image
+        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
+        # If no images are returned, break the loop
+        if not image:
+            break
+        # # Convert PDF to a list of images
+        # images = convert_from_path(pdf_path)
+        # images = []
+        images.extend(image)
     # Save each image as a separate file - deprecated
     #image_paths = []

tools/file_redaction.py CHANGED Viewed

@@ -79,7 +79,7 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
     return images
-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress()):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''

     return images
+def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''