seanpedrickcase committed on
Commit
43287c3
·
1 Parent(s): 452d304

PDF page conversion is now done with one call per page, hopefully avoiding FastAPI timeouts on AWS. The gunicorn keep_alive parameter has also been extended to 60 seconds in case that helps too.

Browse files
Dockerfile CHANGED
@@ -30,7 +30,7 @@ RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
30
  # Switch to the "user" user
31
  USER user
32
 
33
- # Set home to the user's home directory
34
  ENV HOME=/home/user \
35
  PATH=/home/user/.local/bin:$PATH \
36
  PYTHONPATH=$HOME/app \
@@ -42,6 +42,8 @@ ENV HOME=/home/user \
42
  GRADIO_THEME=huggingface \
43
  #GRADIO_TEMP_DIR=$HOME/tmp \
44
  #GRADIO_ROOT_PATH=/address-match \
 
 
45
  SYSTEM=spaces
46
 
47
  # Set the working directory to the user's home directory
 
30
  # Switch to the "user" user
31
  USER user
32
 
33
+ # Set environment variables
34
  ENV HOME=/home/user \
35
  PATH=/home/user/.local/bin:$PATH \
36
  PYTHONPATH=$HOME/app \
 
42
  GRADIO_THEME=huggingface \
43
  #GRADIO_TEMP_DIR=$HOME/tmp \
44
  #GRADIO_ROOT_PATH=/address-match \
45
+ # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
46
+ KEEP_ALIVE=60 \
47
  SYSTEM=spaces
48
 
49
  # Set the working directory to the user's home directory
tools/file_conversion.py CHANGED
@@ -1,6 +1,7 @@
1
- from pdf2image import convert_from_path
2
  from PIL import Image
3
  import os
 
4
 
5
  def is_pdf_or_image(filename):
6
  """
@@ -33,10 +34,29 @@ def is_pdf(filename):
33
  # %%
34
  ## Convert pdf to image if necessary
35
 
36
- def convert_pdf_to_images(pdf_path):
37
 
38
- # Convert PDF to a list of images
39
- images = convert_from_path(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  # Save each image as a separate file - deprecated
42
  #image_paths = []
 
1
+ from pdf2image import convert_from_path, pdfinfo_from_path
2
  from PIL import Image
3
  import os
4
+ from gradio import Progress
5
 
6
  def is_pdf_or_image(filename):
7
  """
 
34
  # %%
35
  ## Convert pdf to image if necessary
36
 
37
+ def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
38
 
39
+ # Get the number of pages in the PDF
40
+ page_count = pdfinfo_from_path(pdf_path)['Pages']
41
+
42
+ images = []
43
+
44
+ # Convert the PDF one page at a time
45
+ for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
46
+
47
+ # Convert one page to image
48
+ image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
49
+
50
+ # If no images are returned, break the loop
51
+ if not image:
52
+ break
53
+
54
+ # # Convert PDF to a list of images
55
+ # images = convert_from_path(pdf_path)
56
+
57
+ # images = []
58
+
59
+ images.extend(image)
60
 
61
  # Save each image as a separate file - deprecated
62
  #image_paths = []
tools/file_redaction.py CHANGED
@@ -79,7 +79,7 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
79
 
80
  return images
81
 
82
- def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress()):
83
  '''
84
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
85
  '''
 
79
 
80
  return images
81
 
82
+ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
83
  '''
84
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
85
  '''