Added -y to poppler-utils installation in Dockerfile. Added support for image files in image-based redaction.
37d982e
from pdf2image import convert_from_path | |
from PIL import Image | |
import os | |
def is_pdf_or_image(filename):
    """
    Check if a file name is a PDF or an image file.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or
        ".png" (compared case-insensitively), False otherwise.
    """
    # ".jpeg" is included so this check accepts the same image extensions
    # that process_file handles ('.jpg', '.jpeg', '.png').
    # str.endswith takes a tuple, so a single call covers every suffix.
    return filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png"))
def is_pdf(filename):
    """
    Tell whether a file name carries a PDF extension.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True when the name ends with ".pdf" (any letter case),
        False otherwise.
    """
    # Normalise case first so ".PDF" and ".Pdf" are matched too
    lowered_name = filename.lower()
    return lowered_name.endswith(".pdf")
# %% | |
## Convert pdf to image if necessary | |
def convert_pdf_to_images(pdf_path):
    """
    Convert a PDF file into a list of images.

    Args:
        pdf_path (str): Path of the PDF file to convert.

    Returns:
        The value returned by pdf2image's convert_from_path for the given
        path (a list of images).
    """
    # The images are handed back to the caller directly; the earlier
    # approach of saving each page as a separate PNG file is deprecated.
    page_images = convert_from_path(pdf_path)

    print("PDF has been converted to images.")

    return page_images
# %% | |
def process_file(file_path):
    """
    Turn an image or PDF file into a list of images, chosen by extension.

    Args:
        file_path (str): Path of the file to process.

    Returns:
        list: For '.jpg', '.jpeg' and '.png' files, a one-element list
        holding the result of Image.open on the path. For '.pdf' files,
        the result of convert_pdf_to_images. For any other extension,
        a list containing a single empty string.
    """
    # Only the final extension matters, compared case-insensitively
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    if suffix in ('.jpg', '.jpeg', '.png'):
        print(f"{file_path} is an image file.")
        # Wrapped in a list so images and PDFs share the same return shape
        return [Image.open(file_path)]

    # Unsupported type: report it and return the placeholder value
    print(f"{file_path} is not an image or PDF file.")
    return ['']