Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /file_conversion.py

seanpedrickcase

Added -y to poppler-utils installation in Dockerfile. Added support for image files in image-based redaction.

37d982e 7 months ago

raw

history blame

2.05 kB

	from pdf2image import convert_from_path
	from PIL import Image
	import os

	def is_pdf_or_image(filename):
	    """
	    Check if a file name is a PDF or a supported image file.

	    The comparison is case-insensitive and looks only at the end of the
	    name; the file itself is never opened.

	    Args:
	        filename (str): The name of the file.

	    Returns:
	        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or ".png", False otherwise.
	    """
	    # ".jpeg" is accepted so this check agrees with process_file, which
	    # treats ".jpeg" files as images.
	    return filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png"))

	def is_pdf(filename):
	    """
	    Report whether a file name has a PDF extension.

	    Args:
	        filename (str): The name of the file.

	    Returns:
	        bool: True when the lower-cased name ends with ".pdf", False otherwise.
	    """
	    suffix = ".pdf"
	    lowered_name = filename.lower()
	    # Comparing the trailing slice is the same check as str.endswith(suffix).
	    return lowered_name[-len(suffix):] == suffix

	# %%
	## Convert PDF to images if necessary

	def convert_pdf_to_images(pdf_path):
	    """
	    Render a PDF file as images via pdf2image.

	    Args:
	        pdf_path (str): Path of the PDF file to convert.

	    Returns:
	        The value returned by pdf2image's convert_from_path for the file
	        (a list of images; presumably one PIL image per page - confirm
	        against the pdf2image documentation).
	    """
	    # Pages are kept in memory only; saving each page to disk as a PNG
	    # was the earlier approach and is deprecated.
	    page_images = convert_from_path(pdf_path)

	    print("PDF has been converted to images.")

	    return page_images

	# %%
	def process_file(file_path):
	    """
	    Load a file as a list of images, chosen by its extension.

	    Args:
	        file_path (str): Path of the file to load.

	    Returns:
	        For ".jpg", ".jpeg" or ".png" files, a one-element list holding the
	        image opened with PIL. For ".pdf" files, whatever
	        convert_pdf_to_images returns. For any other extension, [''].
	    """
	    # The extension is compared in lower case, so ".PDF" and ".pdf" are
	    # treated alike.
	    _, extension = os.path.splitext(file_path)
	    extension = extension.lower()

	    if extension == '.pdf':
	        print(f"{file_path} is a PDF file. Converting to image set")
	        return convert_pdf_to_images(file_path)

	    if extension in ('.jpg', '.jpeg', '.png'):
	        print(f"{file_path} is an image file.")
	        return [Image.open(file_path)]

	    # Unsupported type: a placeholder list is returned instead of raising.
	    print(f"{file_path} is not an image or PDF file.")
	    return ['']