|
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
|
|
from PIL import Image
|
|
import os
|
|
import time
|
|
from gradio import Progress
|
|
from typing import List, Optional
|
|
|
|
def is_pdf_or_image(filename):
    """
    Check if a file name is a PDF or an image file.

    The check is case-insensitive and only looks at the end of the name.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf", ".jpg", ".jpeg", or ".png", False otherwise.
    """
    # str.endswith accepts a tuple of suffixes, so the name is lower-cased
    # once and every accepted suffix is tested in a single call.
    return filename.lower().endswith((".pdf", ".jpg", ".jpeg", ".png"))
|
|
|
|
def is_pdf(filename):
    """
    Tell whether a file name refers to a PDF, judged by how the name ends.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True when the lower-cased name ends with ".pdf", False otherwise.
    """
    pdf_suffix = ".pdf"
    # Lower-case first so that ".PDF" and ".Pdf" are accepted as well.
    lowered_name = filename.lower()
    return lowered_name.endswith(pdf_suffix)
|
|
|
|
|
|
|
|
|
|
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
    """
    Convert the pages of a PDF to images, one page per conversion call.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        page_min (int): Zero-based index of the first page to convert. Defaults to 0.
        progress (Progress): Gradio progress object. It is not referenced in the
            body; presumably it is kept in the signature so Gradio can track
            progress for this call - confirm against the callers.

    Returns:
        list: The images returned by convert_from_path for each converted page,
        in page order. Conversion stops at the first page that yields no image,
        so the list can be shorter than the requested page range.
    """
    page_count = pdfinfo_from_path(pdf_path)['Pages']
    print("Number of pages in PDF: ", str(page_count))

    images = []

    for page_num in range(page_min, page_count):
        # page_num is zero-based; first_page/last_page and the log messages
        # use the one-based page number.
        page_number = page_num + 1
        print("Converting page: ", str(page_number))

        image = convert_from_path(pdf_path, first_page=page_number, last_page=page_number, dpi=300, use_cropbox=True, use_pdftocairo=False)

        if not image:
            # Report the same one-based number as the progress message above
            # (previously the zero-based index was printed here).
            print("Conversion of page", str(page_number), "to file failed.")
            break

        images.extend(image)

    print("PDF has been converted to images.")

    return images
|
|
|
|
|
|
|
|
def process_file(file_path):
    """
    Load a file as a list of images, choosing the loader from its extension.

    Args:
        file_path (str): Path of the file to load.

    Returns:
        list: A one-element list with the opened image for ".jpg", ".jpeg" and
        ".png" files, the result of convert_pdf_to_images for ".pdf" files, or
        [''] for any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    if suffix in ('.jpg', '.jpeg', '.png'):
        print(f"{file_path} is an image file.")
        return [Image.open(file_path)]

    # Anything else is reported and replaced by a placeholder entry.
    print(f"{file_path} is not an image or PDF file.")
    return ['']
|
|
|
|
def prepare_image_or_text_pdf(
    file_paths: List[str],
    in_redact_method: str,
    in_allow_list: Optional[List[List[str]]] = None,
    latest_file_completed: int = 0,
    out_message: Optional[List[str]] = None,
    first_loop_state: bool = False,
    progress: Progress = Progress(track_tqdm=True)
) -> tuple[str, list]:
    """
    Prepare a single image or PDF file for redaction.

    Each call handles exactly one entry of file_paths: the one at index
    latest_file_completed. Image files always use the "Image analysis" method,
    whatever method was requested.

    Args:
        file_paths (List[str]): Files to process. Each entry may be a path string
            or an object exposing the path as a `.name` attribute.
        in_redact_method (str): The redaction method to use: "Image analysis",
            "AWS Textract" or "Text analysis".
        in_allow_list (Optional[List[List[str]]]): List of allowed terms for
            redaction. Not referenced in this function.
        latest_file_completed (int): Index of the file to prepare in this call.
        out_message (Optional[List[str]]): Messages accumulated by earlier calls.
            A single string is accepted too. The argument is never mutated.
        first_loop_state (bool): When true, the index and the accumulated
            messages are reset before processing.
        progress (Progress): Gradio progress object. Not referenced in this function.

    Returns:
        tuple[str, list]: The output message, and a list holding the prepared
        file: the PDF path for "Text analysis", or the list returned by
        process_file for the image-based methods. The list is empty when every
        file has already been handled or when validation fails.
    """
    tic = time.perf_counter()

    out_file_paths = []

    # Always work on a fresh list: the previous mutable default ([]) was shared
    # between calls, and appending to a caller-supplied list changed it in place.
    if first_loop_state:
        latest_file_completed = 0
        messages = []
    else:
        print("Now attempting file:", str(latest_file_completed))
        if out_message is None:
            messages = []
        elif isinstance(out_message, str):
            messages = [out_message]
        else:
            messages = list(out_message)

    if not file_paths:
        file_paths = []

    latest_file_completed = int(latest_file_completed)

    # Nothing left to prepare: hand back the messages gathered so far.
    if latest_file_completed >= len(file_paths):
        print("Last file reached, returning files:", str(latest_file_completed))
        return '\n'.join(messages), out_file_paths

    file = file_paths[latest_file_completed]
    # Accept plain path strings (as the annotation promises) as well as
    # uploaded-file objects that carry the path in `.name`.
    file_path = file if isinstance(file, str) else file.name

    # Validate the path before any helper tries to parse it.
    if not file_path:
        error_message = "No file selected"
        print(error_message)
        return error_message, out_file_paths

    file_path_without_ext = get_file_path_end(file_path)
    file_extension = os.path.splitext(file_path)[1].lower()

    # Image files can only be handled by image analysis.
    if file_extension in ('.jpg', '.jpeg', '.png'):
        in_redact_method = "Image analysis"

    if in_redact_method in ("Image analysis", "AWS Textract"):
        if not is_pdf_or_image(file_path):
            error_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
            print(error_message)
            return error_message, out_file_paths

        out_file_path = process_file(file_path)

    elif in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            error_message = "Please upload a PDF file for text analysis."
            print(error_message)
            return error_message, out_file_paths

        out_file_path = file_path

    else:
        # Previously an unknown method fell through and crashed with an
        # UnboundLocalError on out_file_path; report it like other input errors.
        error_message = "Please choose a valid redaction method: 'Image analysis', 'AWS Textract' or 'Text analysis'."
        print(error_message)
        return error_message, out_file_paths

    out_file_paths.append(out_file_path)

    toc = time.perf_counter()
    out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
    print(out_time)

    messages.append(out_time)

    return '\n'.join(messages), out_file_paths
|
|
|
|
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    """
    Save an image-based copy of a redacted text PDF.

    The first entry of out_text_file_path is loaded through process_file and the
    resulting page images are written as one multi-page PDF into output_folder,
    named after in_file_path with the suffix "_text_redacted_as_img.pdf".

    Args:
        in_file_path (str): Path of the original input file; only used to build
            the output file name and the message.
        out_text_file_path (List[str]): Paths of redacted text PDFs; only the
            first entry is converted.

    Returns:
        tuple: The status message and a one-element list with the path of the
        image-based PDF.
    """
    base_name = get_file_path_end(in_file_path)

    print("Creating image version of redacted PDF to embed redactions.")

    page_images = process_file(out_text_file_path[0])
    image_pdf_path = output_folder + base_name + "_text_redacted_as_img.pdf"

    # The first image drives the save; the remaining ones are appended as extra pages.
    first_page, remaining_pages = page_images[0], page_images[1:]
    first_page.save(image_pdf_path, "PDF", resolution=300.0, save_all=True, append_images=remaining_pages)

    out_message = "PDF " + base_name + " converted to image-based file."
    print(out_message)

    return out_message, [image_pdf_path]