import glob import os import random from pathlib import Path from pdf2image import convert_from_path from tqdm import tqdm random.seed(42) def convert_pdf_to_images(pdf_file: str, save_folder: str): """ Convert each page of a pdf to a jpg image and save them in a folder. Args: - pdf_file (str): path to the pdf file - save_folder (str): path to the folder where the images will be saved """ images = convert_from_path(pdf_file) for i, image in enumerate(images): if not os.path.exists(save_folder): os.makedirs(save_folder) image.save(os.path.join(save_folder, f"page_{i+1}.jpg"), "JPEG") def convert_all_pdfs_to_images(path_to_folder: str, n_samples: int = 0): """ Convert all pdfs in a folder and its subfolder to images and save them in a folder. It will sample n_samples pdf files in each subfolder, allowing to have granularity on the number of pdf files to convert. Args: - path_to_folder (str): path to the folder containing the pdf files - n_samples (int): number of pdf files to sample in each subfolder directory structure: - path_to_folder - subfolder1 - pdf1 - pdf2 - ... - subfolder2 - pdf1 - pdf2 - ... - ... """ # take n_samples pdf files in each subfolder : I want to take 10 pdf files from each subfolder sub_dirs = [d for d in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, d))] sampled_files = [] for sub_dir in sub_dirs: pdf_files = glob.glob(os.path.join(path_to_folder, sub_dir, "*.pdf")) if (n_samples == 0) or (len(pdf_files) <= n_samples): print(f"Taking all pdf files in {sub_dir}") sampled_files.extend(pdf_files) else: print(f"Taking {n_samples} pdf files in {sub_dir}") sampled_files.extend(random.sample(pdf_files, n_samples)) pdf_files = [str(file) for file in sampled_files] # Create an empty text file that will contain the file paths of the corrupted pdf files dirpath_corrupted = Path(path_to_folder) / "corrupted_pdf_files.txt" dirpath_corrupted.parent.mkdir(parents=True, exist_ok=True) with dirpath_corrupted.open("w") as f: with tqdm(total=len(pdf_files)) as pbar: for pdf_file in pdf_files: pbar.set_description(f"Processing {pdf_file}") save_folder = os.path.join("pages_extracted", *Path(pdf_file).parts[-2:]) if not os.path.exists(os.path.join(path_to_folder, save_folder)): try: convert_pdf_to_images(pdf_file, os.path.join(path_to_folder, save_folder)) except Exception as e: print(f"Error converting {pdf_file}: {e}") f.write(pdf_file) f.write("\n") pbar.update(1) return