|
import os |
|
import random |
|
import shutil |
|
import tempfile |
|
import zipfile |
|
from datetime import datetime |
|
|
|
import gradio as gr |
|
from huggingface_hub import HfApi, DatasetCard, DatasetCardData |
|
from pdf2image import convert_from_path |
|
from PyPDF2 import PdfReader |
|
from dataset_card_template import DATASET_CARD_TEMPLATE |
|
|
|
# Request the accelerated `hf_transfer` backend for Hub transfers.
# NOTE(review): this runs after `huggingface_hub` has already been imported
# above; if the library reads the variable at import time the setting may have
# no effect — confirm, and move it above the imports if so.
os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
|
|
|
|
|
def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
    """Render pages of the given PDFs to JPEG files inside ``temp_dir``.

    Args:
        pdf_files: Iterable of uploaded file objects; each must expose the
            PDF's filesystem path as ``.name``.
        sample_size: Number of pages to convert per PDF. ``0``, ``None`` or a
            negative value converts every page. A positive value smaller
            than the page count converts that many randomly chosen pages
            (in ascending page order); otherwise every page is converted.
        temp_dir: Output directory for the JPEGs; created if missing.
        progress: Gradio progress tracker used to report conversion progress.

    Returns:
        A ``(image_paths, message)`` tuple: the paths of all saved JPEGs and
        a human-readable summary that also lists PDFs skipped due to errors.
    """
    os.makedirs(temp_dir, exist_ok=True)

    # The number field can deliver None (left empty) or a float. Both would
    # raise inside the loop below (min()/random.sample) and, because of the
    # broad per-PDF handler, silently skip every PDF. Normalize once up front.
    sample_size = int(sample_size or 0)

    progress(0, desc="Starting conversion")
    all_images = []
    skipped_pdfs = []
    for pdf_file in progress.tqdm(pdf_files, desc="Converting PDFs"):
        try:
            pdf_path = pdf_file.name
            total_pages = len(PdfReader(pdf_path).pages)

            # Page numbers are 1-based, matching convert_from_path's
            # first_page/last_page arguments.
            if 0 < sample_size < total_pages:
                selected_pages = sorted(
                    random.sample(range(1, total_pages + 1), sample_size)
                )
            else:
                selected_pages = range(1, total_pages + 1)

            pdf_name = os.path.basename(pdf_path)
            for page_num in selected_pages:
                # Convert one page at a time so only a single rendered page
                # is held in memory.
                images = convert_from_path(
                    pdf_path, first_page=page_num, last_page=page_num
                )
                for image in images:
                    image_path = os.path.join(
                        temp_dir, f"{pdf_name}_page_{page_num}.jpg"
                    )
                    image.save(image_path, "JPEG")
                    all_images.append(image_path)
        except Exception as e:
            # Best-effort batch: one unreadable PDF must not abort the rest.
            # Record it and surface the reason to the user.
            skipped_pdfs.append(pdf_file.name)
            gr.Info(f"Skipped PDF {pdf_file.name} due to error: {e}")

    message = f"Saved {len(all_images)} images to temporary directory"
    if skipped_pdfs:
        message += f"\nSkipped {len(skipped_pdfs)} PDFs due to errors: {', '.join(skipped_pdfs)}"
    return all_images, message
|
|
|
|
|
def get_size_category(num_images):
    """Return the Hugging Face Hub ``size_categories`` tag for a dataset size.

    Args:
        num_images: Number of items in the dataset.

    Returns:
        One of the Hub's size tags, e.g. ``"n<1K"`` or ``"1K<n<10K"``.
        Each bucket's upper bound is exclusive, so exactly 1000 items map to
        ``"1K<n<10K"``.
    """
    # (exclusive upper bound, tag), in ascending order. The buckets above 1M
    # follow the Hub's tag scheme; a plain "n>1M" is not one of its values.
    buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper_bound, tag in buckets:
        if num_images < upper_bound:
            return tag
    return "n>1T"
|
|
|
|
|
def _upload_dataset_to_hub(
    folder_path, hf_repo, private_repo, token, num_images, num_pdfs, sample_size
):
    """Create the dataset repo, upload ``folder_path`` and push a dataset card.

    Returns the URL of the dataset repository. Any exception raised by the
    Hub client propagates to the caller.
    """
    hf_api = HfApi(token=token)
    hf_api.create_repo(
        hf_repo,
        repo_type="dataset",
        private=private_repo,
    )
    hf_api.upload_large_folder(
        folder_path=folder_path,
        repo_id=hf_repo,
        repo_type="dataset",
    )

    size_category = get_size_category(num_images)
    card_data = DatasetCardData(
        tags=["created-with-pdfs-to-page-images-converter", "pdf-to-image"],
        size_categories=[size_category],
    )

    # Computed once so the card metadata and the card body always agree
    # (including the timestamp).
    card_fields = {
        "hf_repo": hf_repo,
        "num_images": num_images,
        "num_pdfs": num_pdfs,
        "sample_size": sample_size if sample_size > 0 else "All pages",
        "creation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    card = DatasetCard.from_template(card_data, template_path=None, **card_fields)
    # The card body comes from the project's own template; from_template
    # above is used for the YAML metadata header.
    card.text = DATASET_CARD_TEMPLATE.format(
        size_category=size_category, **card_fields
    )
    card.push_to_hub(hf_repo, token=token)

    return f"https://huggingface.co/datasets/{hf_repo}"


def process_pdfs(
    pdf_files,
    sample_size,
    hf_repo,
    create_zip,
    private_repo,
    oauth_token: gr.OAuthToken | None,
    progress=gr.Progress(),
):
    """Convert uploaded PDFs to page images, optionally zip and upload them.

    Args:
        pdf_files: Uploaded PDF file objects (each exposing ``.name``).
        sample_size: Pages to convert per PDF; ``0``/``None`` means all pages.
        hf_repo: Target dataset repo id (``username/repo-name``); when empty,
            nothing is uploaded.
        create_zip: Whether to bundle the images into a ZIP file.
        private_repo: Whether the created dataset repo should be private.
        oauth_token: Hugging Face OAuth token of the logged-in user, or
            ``None`` when the user is not logged in.
        progress: Gradio progress tracker.

    Returns:
        A ``(images, zip_path, message)`` tuple for the gallery, the download
        component and the status text. On validation failure or an
        unexpected error the first two entries are ``None``.
    """
    if not pdf_files:
        return (
            None,
            None,
            gr.Markdown(
                "⚠️ No PDF files uploaded. Please upload at least one PDF file."
            ),
        )

    if oauth_token is None:
        return (
            None,
            None,
            gr.Markdown(
                "⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
            ),
        )

    # Sentinel so the error handler knows whether there is anything to clean.
    temp_dir = None
    try:
        # The number field may deliver None (left empty) or a float; the
        # comparisons and sampling downstream need a plain int.
        sample_size = int(sample_size or 0)

        temp_dir = tempfile.mkdtemp()
        images_dir = os.path.join(temp_dir, "images")
        os.makedirs(images_dir)

        progress(0, desc="Starting PDF processing")
        images, message = pdf_to_images(pdf_files, sample_size, images_dir)

        zip_path = None
        if create_zip:
            zip_path = os.path.join(temp_dir, "converted_images.zip")
            with zipfile.ZipFile(zip_path, "w") as zipf:
                progress(0, desc="Zipping images")
                for image in progress.tqdm(images, desc="Zipping images"):
                    # Store flat: archive entries carry only the file name.
                    zipf.write(image, os.path.basename(image))
            message += f"\nCreated zip file with {len(images)} images"

        if hf_repo:
            try:
                repo_url = _upload_dataset_to_hub(
                    temp_dir,
                    hf_repo,
                    private_repo,
                    oauth_token.token,
                    num_images=len(images),
                    num_pdfs=len(pdf_files),
                    sample_size=sample_size,
                )
                # Report success only after the card push has actually
                # succeeded, so a failed push is not announced as uploaded.
                message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
            except Exception as e:
                # An upload failure must not discard the local conversion
                # results; report it in the status message instead.
                message += f"\nFailed to upload to Hugging Face: {e}"

        # temp_dir is intentionally kept on success: the returned image and
        # zip paths point into it.
        return images, zip_path, message
    except Exception as e:
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None, f"An error occurred: {e}"
|
|
|
|
|
|
|
# Gradio UI: upload PDFs, choose options, run the conversion and show results.
with gr.Blocks() as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> PDFs to Page Images Converter</h1>
    <center><i> 📁 Convert PDFs to an image dataset, splitting pages into individual images 📁 </i></center>"""
    )
    gr.Markdown(
        """
    This app allows you to:
    1. Upload one or more PDF files
    2. Convert each page of the PDFs into separate image files
    3. (Optionally) sample a specific number of pages from each PDF
    4. (Optionally) Create a downloadable ZIP file of the converted images
    5. (Optionally) Upload the images to a Hugging Face dataset repository
    """
    )

    with gr.Row():
        # Login supplies the OAuth token that process_pdfs receives through
        # its `oauth_token` parameter.
        gr.LoginButton(size="sm")

    with gr.Row():
        # Gradio file filters are extensions (".pdf") or type names, not
        # glob patterns like "*.pdf".
        pdf_files = gr.File(
            file_count="multiple", label="Upload PDF(s)", file_types=[".pdf"]
        )
    with gr.Row():
        # Default to 0 ("all pages") as the label promises, and use
        # precision=0 so the handler receives an int rather than None/float.
        sample_size = gr.Number(
            value=0,
            precision=0,
            label="Pages per PDF (0 for all pages)",
            info="Specify how many pages to convert from each PDF. Use 0 to convert all pages.",
        )
        hf_repo = gr.Textbox(
            label="Hugging Face Repo",
            placeholder="username/repo-name",
            info="Enter the Hugging Face repository name in the format 'username/repo-name'",
        )
    with gr.Row():
        create_zip = gr.Checkbox(label="Create ZIP file of images?", value=False)
        private_repo = gr.Checkbox(label="Make repository private?", value=False)
    with gr.Accordion("View converted images", open=False):
        output_gallery = gr.Gallery(label="Converted Images")
    status_text = gr.Markdown(label="Status")
    download_button = gr.File(label="Download Converted Images")

    submit_button = gr.Button("Convert PDFs to page images")
    # `oauth_token` and `progress` are not listed as inputs: process_pdfs
    # declares them via its type annotation / default value.
    submit_button.click(
        process_pdfs,
        inputs=[pdf_files, sample_size, hf_repo, create_zip, private_repo],
        outputs=[output_gallery, download_button, status_text],
    )


# Only start the server when run as a script, so importing this module
# (e.g. from tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch(debug=True)
|
|