|
import gradio as gr |
|
import os |
|
import random |
|
import tempfile |
|
from pdf2image import convert_from_path |
|
from PyPDF2 import PdfReader |
|
from huggingface_hub import create_repo, upload_folder, HfApi |
|
|
|
|
|
def pdf_to_images(pdf_files, sample_size, temp_dir):
    """Render pages of the given PDFs to JPEG files inside ``temp_dir``.

    Args:
        pdf_files: Iterable of uploaded-file objects. Each one must expose
            the on-disk path of its PDF through a ``name`` attribute.
        sample_size: Number of pages to render per PDF. A value of ``0``
            (or any non-positive value) renders every page. A positive
            value smaller than the page count renders that many randomly
            chosen pages; a value at or above the page count renders all
            pages.
        temp_dir: Directory that receives the JPEG files. It is created if
            it does not exist yet.

    Returns:
        A ``(image_paths, status_message)`` tuple, where ``image_paths`` is
        the list of JPEG paths written (in PDF order, then page order).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(temp_dir, exist_ok=True)

    # random.sample() rejects a float count, and UI widgets may deliver whole
    # numbers as floats, so normalise once up front.
    sample_size = int(sample_size)

    all_images = []
    used_prefixes = set()

    for pdf_file in pdf_files:
        pdf_path = pdf_file.name
        total_pages = len(PdfReader(pdf_path).pages)

        # Page numbers are 1-based because that is what first_page/last_page
        # below are given.
        if 0 < sample_size < total_pages:
            selected_pages = sorted(
                random.sample(range(1, total_pages + 1), sample_size)
            )
        else:
            selected_pages = range(1, total_pages + 1)

        # Two uploads may share a basename (same file name, different
        # folders). Keep the plain basename for the first one and add a
        # numeric suffix for later ones so their images do not overwrite
        # each other.
        base_name = os.path.basename(pdf_path)
        prefix = base_name
        duplicate = 1
        while prefix in used_prefixes:
            prefix = f"{base_name}_{duplicate}"
            duplicate += 1
        used_prefixes.add(prefix)

        for page_num in selected_pages:
            # Convert one page per call so only a single rendered page is
            # held in memory at a time.
            images = convert_from_path(
                pdf_path, first_page=page_num, last_page=page_num
            )
            for image in images:
                image_path = os.path.join(
                    temp_dir, f"{prefix}_page_{page_num}.jpg"
                )
                image.save(image_path, "JPEG")
                all_images.append(image_path)

    return all_images, f"Saved {len(all_images)} images to temporary directory"
|
|
|
|
|
def process_pdfs(pdf_files, sample_size, hf_repo, oauth_token: gr.OAuthToken | None):
    """Convert uploaded PDFs to images and optionally push them to the Hub.

    Args:
        pdf_files: Uploaded PDF file objects; passed through to
            ``pdf_to_images``.
        sample_size: Pages to sample per PDF (``0`` means all pages); passed
            through to ``pdf_to_images``.
        hf_repo: Dataset repo id (``"username/repo-name"``) to upload the
            images to. Empty or whitespace-only skips the upload.
        oauth_token: Token object of the logged-in user, or ``None`` when
            nobody is logged in. A login is required even when ``hf_repo``
            is empty.

    Returns:
        A ``(image_paths, status_message)`` tuple. ``image_paths`` is
        ``None`` when nothing was converted (no files, not logged in, or the
        conversion failed). An upload failure is reported in the message but
        still returns the converted images.
    """
    import shutil  # only needed here, to clean up after a failed conversion

    if not pdf_files:
        return None, "No PDF files uploaded."

    if oauth_token is None:
        return None, "Please log in to upload to Hugging Face."

    # The image paths are handed back to the caller, so the files must still
    # exist after this function returns. A TemporaryDirectory context manager
    # would delete them on exit; mkdtemp leaves the directory in place. It is
    # intentionally not removed on success.
    work_dir = tempfile.mkdtemp()
    try:
        images_dir = os.path.join(work_dir, "images")
        os.makedirs(images_dir)
        images, message = pdf_to_images(pdf_files, sample_size, images_dir)
    except Exception as e:
        # Nothing usable was produced, so do not leave the directory behind.
        shutil.rmtree(work_dir, ignore_errors=True)
        return None, f"An error occurred: {str(e)}"

    hf_repo = (hf_repo or "").strip()
    if hf_repo:
        # The token is a credential: never print or log it.
        try:
            create_repo(
                hf_repo,
                repo_type="dataset",
                token=oauth_token.token,
                # Allow uploading into a repo that already exists instead of
                # failing before the upload is attempted.
                exist_ok=True,
            )
            upload_folder(
                folder_path=images_dir,
                repo_id=hf_repo,
                repo_type="dataset",
                path_in_repo="images",
                token=oauth_token.token,
            )
            message += (
                f"\nUploaded images to Hugging Face repo: {hf_repo}/images"
            )
        except Exception as e:
            # Best effort: the local conversion succeeded, so report the
            # upload problem without discarding the images.
            message += f"\nFailed to upload to Hugging Face: {str(e)}"

    return images, message
|
|
|
|
|
|
|
# Assemble the web UI: header text, login button, the three inputs, the two
# outputs, and the button that runs the conversion.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to Image Converter")
    gr.Markdown(
        "Upload PDF(s), convert pages to images, and optionally upload them to a Hugging Face repo. If a sample size is specified, random pages will be selected."
    )

    # Login control; process_pdfs refuses to run without a logged-in user.
    with gr.Row():
        gr.LoginButton(size="sm")

    # Inputs, laid out side by side.
    with gr.Row():
        pdf_files = gr.File(label="Upload PDF(s)", file_count="multiple")
        sample_size = gr.Slider(
            label="Sample Size (0 for all pages)",
            value=0,
            minimum=0,
            maximum=50,
            step=1,
        )
        hf_repo = gr.Textbox(
            placeholder="username/repo-name", label="Hugging Face Repo"
        )

    # Outputs: the rendered pages and a one-line status report.
    output_gallery = gr.Gallery(label="Converted Images")
    status_text = gr.Textbox(label="Status")

    # process_pdfs also takes an oauth_token parameter that is not listed in
    # `inputs` -- presumably Gradio supplies it from the login session based
    # on the parameter's type annotation; confirm against the Gradio docs.
    submit_button = gr.Button("Process PDFs")
    submit_button.click(
        fn=process_pdfs,
        inputs=[pdf_files, sample_size, hf_repo],
        outputs=[output_gallery, status_text],
    )

# Start the app server (runs whenever this module is executed or imported).
demo.launch(debug=True)
|
|