davanstrien HF staff committed on
Commit
f43467c
•
1 Parent(s): f74505d

update pdf library

Browse files
Files changed (1) hide show
  1. app.py +51 -34
app.py CHANGED
@@ -3,56 +3,73 @@ import random
3
  import shutil
4
  import tempfile
5
  import zipfile
 
6
  from datetime import datetime
7
 
 
8
  import gradio as gr
9
- from huggingface_hub import HfApi, DatasetCard, DatasetCardData
10
- from pdf2image import convert_from_path
11
- from PyPDF2 import PdfReader
12
  from dataset_card_template import DATASET_CARD_TEMPLATE
13
 
14
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
18
  if not os.path.exists(temp_dir):
19
  os.makedirs(temp_dir)
 
20
  progress(0, desc="Starting conversion")
21
  all_images = []
22
  skipped_pdfs = []
23
- for pdf_file in progress.tqdm(pdf_files, desc="Converting PDFs"):
24
- try:
25
- pdf_path = pdf_file.name
26
- pdf = PdfReader(pdf_path)
27
- total_pages = len(pdf.pages)
28
-
29
- # Determine the number of pages to convert
30
- pages_to_convert = (
31
- total_pages if sample_size == 0 else min(sample_size, total_pages)
32
- )
33
 
34
- # Select random pages if sampling
35
- if sample_size > 0 and sample_size < total_pages:
36
- selected_pages = sorted(
37
- random.sample(range(1, total_pages + 1), pages_to_convert)
38
- )
 
 
 
 
 
 
 
 
 
39
  else:
40
- selected_pages = range(1, total_pages + 1)
41
-
42
- # Convert selected PDF pages to images
43
- for page_num in selected_pages:
44
- images = convert_from_path(
45
- pdf_path, first_page=page_num, last_page=page_num
46
- )
47
- for image in images:
48
- image_path = os.path.join(
49
- temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num}.jpg"
50
- )
51
- image.save(image_path, "JPEG")
52
- all_images.append(image_path)
53
- except Exception as e:
54
- skipped_pdfs.append(pdf_file.name)
55
- gr.Info(f"Skipped PDF {pdf_file.name} due to error: {str(e)}")
56
 
57
  message = f"Saved {len(all_images)} images to temporary directory"
58
  if skipped_pdfs:
 
3
  import shutil
4
  import tempfile
5
  import zipfile
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from datetime import datetime
8
 
9
+ import fitz # PyMuPDF
10
  import gradio as gr
11
+ from huggingface_hub import DatasetCard, DatasetCardData, HfApi
12
+
 
13
  from dataset_card_template import DATASET_CARD_TEMPLATE
14
 
15
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
16
 
17
 
18
def process_pdf(pdf_file, sample_size, temp_dir):
    """Render pages of a single PDF to PNG files inside ``temp_dir``.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.
        sample_size: Number of pages to pick at random. ``0`` (or any value
            that is not strictly between 0 and the page count) converts
            every page.
        temp_dir: Existing directory the PNG files are written to.

    Returns:
        A ``(image_paths, error)`` tuple. On success ``image_paths`` lists
        the written files and ``error`` is ``None``; on any failure the
        tuple is ``([], "Error processing <path>: <reason>")``.
    """
    try:
        pdf_path = pdf_file.name
        base_name = os.path.basename(pdf_path)

        # The context manager closes the document even when rendering a page
        # raises, so a failing PDF does not leak its file handle.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

            # Page indices are 0-based here; sampled pages keep document order.
            if 0 < sample_size < total_pages:
                selected_pages = sorted(
                    random.sample(range(total_pages), sample_size)
                )
            else:
                selected_pages = range(total_pages)

            images = []
            for page_num in selected_pages:
                pix = doc[page_num].get_pixmap()
                # File names use 1-based page numbers.
                image_path = os.path.join(
                    temp_dir, f"{base_name}_page_{page_num + 1}.png"
                )
                pix.save(image_path)
                images.append(image_path)

        return images, None
    except Exception as e:
        # Best-effort boundary: report the failure to the caller instead of
        # aborting the whole batch.
        return [], f"Error processing {pdf_file.name}: {str(e)}"
47
+
48
+
49
  def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
50
  if not os.path.exists(temp_dir):
51
  os.makedirs(temp_dir)
52
+
53
  progress(0, desc="Starting conversion")
54
  all_images = []
55
  skipped_pdfs = []
 
 
 
 
 
 
 
 
 
 
56
 
57
+ with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
58
+ future_to_pdf = {
59
+ executor.submit(process_pdf, pdf_file, sample_size, temp_dir): pdf_file
60
+ for pdf_file in pdf_files
61
+ }
62
+
63
+ for future in progress.tqdm(
64
+ as_completed(future_to_pdf), total=len(pdf_files), desc="Converting PDFs"
65
+ ):
66
+ pdf_file = future_to_pdf[future]
67
+ images, error = future.result()
68
+ if error:
69
+ skipped_pdfs.append(pdf_file.name)
70
+ gr.Info(error)
71
  else:
72
+ all_images.extend(images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  message = f"Saved {len(all_images)} images to temporary directory"
75
  if skipped_pdfs: