davanstrien HF staff committed on
Commit
90cd056
•
1 Parent(s): ca4bcc4

add try except

Browse files
Files changed (1) hide show
  1. app.py +34 -26
app.py CHANGED
@@ -19,37 +19,45 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
19
  os.makedirs(temp_dir)
20
  progress(0, desc="Starting conversion")
21
  all_images = []
 
22
  for pdf_file in progress.tqdm(pdf_files, desc="Converting PDFs"):
23
- pdf_path = pdf_file.name
24
- pdf = PdfReader(pdf_path)
25
- total_pages = len(pdf.pages)
26
-
27
- # Determine the number of pages to convert
28
- pages_to_convert = (
29
- total_pages if sample_size == 0 else min(sample_size, total_pages)
30
- )
31
-
32
- # Select random pages if sampling
33
- if sample_size > 0 and sample_size < total_pages:
34
- selected_pages = sorted(
35
- random.sample(range(1, total_pages + 1), pages_to_convert)
36
  )
37
- else:
38
- selected_pages = range(1, total_pages + 1)
39
 
40
- # Convert selected PDF pages to images
41
- for page_num in selected_pages:
42
- images = convert_from_path(
43
- pdf_path, first_page=page_num, last_page=page_num
44
- )
45
- for image in images:
46
- image_path = os.path.join(
47
- temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num}.jpg"
48
  )
49
- image.save(image_path, "JPEG")
50
- all_images.append(image_path)
51
 
52
- return all_images, f"Saved {len(all_images)} images to temporary directory"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
 
55
  def get_size_category(num_images):
 
19
  os.makedirs(temp_dir)
20
  progress(0, desc="Starting conversion")
21
  all_images = []
22
+ skipped_pdfs = []
23
  for pdf_file in progress.tqdm(pdf_files, desc="Converting PDFs"):
24
+ try:
25
+ pdf_path = pdf_file.name
26
+ pdf = PdfReader(pdf_path)
27
+ total_pages = len(pdf.pages)
28
+
29
+ # Determine the number of pages to convert
30
+ pages_to_convert = (
31
+ total_pages if sample_size == 0 else min(sample_size, total_pages)
 
 
 
 
 
32
  )
 
 
33
 
34
+ # Select random pages if sampling
35
+ if sample_size > 0 and sample_size < total_pages:
36
+ selected_pages = sorted(
37
+ random.sample(range(1, total_pages + 1), pages_to_convert)
 
 
 
 
38
  )
39
+ else:
40
+ selected_pages = range(1, total_pages + 1)
41
 
42
+ # Convert selected PDF pages to images
43
+ for page_num in selected_pages:
44
+ images = convert_from_path(
45
+ pdf_path, first_page=page_num, last_page=page_num
46
+ )
47
+ for image in images:
48
+ image_path = os.path.join(
49
+ temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num}.jpg"
50
+ )
51
+ image.save(image_path, "JPEG")
52
+ all_images.append(image_path)
53
+ except Exception as e:
54
+ skipped_pdfs.append(pdf_file.name)
55
+ gr.Info(f"Skipped PDF {pdf_file.name} due to error: {str(e)}")
56
+
57
+ message = f"Saved {len(all_images)} images to temporary directory"
58
+ if skipped_pdfs:
59
+ message += f"\nSkipped {len(skipped_pdfs)} PDFs due to errors: {', '.join(skipped_pdfs)}"
60
+ return all_images, message
61
 
62
 
63
  def get_size_category(num_images):