davanstrien HF staff committed on
Commit
78b9829
•
1 Parent(s): 7183f70

add progress

Browse files
Files changed (1) hide show
  1. app.py +11 -6
app.py CHANGED
@@ -15,7 +15,7 @@ from dataset_card_template import DATASET_CARD_TEMPLATE
15
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
16
 
17
 
18
- def process_pdf(pdf_file, sample_size, temp_dir):
19
  try:
20
  pdf_path = pdf_file.name
21
  doc = fitz.open(pdf_path)
@@ -31,7 +31,7 @@ def process_pdf(pdf_file, sample_size, temp_dir):
31
  )
32
 
33
  images = []
34
- for page_num in selected_pages:
35
  page = doc[page_num]
36
  pix = page.get_pixmap()
37
  image_path = os.path.join(
@@ -39,6 +39,10 @@ def process_pdf(pdf_file, sample_size, temp_dir):
39
  )
40
  pix.save(image_path)
41
  images.append(image_path)
 
 
 
 
42
 
43
  doc.close()
44
  return images, None
@@ -56,13 +60,13 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
56
 
57
  with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
58
  future_to_pdf = {
59
- executor.submit(process_pdf, pdf_file, sample_size, temp_dir): pdf_file
 
 
60
  for pdf_file in pdf_files
61
  }
62
 
63
- for future in progress.tqdm(
64
- as_completed(future_to_pdf), total=len(pdf_files), desc="Converting PDFs"
65
- ):
66
  pdf_file = future_to_pdf[future]
67
  images, error = future.result()
68
  if error:
@@ -70,6 +74,7 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
70
  gr.Info(error)
71
  else:
72
  all_images.extend(images)
 
73
 
74
  message = f"Saved {len(all_images)} images to temporary directory"
75
  if skipped_pdfs:
 
15
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
16
 
17
 
18
+ def process_pdf(pdf_file, sample_size, temp_dir, progress=gr.Progress()):
19
  try:
20
  pdf_path = pdf_file.name
21
  doc = fitz.open(pdf_path)
 
31
  )
32
 
33
  images = []
34
+ for i, page_num in enumerate(selected_pages):
35
  page = doc[page_num]
36
  pix = page.get_pixmap()
37
  image_path = os.path.join(
 
39
  )
40
  pix.save(image_path)
41
  images.append(image_path)
42
+ progress(
43
+ (i + 1) / len(selected_pages),
44
+ desc=f"Converting {os.path.basename(pdf_path)}",
45
+ )
46
 
47
  doc.close()
48
  return images, None
 
60
 
61
  with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
62
  future_to_pdf = {
63
+ executor.submit(
64
+ process_pdf, pdf_file, sample_size, temp_dir, progress
65
+ ): pdf_file
66
  for pdf_file in pdf_files
67
  }
68
 
69
+ for i, future in enumerate(as_completed(future_to_pdf)):
 
 
70
  pdf_file = future_to_pdf[future]
71
  images, error = future.result()
72
  if error:
 
74
  gr.Info(error)
75
  else:
76
  all_images.extend(images)
77
+ progress((i + 1) / len(pdf_files), desc="Converting PDFs")
78
 
79
  message = f"Saved {len(all_images)} images to temporary directory"
80
  if skipped_pdfs: