SuriRaja committed on
Commit
aff2fb2
·
verified ·
1 Parent(s): 609ff0e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -5
app.py CHANGED
@@ -37,12 +37,11 @@ def extract_text_from_images(image_paths):
37
 
38
  # Step 3: Rebuild the PDF with Extracted Text
39
  def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
40
- """Overlay extracted text onto the original PDF."""
41
  with fitz.open(original_pdf_path) as pdf:
42
  output_pdf = fitz.open()
43
 
44
  for page_num, page in enumerate(pdf):
45
- # Render page as an image for reference
46
  pix = page.get_pixmap(dpi=300)
47
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
48
 
@@ -54,15 +53,25 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
54
  if conf > 0.6: # Confidence threshold
55
  (x_min, y_min), (x_max, y_max) = bbox[0], bbox[2]
56
  rect = fitz.Rect(x_min, y_min, x_max, y_max)
 
 
 
 
 
 
57
  new_page.insert_textbox(
58
- rect, text, fontsize=12, fontname="helv", color=(0, 0, 0)
 
 
 
 
59
  )
60
 
61
  # Add original diagrams and graphics
62
  new_page.show_pdf_page(page.rect, pdf, page_num)
63
 
64
- # Save rebuilt PDF
65
- output_pdf.save(output_pdf_path)
66
 
67
 
68
  # Full Workflow
 
37
 
38
  # Step 3: Rebuild the PDF with Extracted Text
39
  def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
40
+ """Overlay extracted text onto the original PDF with enhanced alignment."""
41
  with fitz.open(original_pdf_path) as pdf:
42
  output_pdf = fitz.open()
43
 
44
  for page_num, page in enumerate(pdf):
 
45
  pix = page.get_pixmap(dpi=300)
46
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
47
 
 
53
  if conf > 0.6: # Confidence threshold
54
  (x_min, y_min), (x_max, y_max) = bbox[0], bbox[2]
55
  rect = fitz.Rect(x_min, y_min, x_max, y_max)
56
+
57
+ # Adjust text placement for alignment
58
+ adjusted_x = x_min + 2 # Fine-tune as needed
59
+ adjusted_y = y_min + 2 # Fine-tune as needed
60
+ adjusted_rect = fitz.Rect(adjusted_x, adjusted_y, x_max, y_max)
61
+
62
  new_page.insert_textbox(
63
+ adjusted_rect, # Use adjusted bounding box
64
+ text,
65
+ fontsize=10, # Adjust font size as needed
66
+ fontname="helv", # Use Helvetica for compatibility
67
+ color=(0, 0, 0), # Black color for text
68
  )
69
 
70
  # Add original diagrams and graphics
71
  new_page.show_pdf_page(page.rect, pdf, page_num)
72
 
73
+ # Save rebuilt PDF; garbage=4 drops unused objects and merges duplicates
74
+ output_pdf.save(output_pdf_path, garbage=4)
75
 
76
 
77
  # Full Workflow