Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -37,12 +37,11 @@ def extract_text_from_images(image_paths):
|
|
37 |
|
38 |
# Step 3: Rebuild the PDF with Extracted Text
|
39 |
def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
40 |
-
"""Overlay extracted text onto the original PDF."""
|
41 |
with fitz.open(original_pdf_path) as pdf:
|
42 |
output_pdf = fitz.open()
|
43 |
|
44 |
for page_num, page in enumerate(pdf):
|
45 |
-
# Render page as an image for reference
|
46 |
pix = page.get_pixmap(dpi=300)
|
47 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
48 |
|
@@ -54,15 +53,25 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
|
54 |
if conf > 0.6: # Confidence threshold
|
55 |
(x_min, y_min), (x_max, y_max) = bbox[0], bbox[2]
|
56 |
rect = fitz.Rect(x_min, y_min, x_max, y_max)
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
new_page.insert_textbox(
|
58 |
-
|
|
|
|
|
|
|
|
|
59 |
)
|
60 |
|
61 |
# Add original diagrams and graphics
|
62 |
new_page.show_pdf_page(page.rect, pdf, page_num)
|
63 |
|
64 |
-
# Save rebuilt PDF
|
65 |
-
output_pdf.save(output_pdf_path)
|
66 |
|
67 |
|
68 |
# Full Workflow
|
|
|
37 |
|
38 |
# Step 3: Rebuild the PDF with Extracted Text
|
39 |
def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
40 |
+
"""Overlay extracted text onto the original PDF with enhanced alignment."""
|
41 |
with fitz.open(original_pdf_path) as pdf:
|
42 |
output_pdf = fitz.open()
|
43 |
|
44 |
for page_num, page in enumerate(pdf):
|
|
|
45 |
pix = page.get_pixmap(dpi=300)
|
46 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
47 |
|
|
|
53 |
if conf > 0.6: # Confidence threshold
|
54 |
(x_min, y_min), (x_max, y_max) = bbox[0], bbox[2]
|
55 |
rect = fitz.Rect(x_min, y_min, x_max, y_max)
|
56 |
+
|
57 |
+
# Adjust text placement for alignment
|
58 |
+
adjusted_x = x_min + 2 # Fine-tune as needed
|
59 |
+
adjusted_y = y_min + 2 # Fine-tune as needed
|
60 |
+
adjusted_rect = fitz.Rect(adjusted_x, adjusted_y, x_max, y_max)
|
61 |
+
|
62 |
new_page.insert_textbox(
|
63 |
+
adjusted_rect, # Use adjusted bounding box
|
64 |
+
text,
|
65 |
+
fontsize=10, # Adjust font size as needed
|
66 |
+
fontname="helv", # Use Helvetica for compatibility
|
67 |
+
color=(0, 0, 0), # Black color for text
|
68 |
)
|
69 |
|
70 |
# Add original diagrams and graphics
|
71 |
new_page.show_pdf_page(page.rect, pdf, page_num)
|
72 |
|
73 |
+
# Save rebuilt PDF, garbage-collecting unused and duplicate objects
|
74 |
+
output_pdf.save(output_pdf_path, garbage=4)
|
75 |
|
76 |
|
77 |
# Full Workflow
|