Spaces:
Sleeping
Sleeping
File size: 4,085 Bytes
87a44c1 70bc109 aab2516 70bc109 87a44c1 aab2516 87a44c1 aab2516 87a44c1 aab2516 87a44c1 aab2516 87a44c1 70bc109 aab2516 87a44c1 aab2516 87a44c1 aab2516 87a44c1 70bc109 aab2516 87a44c1 43a0c5f 87a44c1 590af17 70bc109 87a44c1 590af17 87a44c1 70bc109 87a44c1 43a0c5f aff2fb2 43a0c5f 590af17 aff2fb2 43a0c5f 70bc109 87a44c1 590af17 077c446 43a0c5f aff2fb2 87a44c1 aab2516 87a44c1 aab2516 87a44c1 aab2516 70bc109 aab2516 87a44c1 077c446 aab2516 87a44c1 aab2516 70bc109 87a44c1 aab2516 87a44c1 70bc109 87a44c1 aab2516 70bc109 87a44c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import os
import fitz # PyMuPDF
import easyocr
from PIL import Image
import streamlit as st
# Step 1: Convert PDF to Images
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF document to render.
        output_folder: Directory that receives the ``page_<n>.png`` files;
            it is created if it does not exist yet.
        dpi: Resolution passed to ``Page.get_pixmap`` for rendering.

    Returns:
        The paths of the written images, in page order. File names are
        numbered starting at 1.
    """
    os.makedirs(output_folder, exist_ok=True)
    image_paths = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            page.get_pixmap(dpi=dpi).save(image_path)
            image_paths.append(image_path)
    return image_paths
# Step 2: Perform OCR on Images
def extract_text_from_images(image_paths, languages=("en", "ar"), gpu=False):
    """Run EasyOCR over each image and collect the detections.

    Args:
        image_paths: Iterable of image file paths, one per page.
        languages: Language codes handed to ``easyocr.Reader``. Defaults to
            English and Arabic.
        gpu: Whether EasyOCR should use the GPU. Defaults to ``False``.

    Returns:
        A list with one entry per input image, in input order. Each entry
        is whatever ``reader.readtext(path, detail=1)`` returned for that
        image.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to read: skip constructing the OCR reader altogether.
        return []
    # One reader is shared by all images so it is only constructed once.
    reader = easyocr.Reader(list(languages), gpu=gpu)
    return [reader.readtext(image_path, detail=1) for image_path in image_paths]
# Step 3: Rebuild the PDF with Extracted Text
def _insert_fitted_text(page, rect, text, fontsize=10, min_fontsize=2):
    """Write *text* into *rect* on *page*, shrinking the font until it fits.

    Returns ``True`` if the text was written, ``False`` if it did not fit
    even at ``min_fontsize``.
    """
    size = fontsize
    while size >= min_fontsize:
        # insert_textbox writes nothing and returns a negative space deficit
        # when the text does not fit, so retrying with a smaller size cannot
        # produce duplicated text.
        remaining = page.insert_textbox(
            rect,
            text,
            fontsize=size,
            fontname="helv",
            color=(0, 0, 0),
        )
        if remaining >= 0:
            return True
        size -= 1
    return False


def rebuild_pdf(original_pdf_path, text_data, output_pdf_path, dpi=300,
                min_confidence=0.6):
    """Build a new PDF combining OCR text with the original page content.

    For every page of the original document a page of the same size is
    created, the OCR text is written at its detected position, and the
    original page is then drawn onto the new page via ``show_pdf_page``.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: One entry per page; each entry is an iterable of
            ``(bbox, text, confidence)`` tuples where ``bbox`` is a sequence
            of corner points ``(x, y)`` in pixels of the page image rendered
            at ``dpi``. Pages without an entry get no text.
        output_pdf_path: Where the rebuilt PDF is saved.
        dpi: Resolution of the images the OCR ran on. It must match the
            value used when rendering the pages, otherwise the text is
            misplaced.
        min_confidence: Detections with a confidence at or below this value
            are skipped.

    NOTE(review): the built-in Helvetica font presumably cannot render the
    Arabic text the OCR step may return -- confirm, and supply a Unicode
    font under a non-reserved font name if Arabic output is required.
    """
    # OCR coordinates are image pixels; PDF coordinates are points (72/inch).
    scale = 72.0 / dpi
    with fitz.open(original_pdf_path) as pdf, fitz.open() as output_pdf:
        for page_num, page in enumerate(pdf):
            new_page = output_pdf.new_page(
                width=page.rect.width, height=page.rect.height
            )
            # Tolerate OCR data that has fewer entries than the PDF has pages.
            detections = text_data[page_num] if page_num < len(text_data) else []
            for bbox, text, conf in detections:
                if conf <= min_confidence:
                    continue
                # Use all corners so that slightly rotated quads still give a
                # valid axis-aligned rectangle.
                xs = [float(point[0]) for point in bbox]
                ys = [float(point[1]) for point in bbox]
                # Keep the small pixel-space inset (2 px right, 3 px down)
                # applied to the top-left corner before converting to points.
                rect = fitz.Rect(
                    (min(xs) + 2) * scale,
                    (min(ys) + 3) * scale,
                    max(xs) * scale,
                    max(ys) * scale,
                )
                if rect.x1 <= rect.x0 or rect.y1 <= rect.y0:
                    # A degenerate box cannot hold any text.
                    continue
                _insert_fitted_text(new_page, rect, text)
            # Add original diagrams and graphics.
            # NOTE(review): this draws the original page after the text; if
            # the text should be visible on top of opaque page content, this
            # call would need to come before the text insertion -- confirm.
            new_page.show_pdf_page(page.rect, pdf, page_num)
        # garbage=4 asks PyMuPDF for the most thorough cleanup of unused
        # objects when writing the file.
        output_pdf.save(output_pdf_path, garbage=4)
# Full Workflow
def process_pdf(uploaded_pdf, output_folder):
    """Run the whole pipeline: render pages, OCR them, rebuild the PDF.

    Args:
        uploaded_pdf: Path of the PDF to process.
        output_folder: Directory for the intermediate page images; created
            when missing.

    Returns:
        The path of the rebuilt PDF (``"rebuilt_output.pdf"``).
    """
    rebuilt_path = "rebuilt_output.pdf"
    os.makedirs(output_folder, exist_ok=True)

    # Stage 1: one image per page.
    print("Converting PDF to images...")
    page_images = pdf_to_images(uploaded_pdf, output_folder)

    # Stage 2: OCR detections for each page image.
    print("Performing OCR on images...")
    ocr_results = extract_text_from_images(page_images)

    # Stage 3: write the rebuilt document.
    print("Rebuilding the PDF with extracted text...")
    rebuild_pdf(uploaded_pdf, ocr_results, rebuilt_path)

    return rebuilt_path
# Streamlit App
st.title("PDF Text Extraction and Rebuild")
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_pdf is not None:
    output_folder = "temp_images"
    st.write("Processing your PDF...")
    # getvalue() returns the complete upload regardless of the buffer's
    # current read position, unlike read().
    with open("temp_uploaded.pdf", "wb") as temp_file:
        temp_file.write(uploaded_pdf.getvalue())
    output_pdf_path = process_pdf("temp_uploaded.pdf", output_folder)
    st.write("Download the processed PDF below:")
    # Hand the button plain bytes so it does not depend on an open file
    # handle, and declare the MIME type so the download is treated as a PDF.
    with open(output_pdf_path, "rb") as f:
        pdf_bytes = f.read()
    st.download_button(
        "Download PDF",
        pdf_bytes,
        file_name="processed_output.pdf",
        mime="application/pdf",
    )
|