Spaces:
Sleeping
Sleeping
import os | |
import fitz # PyMuPDF | |
import easyocr | |
from PIL import Image | |
import streamlit as st | |
# Step 1: Convert PDF to Images | |
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to rasterise.
        output_folder: Directory that receives the ``page_<n>.png`` files
            (``n`` is 1-based). It is created if it does not exist yet.
        dpi: Resolution handed to ``Page.get_pixmap``.

    Returns:
        The list of written image paths, in page order.
    """
    # Callers are not required to prepare the directory themselves.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            page.get_pixmap(dpi=dpi).save(image_path)
            image_paths.append(image_path)
    return image_paths
# Step 2: Perform OCR on Images | |
def extract_text_from_images(image_paths):
    """Run EasyOCR over each image and collect the detections per image.

    Returns a list with one entry per path in ``image_paths`` (same order);
    each entry is whatever ``Reader.readtext(path, detail=1)`` returned for
    that image.
    """
    # One CPU-only reader, configured with the "en" and "ar" language codes,
    # is shared by all images of this call.
    ocr = easyocr.Reader(["en", "ar"], gpu=False)
    return [ocr.readtext(path, detail=1) for path in image_paths]
# Step 3: Rebuild the PDF with Extracted Text | |
def _ocr_bbox_to_rect(bbox, scale):
    """Map an OCR quadrilateral in image pixels to a page-space ``fitz.Rect``.

    Returns ``None`` when the resulting rectangle would have no area.
    """
    # Small inset of the top-left corner, applied in pixel space.
    x_shift_px, y_shift_px = 2, 3
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    # min/max over all corners also copes with slightly rotated boxes.
    x0 = (min(xs) + x_shift_px) * scale
    y0 = (min(ys) + y_shift_px) * scale
    x1 = max(xs) * scale
    y1 = max(ys) * scale
    if x1 <= x0 or y1 <= y0:
        return None
    return fitz.Rect(x0, y0, x1, y1)


def _insert_fitted_text(page, rect, text, fontsize, font_kwargs, min_fontsize=2):
    """Write ``text`` into ``rect``, shrinking the font until it fits.

    ``insert_textbox`` writes nothing and returns a negative number when the
    text does not fit, so the size is reduced step by step.  Returns ``True``
    if the text was written, ``False`` if it did not fit even at
    ``min_fontsize``.
    """
    size = fontsize
    while size >= min_fontsize:
        leftover = page.insert_textbox(
            rect,
            text,
            fontsize=size,
            color=(0, 0, 0),  # Black text
            **font_kwargs,
        )
        if leftover >= 0:
            return True
        size -= 1
    return False


def rebuild_pdf(original_pdf_path, text_data, output_pdf_path, *, dpi=300,
                min_confidence=0.6, fontsize=10, fontfile=None):
    """Copy each page of the original PDF and overlay the OCR text on it.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: One sequence per page of ``(bbox, text, confidence)``
            tuples, where ``bbox`` is a sequence of ``(x, y)`` corner points
            in pixels of an image rendered at ``dpi``.  Pages without an
            entry are copied without any text.
        output_pdf_path: Where the rebuilt PDF is saved.
        dpi: Resolution of the images the OCR ran on; used to convert pixel
            coordinates to PDF points (72 per inch).
        min_confidence: Detections with a confidence at or below this value
            are skipped.
        fontsize: Largest font size tried for each text box.
        fontfile: Optional path to a font file to embed.  Without it the
            built-in Helvetica ("helv") is used.
            NOTE(review): Helvetica presumably lacks glyphs for non-Latin
            OCR output (e.g. Arabic) - pass a suitable font file; confirm.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError("dpi must be positive")
    # OCR coordinates are pixels at `dpi`; PDF page space is 72 units per inch.
    scale = 72.0 / dpi

    if fontfile is None:
        font_kwargs = {"fontname": "helv"}
    else:
        # A font file needs a name that is not one of the reserved built-ins.
        font_kwargs = {"fontname": "ocrfont", "fontfile": fontfile}

    with fitz.open(original_pdf_path) as pdf, fitz.open() as output_pdf:
        for page_num, page in enumerate(pdf):
            new_page = output_pdf.new_page(
                width=page.rect.width, height=page.rect.height
            )
            # Draw the original content first so the OCR text ends up on top
            # of it instead of being covered by it.
            new_page.show_pdf_page(page.rect, pdf, page_num)

            page_results = text_data[page_num] if page_num < len(text_data) else ()
            for bbox, text, conf in page_results:
                if conf <= min_confidence:
                    continue
                rect = _ocr_bbox_to_rect(bbox, scale)
                if rect is None:
                    continue
                _insert_fitted_text(new_page, rect, text, fontsize, font_kwargs)

        # garbage=4 removes unused and duplicate objects while saving.
        output_pdf.save(output_pdf_path, garbage=4)
# Full Workflow | |
def process_pdf(uploaded_pdf, output_folder):
    """Run the whole pipeline on one PDF and return the rebuilt file's path.

    The pages of ``uploaded_pdf`` are rendered into ``output_folder``, the
    images are OCR'd, and the result is written to ``rebuilt_output.pdf`` in
    the current working directory.  Progress is reported with ``print``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Stage 1: rasterise the pages
    print("Converting PDF to images...")
    page_images = pdf_to_images(uploaded_pdf, output_folder)

    # Stage 2: OCR the rendered pages
    print("Performing OCR on images...")
    ocr_results = extract_text_from_images(page_images)

    # Stage 3: assemble the output document
    rebuilt_path = "rebuilt_output.pdf"
    print("Rebuilding the PDF with extracted text...")
    rebuild_pdf(uploaded_pdf, ocr_results, rebuilt_path)

    return rebuilt_path
# Streamlit App | |
st.title("PDF Text Extraction and Rebuild")
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_pdf is not None:
    output_folder = "temp_images"
    temp_pdf_path = "temp_uploaded.pdf"
    st.write("Processing your PDF...")
    # getvalue() returns the complete upload regardless of the buffer's
    # current read position, whereas read() only returns what is left after
    # any earlier read of the same object.
    with open(temp_pdf_path, "wb") as temp_file:
        temp_file.write(uploaded_pdf.getvalue())
    output_pdf_path = process_pdf(temp_pdf_path, output_folder)
    st.write("Download the processed PDF below:")
    # Hand the button plain bytes so no open file handle is involved.
    with open(output_pdf_path, "rb") as f:
        pdf_bytes = f.read()
    st.download_button(
        "Download PDF",
        pdf_bytes,
        file_name="processed_output.pdf",
        mime="application/pdf",
    )