# Curves_to_text / app.py
# SuriRaja's picture
# Update app.py
# 43a0c5f verified
# raw
# history blame
# 4.09 kB
import os
import tempfile

import easyocr
import fitz  # PyMuPDF
import streamlit as st
from PIL import Image
# Step 1: Convert PDF to Images
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to rasterize.
        output_folder: Directory that receives the ``page_<n>.png`` files
            (``n`` is the 1-based page number). Created if it is missing.
        dpi: Resolution passed to ``Page.get_pixmap`` for each page.

    Returns:
        The written image paths, in page order.
    """
    # An empty folder name means "current directory"; makedirs("") would fail.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            pix.save(image_path)
            image_paths.append(image_path)
    return image_paths
# Step 2: Perform OCR on Images
def extract_text_from_images(image_paths):
    """Run EasyOCR over each image and collect the per-image results.

    Args:
        image_paths: Paths of the page images to read, in page order.

    Returns:
        A list with one entry per image, in the same order. Each entry is
        the value returned by ``reader.readtext(path, detail=1)``; the
        consumer in this file (``rebuild_pdf``) unpacks its items as
        ``(bbox, text, confidence)``.
    """
    # Nothing to read: skip constructing the OCR reader entirely.
    if not image_paths:
        return []

    # One reader is shared by all pages. Add languages as needed.
    reader = easyocr.Reader(["en", "ar"], gpu=False)
    return [reader.readtext(image_path, detail=1) for image_path in image_paths]
# Step 3: Rebuild the PDF with Extracted Text
def _write_ocr_text(page, rect, text, font_kwargs, fontsize=10, min_fontsize=4):
    """Write *text* into *rect* on *page*, shrinking the font until it fits."""
    size = fontsize
    while size >= min_fontsize:
        # insert_textbox writes nothing and returns a negative number when
        # the text does not fit the rectangle at this size.
        leftover = page.insert_textbox(
            rect,
            text,
            fontsize=size,
            color=(0, 0, 0),  # Black text
            **font_kwargs,
        )
        if leftover >= 0:
            return
        size -= 1
    # Even the smallest size overflowed the box: place the text at the box's
    # bottom-left corner instead of silently dropping it.
    page.insert_text(
        rect.bl,
        text,
        fontsize=min_fontsize,
        color=(0, 0, 0),
        **font_kwargs,
    )


def rebuild_pdf(original_pdf_path, text_data, output_pdf_path, dpi=300, fontfile=None):
    """Write a copy of a PDF with OCR text placed at its detected positions.

    For every page of the source PDF a page of the same size is created in
    a new document, the accepted OCR strings are written into their
    bounding boxes, and the source page is then stamped onto the new page
    with ``show_pdf_page``.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: One sequence per page of ``(bbox, text, confidence)``
            items, where ``bbox`` is a sequence of ``(x, y)`` corner points
            in pixels of the page image that was OCR'd. Pages without an
            entry get no text.
        output_pdf_path: Where the rebuilt PDF is saved.
        dpi: Resolution the OCR'd page images were rendered at. It must
            match the rendering step, because it converts pixel coordinates
            to PDF points.
        fontfile: Optional path to a font file to embed for the OCR text.
            When omitted the built-in Helvetica is used, which is unlikely
            to cover non-Latin OCR output such as Arabic.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # OCR boxes are measured in image pixels, PDF pages in points (72 per
    # inch); without this factor the text lands far from where it was read.
    scale = 72.0 / dpi

    if fontfile:
        # A font loaded from a file needs its own reference name; "helv" is
        # the name of the built-in Helvetica.
        font_kwargs = {"fontname": "ocrfont", "fontfile": fontfile}
    else:
        font_kwargs = {"fontname": "helv"}

    # Both documents are closed on exit, including when an error occurs.
    with fitz.open(original_pdf_path) as pdf, fitz.open() as output_pdf:
        for page_num, page in enumerate(pdf):
            # Create a new page in the output PDF
            new_page = output_pdf.new_page(width=page.rect.width, height=page.rect.height)

            # Tolerate OCR data that covers fewer pages than the PDF has.
            page_results = text_data[page_num] if page_num < len(text_data) else []

            # Overlay OCR-extracted text
            for bbox, text, conf in page_results:
                if conf <= 0.6 or not text:  # Confidence threshold
                    continue
                # Use the extremes of all corners so skewed quads still give
                # a well-formed rectangle; float() also unwraps numpy scalars.
                xs = [float(point[0]) for point in bbox]
                ys = [float(point[1]) for point in bbox]
                # The +2 / +3 alignment nudges are applied in pixel space,
                # the coordinate system of the OCR boxes, before scaling.
                rect = fitz.Rect(
                    (min(xs) + 2) * scale,
                    (min(ys) + 3) * scale,
                    max(xs) * scale,
                    max(ys) * scale,
                )
                if rect.is_empty:
                    # Box too small to hold anything after the nudge.
                    continue
                _write_ocr_text(new_page, rect, text, font_kwargs)

            # Add original diagrams and graphics
            new_page.show_pdf_page(page.rect, pdf, page_num)

        # Save rebuilt PDF with optimized structure
        output_pdf.save(output_pdf_path, garbage=4)
# Full Workflow
def process_pdf(uploaded_pdf, output_folder, output_pdf_path="rebuilt_output.pdf"):
    """Run the full pipeline: PDF to images, OCR, then rebuild the PDF.

    Args:
        uploaded_pdf: Path of the PDF file to process.
        output_folder: Directory for the intermediate page images; created
            if it does not exist.
        output_pdf_path: Where the rebuilt PDF is written. Defaults to
            ``rebuilt_output.pdf`` in the current working directory.

    Returns:
        The path of the rebuilt PDF (``output_pdf_path``).
    """
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Convert PDF to Images
    print("Converting PDF to images...")
    image_paths = pdf_to_images(uploaded_pdf, output_folder)

    # Step 2: Perform OCR
    print("Performing OCR on images...")
    text_data = extract_text_from_images(image_paths)

    # Step 3: Rebuild the PDF
    print("Rebuilding the PDF with extracted text...")
    rebuild_pdf(uploaded_pdf, text_data, output_pdf_path)
    return output_pdf_path
# Streamlit App
# Streamlit entry point: upload a PDF, run the pipeline, offer the result.
st.title("PDF Text Extraction and Rebuild")
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_pdf is not None:
    st.write("Processing your PDF...")
    # A private scratch directory per run keeps the upload and the page
    # images from being overwritten by another run using the same fixed
    # file names, and removes them once processing is finished.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_pdf_path = os.path.join(work_dir, "temp_uploaded.pdf")
        output_folder = os.path.join(work_dir, "temp_images")
        with open(temp_pdf_path, "wb") as temp_file:
            # getvalue() returns the whole upload regardless of the current
            # read position of the uploaded file object.
            temp_file.write(uploaded_pdf.getvalue())
        output_pdf_path = process_pdf(temp_pdf_path, output_folder)
        # Load the result into memory so the download button does not
        # depend on an open file handle.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    st.write("Download the processed PDF below:")
    st.download_button(
        "Download PDF",
        pdf_bytes,
        file_name="processed_output.pdf",
        mime="application/pdf",
    )