Spaces:

SuriRaja
/

Curves_to_text

Sleeping

App Files Files Community

Curves_to_text / app.py

SuriRaja

Update app.py

43a0c5f verified 3 months ago

raw

history blame

4.09 kB

	import os
	import tempfile

	import easyocr
	import fitz # PyMuPDF
	import streamlit as st
	from PIL import Image


	# Step 1: Convert PDF to Images
	def pdf_to_images(pdf_path, output_folder, dpi=300):
	    """Render every page of a PDF to a PNG file using PyMuPDF.

	    Args:
	        pdf_path: Path of the PDF file to rasterize.
	        output_folder: Directory that receives the ``page_<n>.png`` files.
	            It is created when it does not exist yet.
	        dpi: Resolution handed to ``Page.get_pixmap`` for rendering.

	    Returns:
	        The written image paths, in page order. File names are numbered
	        from 1 (``page_1.png`` is the first page).
	    """
	    # An empty folder name means "current directory"; makedirs("") would raise.
	    if output_folder:
	        os.makedirs(output_folder, exist_ok=True)

	    image_paths = []

	    # The context manager closes the document even when rendering or saving
	    # fails, so the underlying file handle is not leaked.
	    with fitz.open(pdf_path) as pdf_document:
	        for page_number, page in enumerate(pdf_document, start=1):
	            image_path = os.path.join(output_folder, f"page_{page_number}.png")
	            page.get_pixmap(dpi=dpi).save(image_path)
	            image_paths.append(image_path)

	    return image_paths


	# Step 2: Perform OCR on Images
	def extract_text_from_images(image_paths):
	    """Run EasyOCR over each image and collect the raw detections.

	    Args:
	        image_paths: Iterable of image file paths to recognize.

	    Returns:
	        A list with one entry per input image, in input order. Each entry is
	        the value returned by ``Reader.readtext(path, detail=1)``.
	    """
	    # English + Arabic recognizer running on CPU; extend the list as needed.
	    ocr_engine = easyocr.Reader(["en", "ar"], gpu=False)
	    return [ocr_engine.readtext(path, detail=1) for path in image_paths]


	# Step 3: Rebuild the PDF with Extracted Text
	def rebuild_pdf(original_pdf_path, text_data, output_pdf_path, dpi=300):
	    """Write a new PDF that combines OCR text with the original page content.

	    For every page of the original document a blank page of the same size is
	    created, the confident OCR detections are written onto it, and the
	    original page is then drawn over the whole page area.

	    Args:
	        original_pdf_path: Path of the source PDF.
	        text_data: One list of ``(bbox, text, confidence)`` detections per
	            page, where ``bbox`` is a sequence of ``(x, y)`` corner points in
	            pixels of the rendered page image.
	        output_pdf_path: Where the rebuilt PDF is saved.
	        dpi: Resolution the page images were rendered at before OCR. It is
	            needed to convert pixel coordinates to PDF points (72 per inch).
	            Defaults to 300, matching the default of ``pdf_to_images``.
	    """
	    # OCR boxes are expressed in pixels of a `dpi` render, while PDF page
	    # coordinates use 72 units per inch. Without this factor the text would be
	    # placed far outside its real position.
	    scale = 72.0 / dpi

	    # Both documents are context-managed so they are closed on any error.
	    with fitz.open(original_pdf_path) as pdf, fitz.open() as output_pdf:
	        for page_num, page in enumerate(pdf):
	            new_page = output_pdf.new_page(
	                width=page.rect.width, height=page.rect.height
	            )

	            # Tolerate OCR data that covers fewer pages than the document.
	            page_results = text_data[page_num] if page_num < len(text_data) else []

	            for bbox, text, conf in page_results:
	                if not conf > 0.6:  # Confidence threshold
	                    continue

	                text_rect = _ocr_box_to_rect(bbox, scale)
	                # insert_textbox rejects empty rectangles; skip degenerate boxes.
	                if text_rect.is_empty:
	                    continue

	                _insert_fitted_text(new_page, text_rect, text)

	            # Add original diagrams and graphics
	            new_page.show_pdf_page(page.rect, pdf, page_num)

	        # Save rebuilt PDF with optimized structure
	        output_pdf.save(output_pdf_path, garbage=4)


	def _ocr_box_to_rect(bbox, scale):
	    """Convert an OCR box (pixel corner points) into a PDF-space ``fitz.Rect``."""
	    xs = [float(point[0]) for point in bbox]
	    ys = [float(point[1]) for point in bbox]
	    # Min/max over all corners also handles slightly rotated boxes. The +2/+3
	    # pixel nudge of the top-left corner is the original alignment adjustment,
	    # applied in image space before converting to points.
	    return fitz.Rect(
	        (min(xs) + 2) * scale,
	        (min(ys) + 3) * scale,
	        max(xs) * scale,
	        max(ys) * scale,
	    )


	def _insert_fitted_text(page, rect, text, max_fontsize=10, min_fontsize=2):
	    """Write *text* into *rect*, shrinking the font until it fits.

	    Returns True when the text was written, False when it did not fit even at
	    ``min_fontsize`` (in which case nothing is written).
	    """
	    # NOTE(review): the built-in Helvetica font presumably lacks Arabic glyphs;
	    # supply a suitable embedded font if Arabic output must be readable.
	    # The earlier placeholder ``fontfile="path/to/font.ttf"`` was removed
	    # because it never pointed to a real font file.
	    for fontsize in range(max_fontsize, min_fontsize - 1, -1):
	        leftover = page.insert_textbox(
	            rect,
	            text,
	            fontsize=fontsize,
	            fontname="helv",
	            color=(0, 0, 0),  # Black text
	        )
	        # A negative result means the text did not fit and nothing was drawn.
	        if leftover >= 0:
	            return True
	    return False


	# Full Workflow
	def process_pdf(uploaded_pdf, output_folder, output_pdf_path="rebuilt_output.pdf"):
	    """Run the full pipeline: PDF to images, OCR, and PDF rebuild.

	    Args:
	        uploaded_pdf: Path of the PDF file to process.
	        output_folder: Directory for the intermediate page images; created
	            when missing.
	        output_pdf_path: Destination of the rebuilt PDF. Defaults to
	            ``"rebuilt_output.pdf"`` in the current working directory.

	    Returns:
	        The path of the rebuilt PDF (``output_pdf_path``).
	    """
	    os.makedirs(output_folder, exist_ok=True)

	    # Step 1: Convert PDF to Images
	    print("Converting PDF to images...")
	    image_paths = pdf_to_images(uploaded_pdf, output_folder)

	    # Step 2: Perform OCR
	    print("Performing OCR on images...")
	    text_data = extract_text_from_images(image_paths)

	    # Step 3: Rebuild the PDF
	    print("Rebuilding the PDF with extracted text...")
	    rebuild_pdf(uploaded_pdf, text_data, output_pdf_path)

	    return output_pdf_path


	# Streamlit App
	st.title("PDF Text Extraction and Rebuild")
	uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
	if uploaded_pdf:
	    st.write("Processing your PDF...")

	    # Each run works in its own scratch directory, so the uploaded copy and
	    # the rendered page images are removed afterwards and runs do not
	    # overwrite each other's intermediate files.
	    with tempfile.TemporaryDirectory() as work_dir:
	        input_pdf_path = os.path.join(work_dir, "temp_uploaded.pdf")
	        with open(input_pdf_path, "wb") as temp_file:
	            # getvalue() returns the full upload regardless of the current
	            # stream position, unlike read() after an earlier read.
	            temp_file.write(uploaded_pdf.getvalue())

	        output_folder = os.path.join(work_dir, "temp_images")
	        output_pdf_path = process_pdf(input_pdf_path, output_folder)

	        # Load the result into memory before the scratch files go away.
	        with open(output_pdf_path, "rb") as f:
	            processed_pdf_bytes = f.read()

	    st.write("Download the processed PDF below:")
	    st.download_button(
	        "Download PDF", processed_pdf_bytes, file_name="processed_output.pdf"
	    )