File size: 4,085 Bytes
87a44c1
70bc109
aab2516
70bc109
87a44c1
aab2516
87a44c1
 
 
aab2516
 
87a44c1
aab2516
 
 
 
 
 
 
87a44c1
aab2516
87a44c1
70bc109
aab2516
87a44c1
 
 
 
 
aab2516
87a44c1
 
 
aab2516
87a44c1
70bc109
aab2516
87a44c1
 
43a0c5f
87a44c1
 
590af17
70bc109
 
 
 
87a44c1
 
590af17
87a44c1
 
70bc109
87a44c1
43a0c5f
 
 
 
 
aff2fb2
43a0c5f
590af17
aff2fb2
 
43a0c5f
 
 
 
70bc109
 
87a44c1
590af17
077c446
43a0c5f
aff2fb2
87a44c1
aab2516
87a44c1
 
 
 
 
aab2516
87a44c1
aab2516
70bc109
aab2516
87a44c1
 
077c446
aab2516
87a44c1
 
aab2516
70bc109
87a44c1
 
aab2516
 
87a44c1
70bc109
 
87a44c1
 
aab2516
 
 
 
70bc109
87a44c1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import tempfile

import easyocr
import fitz  # PyMuPDF
import streamlit as st
from PIL import Image


# Step 1: Convert PDF to Images
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the images; created if it
            does not exist yet.
        dpi: Rendering resolution passed to ``Page.get_pixmap``.

    Returns:
        A list of the written image paths in page order, named
        ``page_1.png``, ``page_2.png``, ...
    """
    os.makedirs(output_folder, exist_ok=True)
    image_paths = []

    # The context manager closes the document (and its file handle) even if
    # rendering or saving a page raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            page.get_pixmap(dpi=dpi).save(image_path)
            image_paths.append(image_path)

    return image_paths


# Step 2: Perform OCR on Images
def extract_text_from_images(image_paths, languages=("en", "ar"), gpu=False):
    """Run EasyOCR over a sequence of image files.

    Args:
        image_paths: Iterable of image file paths, one per page.
        languages: EasyOCR language codes to load. Defaults to English and
            Arabic, matching the previous hard-coded behaviour.
        gpu: Whether EasyOCR should use the GPU. Defaults to CPU-only.

    Returns:
        A list with one entry per input image, in input order. Each entry is
        whatever ``Reader.readtext(..., detail=1)`` returns for that image;
        the rest of this file unpacks its items as
        ``(bbox, text, confidence)``.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Building a Reader loads the recognition models; skip that cost
        # entirely when there is nothing to read.
        return []

    reader = easyocr.Reader(list(languages), gpu=gpu)
    return [reader.readtext(image_path, detail=1) for image_path in image_paths]


# Step 3: Rebuild the PDF with Extracted Text
def rebuild_pdf(original_pdf_path, text_data, output_pdf_path, dpi=300,
                min_confidence=0.6, fontfile=None):
    """Write a new PDF containing the OCR text plus the original page content.

    For every page of the original document a blank page of the same size is
    created, the sufficiently confident OCR strings are written at their
    detected positions, and the original page is then drawn onto it.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: One list of ``(bbox, text, confidence)`` detections per
            page, in page order. ``bbox`` is indexed as four corner points,
            of which corner 0 and corner 2 are used as top-left and
            bottom-right. Pages without an entry receive no text.
        output_pdf_path: Where the rebuilt PDF is saved.
        dpi: Resolution the OCR images were rendered at. Bounding boxes are
            in pixels of those images and are converted to PDF points with
            this value; it must match the value used for rendering.
        min_confidence: Detections with a confidence at or below this value
            are skipped.
        fontfile: Optional path to a font file to embed for the OCR text.
            When omitted, the built-in Helvetica ("helv") is used.
            NOTE(review): the built-in font presumably cannot render Arabic
            glyphs, so a suitable font file is needed for the "ar" OCR
            output - confirm against the PyMuPDF font documentation.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # OCR coordinates are pixels of an image rendered at `dpi`; PDF pages are
    # measured in points (1/72 inch).
    scale = 72.0 / dpi
    # A font loaded from a file must be registered under a name other than
    # the reserved built-in ones such as "helv".
    fontname = "ocrfont" if fontfile else "helv"

    with fitz.open(original_pdf_path) as pdf, fitz.open() as output_pdf:
        for page_num, page in enumerate(pdf):
            new_page = output_pdf.new_page(width=page.rect.width, height=page.rect.height)

            # Tolerate OCR results that cover fewer pages than the document.
            detections = text_data[page_num] if page_num < len(text_data) else []

            for bbox, text, conf in detections:
                if conf <= min_confidence:
                    continue

                (x_min, y_min), (x_max, y_max) = bbox[0], bbox[2]
                box = fitz.Rect(
                    float(x_min) * scale,
                    float(y_min) * scale,
                    float(x_max) * scale,
                    float(y_max) * scale,
                )
                if box.width <= 0 or box.height <= 0:
                    continue  # degenerate detection, nothing sensible to draw

                # Size the text from the detected box instead of a fixed
                # font size, and anchor it at an approximate baseline. The
                # 0.8 / 0.2 split between glyph height and descender room is
                # a heuristic. insert_text always writes the string, whereas
                # insert_textbox writes nothing when the text does not fit
                # the rectangle.
                new_page.insert_text(
                    fitz.Point(box.x0, box.y1 - 0.2 * box.height),
                    text,
                    fontsize=max(0.8 * box.height, 1.0),
                    fontname=fontname,
                    fontfile=fontfile,
                    color=(0, 0, 0),  # Black text
                )

            # Add original diagrams and graphics.
            # NOTE(review): this is drawn after the text, so opaque page
            # content (e.g. a full-page scan) will cover the inserted text -
            # confirm that this stacking order is intended.
            new_page.show_pdf_page(page.rect, pdf, page_num)

        # garbage=4 asks PyMuPDF for its most thorough clean-up of unused and
        # duplicate objects while saving.
        output_pdf.save(output_pdf_path, garbage=4)


# Full Workflow
def process_pdf(uploaded_pdf, output_folder):
    """Run the complete pipeline on one PDF and return the rebuilt file's path.

    The PDF at ``uploaded_pdf`` is rendered to page images inside
    ``output_folder`` (created when missing), the images are passed through
    OCR, and the result is written to ``rebuilt_output.pdf`` relative to the
    current working directory. Progress messages go to stdout.
    """
    rebuilt_path = "rebuilt_output.pdf"
    os.makedirs(output_folder, exist_ok=True)

    # Stage 1: rasterise the pages.
    print("Converting PDF to images...")
    page_images = pdf_to_images(uploaded_pdf, output_folder)

    # Stage 2: recognise the text on every page image.
    print("Performing OCR on images...")
    ocr_results = extract_text_from_images(page_images)

    # Stage 3: assemble the output document.
    print("Rebuilding the PDF with extracted text...")
    rebuild_pdf(uploaded_pdf, ocr_results, rebuilt_path)

    return rebuilt_path


# Streamlit App
st.title("PDF Text Extraction and Rebuild")
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_pdf is not None:
    st.write("Processing your PDF...")

    # getvalue() returns the whole upload regardless of the current stream
    # position, unlike read(), which returns nothing once the buffer has
    # already been consumed.
    pdf_bytes = uploaded_pdf.getvalue()

    # Keep the uploaded copy and the page images in a private scratch
    # directory so concurrent sessions do not overwrite each other's files
    # and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_pdf_path = os.path.join(work_dir, "temp_uploaded.pdf")
        with open(temp_pdf_path, "wb") as temp_file:
            temp_file.write(pdf_bytes)

        output_folder = os.path.join(work_dir, "temp_images")
        output_pdf_path = process_pdf(temp_pdf_path, output_folder)

        # Load the result into memory so the download button does not depend
        # on an open file handle.
        with open(output_pdf_path, "rb") as f:
            processed_bytes = f.read()

    st.write("Download the processed PDF below:")
    st.download_button(
        "Download PDF",
        processed_bytes,
        file_name="processed_output.pdf",
        mime="application/pdf",
    )