File size: 4,392 Bytes
17622ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import fitz  # PyMuPDF for PDF handling
import easyocr  # OCR for text extraction
import tempfile
import streamlit as st


def extract_text_with_positions_using_ocr(pdf_path):
    """
    Extract text with bounding box positions using OCR for both English and Arabic text.

    Every page is rendered to a PNG image in memory and passed to EasyOCR.
    The OCR boxes come back in pixels of the rendered image, so they are
    scaled to PDF units (72 per inch) before being returned.

    :param pdf_path: Path to the input PDF file.
    :return: List with one entry per page; each entry is a list of dictionaries
             with the keys "text", "x0", "y0", "x1", "y1" and "font_size".
             "y0" holds the bottom edge and "y1" the top edge of the OCR box,
             "font_size" is the box height; all values are in PDF units.
    """
    render_dpi = 300
    # PDF user space has 72 units per inch, the rendered image has render_dpi
    # pixels per inch; without this factor the positions would be ~4x too large.
    # NOTE(review): assumes unrotated pages whose visible area starts at the
    # page origin - confirm for rotated or cropped PDFs.
    pixels_to_points = 72.0 / render_dpi

    extracted_data = []
    with fitz.open(pdf_path) as doc:
        # Building the reader loads the recognition models, so do it once
        # for the whole document instead of once per page.
        reader = easyocr.Reader(['en', 'ar'])  # Supports English and Arabic

        for page in doc:
            pix = page.get_pixmap(dpi=render_dpi)  # Convert PDF page to image

            # Hand the PNG bytes straight to EasyOCR: no temporary file on disk,
            # so nothing to clean up and no name clashes between sessions.
            results = reader.readtext(pix.tobytes("png"), detail=1)

            page_data = []
            for bbox, text, _confidence in results:
                # bbox lists the four corners; [0] is top-left, [2] is bottom-right.
                (left, top), (right, bottom) = bbox[0], bbox[2]
                left = float(left) * pixels_to_points
                top = float(top) * pixels_to_points
                right = float(right) * pixels_to_points
                bottom = float(bottom) * pixels_to_points
                page_data.append({
                    "text": text,
                    "x0": left,
                    "y0": bottom,  # bottom edge: used as the text baseline when overlaying
                    "x1": right,
                    "y1": top,
                    "font_size": bottom - top,  # Approximate font size from box height
                })

            extracted_data.append(page_data)

    return extracted_data


def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the original PDF using fonts from different font families.

    The font family is rotated per page. Each item is inserted at
    (item["x0"], item["y0"]) with item["font_size"] as the font size.
    Items with empty text or a non-positive font size are skipped.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: List (one entry per page) of lists of dictionaries
                           with the keys "text", "x0", "y0" and "font_size".
    :param output_pdf_path: Path to save the output PDF file.
    :raises IndexError: If extracted_data has more entries than the PDF has pages.
    """
    # Define font families for overlay
    font_families = ["Times-Roman", "Helvetica", "Courier"]  # Example font families
    # NOTE(review): these built-in fonts presumably lack Arabic glyphs - verify
    # how Arabic OCR results render and supply a suitable font file if needed.

    # The context manager closes the document even if inserting or saving fails;
    # an open handle would otherwise keep the input file locked.
    with fitz.open(pdf_path) as doc:
        for page_num, page_data in enumerate(extracted_data):
            page = doc[page_num]
            # Rotate fonts across pages; the choice is the same for every item on a page.
            font = font_families[page_num % len(font_families)]

            for item in page_data:
                # Degenerate OCR boxes cannot be rendered meaningfully.
                if not item["text"] or item["font_size"] <= 0:
                    continue
                page.insert_text(
                    (item["x0"], item["y0"]),
                    item["text"],
                    fontsize=item["font_size"],
                    fontname=font,
                    color=(0, 0, 0),  # Black text
                    render_mode=0,  # Ensure text is not outlined
                )

        doc.save(output_pdf_path)

    print(f"PDF saved to: {output_pdf_path}")


def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using OCR and overlay it as editable text.

    The upload is copied to a temporary PDF file, which is removed again
    whether or not processing succeeds.

    :param uploaded_pdf: The uploaded PDF file (any object with a read() method
                         returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False because the file is reopened by path after this handle is closed.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())

        # Step 1: Extract text using OCR
        extracted_data = extract_text_with_positions_using_ocr(temp_pdf_path)

        # Step 2: Overlay extracted text onto the original PDF
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Cleanup temporary file, also when reading, OCR or overlay raised.
        try:
            os.remove(temp_pdf_path)
        except FileNotFoundError:
            pass


# Streamlit App
def main():
    """
    Streamlit page: upload a PDF, run OCR and overlay, and offer the result for download.

    The processed PDF is written into a per-run temporary directory, so
    concurrent sessions cannot overwrite or delete each other's output, and
    the directory is removed even if processing fails.
    """
    st.title("PDF Text OCR and Overlay Tool")
    st.write("Upload a PDF to extract and overlay text as editable layers with different fonts.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # A fresh directory per run replaces the fixed filename in the working directory.
    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "corrected_output_with_fonts.pdf")

        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)

        # Read the result into memory so the file can be deleted right away.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Corrected PDF",
        data=pdf_bytes,
        file_name="corrected_output_with_fonts.pdf",
        mime="application/pdf",
    )


# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()