|
import os |
|
import fitz |
|
import easyocr |
|
import tempfile |
|
import streamlit as st |
|
|
|
|
|
def extract_text_with_positions_using_ocr(pdf_path, dpi=300):
    """
    Extract text with bounding box positions using OCR for both English and Arabic text.

    Every page is rendered to a raster image at ``dpi`` and passed to EasyOCR.
    EasyOCR reports boxes in image pixels, so all coordinates are scaled by
    ``72 / dpi`` to PDF points before they are returned.

    :param pdf_path: Path to the input PDF file.
    :param dpi: Resolution used to rasterize each page before OCR.
    :return: List with one entry per page. Each entry is a list of dictionaries
        with the keys ``text``, ``x0``, ``y0``, ``x1``, ``y1`` and ``font_size``.
        ``y0`` holds the bottom edge of the OCR box and ``y1`` the top edge;
        ``overlay_text_with_fonts`` uses ``(x0, y0)`` as the text insertion point.
    :raises ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError("dpi must be a positive number")

    # The pixmap has ``dpi`` pixels per inch while PDF coordinates use 72 units
    # per inch, so pixel positions must be shrunk by this factor.
    scale = 72.0 / dpi

    extracted_data = []
    doc = fitz.open(pdf_path)
    try:
        # Building the reader loads the OCR models; do it once, not per page.
        reader = easyocr.Reader(['en', 'ar'])

        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # Hand the PNG to EasyOCR as bytes: no temp file in the working
            # directory to collide with another run or to leak on an error.
            results = reader.readtext(pix.tobytes("png"), detail=1)

            page_data = []
            for bbox, text, _confidence in results:
                # bbox lists four corners; index 0 and index 2 are opposite
                # corners (top-left and bottom-right for upright text).
                (left, top), (right, bottom) = bbox[0], bbox[2]
                left, top = float(left) * scale, float(top) * scale
                right, bottom = float(right) * scale, float(bottom) * scale
                page_data.append({
                    "text": text,
                    "x0": left,
                    "y0": bottom,
                    "x1": right,
                    "y1": top,
                    "font_size": bottom - top,
                })

            extracted_data.append(page_data)
    finally:
        doc.close()

    return extracted_data
|
|
|
|
|
def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the original PDF using fonts from different font families.

    The font is chosen per page by cycling through Times-Roman, Helvetica and
    Courier. Each item is written with ``page.insert_text`` at ``(x0, y0)``
    using ``font_size`` as the font size; PyMuPDF interprets these values in
    PDF page units, so ``extracted_data`` must be expressed in that unit.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: List of extracted text with positions, one list of
        dictionaries per page (keys ``text``, ``x0``, ``y0``, ``font_size``).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # NOTE(review): these are PDF Base-14 fonts; they presumably cannot render
    # the Arabic text the OCR step may return. An embedded Unicode font would
    # be needed for that -- confirm against real output.
    font_families = ["Times-Roman", "Helvetica", "Courier"]

    doc = fitz.open(pdf_path)
    try:
        for page_num, page_data in enumerate(extracted_data):
            page = doc[page_num]
            # The font depends only on the page, so pick it once per page.
            font = font_families[page_num % len(font_families)]

            for item in page_data:
                # Nothing sensible can be drawn for empty text or a box with
                # no height, so skip such OCR results.
                if not item["text"] or item["font_size"] <= 0:
                    continue
                page.insert_text(
                    (item["x0"], item["y0"]),
                    item["text"],
                    fontsize=item["font_size"],
                    fontname=font,
                    color=(0, 0, 0),
                    render_mode=0,
                )

        doc.save(output_pdf_path)
    finally:
        # Release the file handle even when insertion or saving fails.
        doc.close()

    print(f"PDF saved to: {output_pdf_path}")
|
|
|
|
|
def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using OCR and overlay it as editable text.

    The upload is copied to a temporary ``.pdf`` file, run through
    ``extract_text_with_positions_using_ocr`` and ``overlay_text_with_fonts``,
    and the temporary file is removed afterwards, whether or not processing
    succeeded.

    :param uploaded_pdf: The uploaded PDF file (any object with a ``read()``
        method returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False: the file has to survive being closed so that it can be
    # reopened by path in the steps below.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())

        extracted_data = extract_text_with_positions_using_ocr(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Always clean up, including when OCR or the overlay step raises.
        try:
            os.remove(temp_pdf_path)
        except FileNotFoundError:
            # Already gone; there is nothing left to clean up.
            pass
|
|
|
|
|
|
|
def main():
    """
    Streamlit entry point: upload a PDF, process it and offer the result for download.

    The processed PDF is written to a unique temporary file, read into memory
    and deleted again, so concurrent sessions never share an output path and
    no file is left behind when processing fails.
    """
    st.title("PDF Text OCR and Overlay Tool")
    st.write("Upload a PDF to extract and overlay text as editable layers with different fonts.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # A fixed file name in the working directory would be shared by every
    # session of the app; mkstemp gives each run its own path.
    fd, output_pdf_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)
    try:
        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)

        # Read the result into memory so the file can be removed right away.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        if os.path.exists(output_pdf_path):
            os.remove(output_pdf_path)

    st.success("PDF processing complete!")

    st.download_button(
        label="Download Corrected PDF",
        data=pdf_bytes,
        file_name="corrected_output_with_fonts.pdf",
        mime="application/pdf",
    )
|
|
|
|
|
# Start the Streamlit UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()