Spaces:

SuriRaja
/

Curves_to_text

Sleeping

App Files Files Community

SuriRaja committed on Jan 7

Commit

aab2516

verified ·

1 Parent(s): fa89e59

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -20

app.py CHANGED Viewed

@@ -1,32 +1,40 @@
 import os
-from pdf2image import convert_from_path
-import easyocr
 import fitz  # PyMuPDF
 from PIL import Image
 import streamlit as st
-import tempfile
 # Step 1: Convert PDF to Images
 def pdf_to_images(pdf_path, output_folder, dpi=300):
-    """Convert PDF pages to high-resolution images."""
-    images = convert_from_path(pdf_path, dpi=dpi)
     image_paths = []
-    for i, image in enumerate(images):
-        image_path = f"{output_folder}/page_{i+1}.png"
-        image.save(image_path, "PNG")
         image_paths.append(image_path)
     return image_paths
 # Step 2: Perform OCR on Images
 def extract_text_from_images(image_paths):
     """Perform OCR on images to extract text."""
     reader = easyocr.Reader(["en", "ar"], gpu=False)  # Add languages as needed
     text_data = []
     for image_path in image_paths:
         results = reader.readtext(image_path, detail=1)
         text_data.append(results)
     return text_data
 # Step 3: Rebuild the PDF with Extracted Text
 def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
     """Overlay extracted text onto the original PDF."""
@@ -34,6 +42,7 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
         output_pdf = fitz.open()
         for page_num, page in enumerate(pdf):
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -55,38 +64,38 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
         # Save rebuilt PDF
         output_pdf.save(output_pdf_path)
 # Full Workflow
 def process_pdf(uploaded_pdf, output_folder):
     """Full process: PDF to images, OCR, and rebuild."""
     os.makedirs(output_folder, exist_ok=True)
-    # Step 1: Save UploadedFile to a Temporary Location
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-        temp_file.write(uploaded_pdf.read())
-        temp_pdf_path = temp_file.name  # Path to the temporary PDF file
-    # Step 2: Rasterize PDF
     print("Converting PDF to images...")
-    image_paths = pdf_to_images(temp_pdf_path, output_folder)
-    # Step 3: Extract Text via OCR
     print("Performing OCR on images...")
     text_data = extract_text_from_images(image_paths)
-    # Step 4: Rebuild the PDF
     output_pdf_path = "rebuilt_output.pdf"
     print("Rebuilding the PDF with extracted text...")
-    rebuild_pdf(temp_pdf_path, text_data, output_pdf_path)
     return output_pdf_path
-# Streamlit App for Upload and Processing
 st.title("PDF Text Extraction and Rebuild")
 uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
 if uploaded_pdf:
     output_folder = "temp_images"
     st.write("Processing your PDF...")
-    output_pdf_path = process_pdf(uploaded_pdf, output_folder)
     st.write("Download the processed PDF below:")
     with open(output_pdf_path, "rb") as f:
         st.download_button("Download PDF", f, file_name="processed_output.pdf")

 import os
 import fitz  # PyMuPDF
+import easyocr
 from PIL import Image
 import streamlit as st
 # Step 1: Convert PDF to Images
 def pdf_to_images(pdf_path, output_folder, dpi=300):
+    """Convert PDF pages to high-resolution images using PyMuPDF."""
+    pdf_document = fitz.open(pdf_path)
     image_paths = []
+    for page_num in range(len(pdf_document)):
+        page = pdf_document[page_num]
+        # Render page as an image
+        pix = page.get_pixmap(dpi=dpi)
+        image_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
+        pix.save(image_path)
         image_paths.append(image_path)
     return image_paths
 # Step 2: Perform OCR on Images
 def extract_text_from_images(image_paths):
     """Perform OCR on images to extract text."""
     reader = easyocr.Reader(["en", "ar"], gpu=False)  # Add languages as needed
     text_data = []
     for image_path in image_paths:
         results = reader.readtext(image_path, detail=1)
         text_data.append(results)
     return text_data
 # Step 3: Rebuild the PDF with Extracted Text
 def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
     """Overlay extracted text onto the original PDF."""
         output_pdf = fitz.open()
         for page_num, page in enumerate(pdf):
+            # Render page as an image for reference
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         # Save rebuilt PDF
         output_pdf.save(output_pdf_path)
 # Full Workflow
 def process_pdf(uploaded_pdf, output_folder):
     """Full process: PDF to images, OCR, and rebuild."""
     os.makedirs(output_folder, exist_ok=True)
+    # Step 1: Convert PDF to Images
     print("Converting PDF to images...")
+    image_paths = pdf_to_images(uploaded_pdf, output_folder)
+    # Step 2: Perform OCR
     print("Performing OCR on images...")
     text_data = extract_text_from_images(image_paths)
+    # Step 3: Rebuild the PDF
     output_pdf_path = "rebuilt_output.pdf"
     print("Rebuilding the PDF with extracted text...")
+    rebuild_pdf(uploaded_pdf, text_data, output_pdf_path)
     return output_pdf_path
+# Streamlit App
 st.title("PDF Text Extraction and Rebuild")
 uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
 if uploaded_pdf:
     output_folder = "temp_images"
     st.write("Processing your PDF...")
+    with open("temp_uploaded.pdf", "wb") as temp_file:
+        temp_file.write(uploaded_pdf.read())
+    output_pdf_path = process_pdf("temp_uploaded.pdf", output_folder)
     st.write("Download the processed PDF below:")
     with open(output_pdf_path, "rb") as f:
         st.download_button("Download PDF", f, file_name="processed_output.pdf")