Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,32 +1,40 @@
|
|
1 |
import os
|
2 |
-
from pdf2image import convert_from_path
|
3 |
-
import easyocr
|
4 |
import fitz # PyMuPDF
|
|
|
5 |
from PIL import Image
|
6 |
import streamlit as st
|
7 |
-
|
8 |
|
9 |
# Step 1: Convert PDF to Images
|
10 |
def pdf_to_images(pdf_path, output_folder, dpi=300):
|
11 |
-
"""Convert PDF pages to high-resolution images."""
|
12 |
-
|
13 |
image_paths = []
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
17 |
image_paths.append(image_path)
|
|
|
18 |
return image_paths
|
19 |
|
|
|
20 |
# Step 2: Perform OCR on Images
|
21 |
def extract_text_from_images(image_paths):
    """Perform OCR on images to extract text.

    Args:
        image_paths: Iterable of image inputs; each item is passed
            unchanged to ``reader.readtext``.

    Returns:
        A list with one entry per input image, in input order. Each entry
        is the value returned by ``reader.readtext(image_path, detail=1)``.
        An empty input yields an empty list.
    """
    # Materialize once so generators are supported and emptiness can be tested.
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to read: return before constructing the OCR reader at all.
        return []

    reader = easyocr.Reader(["en", "ar"], gpu=False)  # Add languages as needed
    return [reader.readtext(image_path, detail=1) for image_path in image_paths]
|
29 |
|
|
|
30 |
# Step 3: Rebuild the PDF with Extracted Text
|
31 |
def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
32 |
"""Overlay extracted text onto the original PDF."""
|
@@ -34,6 +42,7 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
|
34 |
output_pdf = fitz.open()
|
35 |
|
36 |
for page_num, page in enumerate(pdf):
|
|
|
37 |
pix = page.get_pixmap(dpi=300)
|
38 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
39 |
|
@@ -55,38 +64,38 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
|
55 |
# Save rebuilt PDF
|
56 |
output_pdf.save(output_pdf_path)
|
57 |
|
|
|
58 |
# Full Workflow
|
59 |
def process_pdf(uploaded_pdf, output_folder):
|
60 |
"""Full process: PDF to images, OCR, and rebuild."""
|
61 |
os.makedirs(output_folder, exist_ok=True)
|
62 |
|
63 |
-
# Step 1:
|
64 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
65 |
-
temp_file.write(uploaded_pdf.read())
|
66 |
-
temp_pdf_path = temp_file.name # Path to the temporary PDF file
|
67 |
-
|
68 |
-
# Step 2: Rasterize PDF
|
69 |
print("Converting PDF to images...")
|
70 |
-
image_paths = pdf_to_images(
|
71 |
|
72 |
-
# Step
|
73 |
print("Performing OCR on images...")
|
74 |
text_data = extract_text_from_images(image_paths)
|
75 |
|
76 |
-
# Step
|
77 |
output_pdf_path = "rebuilt_output.pdf"
|
78 |
print("Rebuilding the PDF with extracted text...")
|
79 |
-
rebuild_pdf(
|
80 |
|
81 |
return output_pdf_path
|
82 |
|
83 |
-
|
|
|
84 |
st.title("PDF Text Extraction and Rebuild")
|
85 |
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
|
86 |
if uploaded_pdf:
|
87 |
output_folder = "temp_images"
|
88 |
st.write("Processing your PDF...")
|
89 |
-
|
|
|
|
|
|
|
90 |
st.write("Download the processed PDF below:")
|
91 |
with open(output_pdf_path, "rb") as f:
|
92 |
st.download_button("Download PDF", f, file_name="processed_output.pdf")
|
|
|
1 |
import os
|
|
|
|
|
2 |
import fitz # PyMuPDF
|
3 |
+
import easyocr
|
4 |
from PIL import Image
|
5 |
import streamlit as st
|
6 |
+
|
7 |
|
8 |
# Step 1: Convert PDF to Images
|
9 |
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Convert PDF pages to high-resolution images using PyMuPDF.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        output_folder: Directory that receives the rendered pages. It is
            created when it does not exist yet.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        A list with the path of every image written, in page order. Files
        are named ``page_<n>.png`` with ``n`` starting at 1.
    """
    # Do not rely on the caller having created the folder. An empty string
    # means "current directory" for os.path.join, so there is nothing to create.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    # The context manager closes the document even when rendering or saving
    # a page raises, so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            # Render page as an image
            pix = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(output_folder, f"page_{page_num}.png")
            pix.save(image_path)
            image_paths.append(image_path)

    return image_paths
|
23 |
|
24 |
+
|
25 |
# Step 2: Perform OCR on Images
|
26 |
def extract_text_from_images(image_paths):
    """Perform OCR on images to extract text.

    Args:
        image_paths: Iterable of image inputs; each item is passed
            unchanged to ``reader.readtext``.

    Returns:
        A list with one entry per input image, in input order. Each entry
        is the value returned by ``reader.readtext(image_path, detail=1)``.
        An empty input yields an empty list.
    """
    # Materialize once so generators are supported and emptiness can be tested.
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to read: return before constructing the OCR reader at all.
        return []

    reader = easyocr.Reader(["en", "ar"], gpu=False)  # Add languages as needed
    return [reader.readtext(image_path, detail=1) for image_path in image_paths]
|
36 |
|
37 |
+
|
38 |
# Step 3: Rebuild the PDF with Extracted Text
|
39 |
def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
|
40 |
"""Overlay extracted text onto the original PDF."""
|
|
|
42 |
output_pdf = fitz.open()
|
43 |
|
44 |
for page_num, page in enumerate(pdf):
|
45 |
+
# Render page as an image for reference
|
46 |
pix = page.get_pixmap(dpi=300)
|
47 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
48 |
|
|
|
64 |
# Save rebuilt PDF
|
65 |
output_pdf.save(output_pdf_path)
|
66 |
|
67 |
+
|
68 |
# Full Workflow
|
69 |
def process_pdf(uploaded_pdf, output_folder):
    """Run the full pipeline: PDF to images, OCR, then rebuild.

    Args:
        uploaded_pdf: The PDF to process; forwarded unchanged to
            ``pdf_to_images`` and ``rebuild_pdf``.
        output_folder: Directory for the rendered page images; created
            if it does not exist.

    Returns:
        The path of the rebuilt PDF, which is always
        ``"rebuilt_output.pdf"``.
    """
    output_pdf_path = "rebuilt_output.pdf"
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Convert PDF to Images
    print("Converting PDF to images...")
    page_images = pdf_to_images(uploaded_pdf, output_folder)

    # Step 2: Perform OCR
    print("Performing OCR on images...")
    ocr_results = extract_text_from_images(page_images)

    # Step 3: Rebuild the PDF
    print("Rebuilding the PDF with extracted text...")
    rebuild_pdf(uploaded_pdf, ocr_results, output_pdf_path)

    return output_pdf_path
|
87 |
|
88 |
+
|
89 |
+
# Streamlit App: upload a PDF, run it through process_pdf, offer the result.
st.title("PDF Text Extraction and Rebuild")
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_pdf:
    output_folder = "temp_images"
    st.write("Processing your PDF...")

    # Write the upload to disk first: process_pdf is handed a file path,
    # not the uploaded file object.
    with open("temp_uploaded.pdf", "wb") as tmp:
        tmp.write(uploaded_pdf.read())

    output_pdf_path = process_pdf("temp_uploaded.pdf", output_folder)

    st.write("Download the processed PDF below:")
    with open(output_pdf_path, "rb") as rebuilt:
        st.download_button("Download PDF", rebuilt, file_name="processed_output.pdf")
|