SuriRaja committed on
Commit
aab2516
·
verified ·
1 Parent(s): fa89e59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -20
app.py CHANGED
@@ -1,32 +1,40 @@
1
  import os
2
- from pdf2image import convert_from_path
3
- import easyocr
4
  import fitz # PyMuPDF
 
5
  from PIL import Image
6
  import streamlit as st
7
- import tempfile
8
 
9
  # Step 1: Convert PDF to Images
10
  def pdf_to_images(pdf_path, output_folder, dpi=300):
11
- """Convert PDF pages to high-resolution images."""
12
- images = convert_from_path(pdf_path, dpi=dpi)
13
  image_paths = []
14
- for i, image in enumerate(images):
15
- image_path = f"{output_folder}/page_{i+1}.png"
16
- image.save(image_path, "PNG")
 
 
 
 
17
  image_paths.append(image_path)
 
18
  return image_paths
19
 
 
20
  # Step 2: Perform OCR on Images
21
  def extract_text_from_images(image_paths):
22
  """Perform OCR on images to extract text."""
23
  reader = easyocr.Reader(["en", "ar"], gpu=False) # Add languages as needed
24
  text_data = []
 
25
  for image_path in image_paths:
26
  results = reader.readtext(image_path, detail=1)
27
  text_data.append(results)
 
28
  return text_data
29
 
 
30
  # Step 3: Rebuild the PDF with Extracted Text
31
  def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
32
  """Overlay extracted text onto the original PDF."""
@@ -34,6 +42,7 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
34
  output_pdf = fitz.open()
35
 
36
  for page_num, page in enumerate(pdf):
 
37
  pix = page.get_pixmap(dpi=300)
38
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
39
 
@@ -55,38 +64,38 @@ def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
55
  # Save rebuilt PDF
56
  output_pdf.save(output_pdf_path)
57
 
 
58
  # Full Workflow
59
  def process_pdf(uploaded_pdf, output_folder):
60
  """Full process: PDF to images, OCR, and rebuild."""
61
  os.makedirs(output_folder, exist_ok=True)
62
 
63
- # Step 1: Save UploadedFile to a Temporary Location
64
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
65
- temp_file.write(uploaded_pdf.read())
66
- temp_pdf_path = temp_file.name # Path to the temporary PDF file
67
-
68
- # Step 2: Rasterize PDF
69
  print("Converting PDF to images...")
70
- image_paths = pdf_to_images(temp_pdf_path, output_folder)
71
 
72
- # Step 3: Extract Text via OCR
73
  print("Performing OCR on images...")
74
  text_data = extract_text_from_images(image_paths)
75
 
76
- # Step 4: Rebuild the PDF
77
  output_pdf_path = "rebuilt_output.pdf"
78
  print("Rebuilding the PDF with extracted text...")
79
- rebuild_pdf(temp_pdf_path, text_data, output_pdf_path)
80
 
81
  return output_pdf_path
82
 
83
- # Streamlit App for Upload and Processing
 
84
  st.title("PDF Text Extraction and Rebuild")
85
  uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
86
  if uploaded_pdf:
87
  output_folder = "temp_images"
88
  st.write("Processing your PDF...")
89
- output_pdf_path = process_pdf(uploaded_pdf, output_folder)
 
 
 
90
  st.write("Download the processed PDF below:")
91
  with open(output_pdf_path, "rb") as f:
92
  st.download_button("Download PDF", f, file_name="processed_output.pdf")
 
1
  import os
 
 
2
  import fitz # PyMuPDF
3
+ import easyocr
4
  from PIL import Image
5
  import streamlit as st
6
+
7
 
8
  # Step 1: Convert PDF to Images
9
  def pdf_to_images(pdf_path, output_folder, dpi=300):
10
+ """Convert PDF pages to high-resolution images using PyMuPDF."""
11
+ pdf_document = fitz.open(pdf_path)
12
  image_paths = []
13
+
14
+ for page_num in range(len(pdf_document)):
15
+ page = pdf_document[page_num]
16
+ # Render page as an image
17
+ pix = page.get_pixmap(dpi=dpi)
18
+ image_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
19
+ pix.save(image_path)
20
  image_paths.append(image_path)
21
+
22
  return image_paths
23
 
24
+
25
  # Step 2: Perform OCR on Images
26
  def extract_text_from_images(image_paths):
27
  """Perform OCR on images to extract text."""
28
  reader = easyocr.Reader(["en", "ar"], gpu=False) # Add languages as needed
29
  text_data = []
30
+
31
  for image_path in image_paths:
32
  results = reader.readtext(image_path, detail=1)
33
  text_data.append(results)
34
+
35
  return text_data
36
 
37
+
38
  # Step 3: Rebuild the PDF with Extracted Text
39
  def rebuild_pdf(original_pdf_path, text_data, output_pdf_path):
40
  """Overlay extracted text onto the original PDF."""
 
42
  output_pdf = fitz.open()
43
 
44
  for page_num, page in enumerate(pdf):
45
+ # Render page as an image for reference
46
  pix = page.get_pixmap(dpi=300)
47
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
48
 
 
64
  # Save rebuilt PDF
65
  output_pdf.save(output_pdf_path)
66
 
67
+
68
  # Full Workflow
69
  def process_pdf(uploaded_pdf, output_folder):
70
  """Full process: PDF to images, OCR, and rebuild."""
71
  os.makedirs(output_folder, exist_ok=True)
72
 
73
+ # Step 1: Convert PDF to Images
 
 
 
 
 
74
  print("Converting PDF to images...")
75
+ image_paths = pdf_to_images(uploaded_pdf, output_folder)
76
 
77
+ # Step 2: Perform OCR
78
  print("Performing OCR on images...")
79
  text_data = extract_text_from_images(image_paths)
80
 
81
+ # Step 3: Rebuild the PDF
82
  output_pdf_path = "rebuilt_output.pdf"
83
  print("Rebuilding the PDF with extracted text...")
84
+ rebuild_pdf(uploaded_pdf, text_data, output_pdf_path)
85
 
86
  return output_pdf_path
87
 
88
+
89
+ # Streamlit App
90
  st.title("PDF Text Extraction and Rebuild")
91
  uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])
92
  if uploaded_pdf:
93
  output_folder = "temp_images"
94
  st.write("Processing your PDF...")
95
+ with open("temp_uploaded.pdf", "wb") as temp_file:
96
+ temp_file.write(uploaded_pdf.read())
97
+
98
+ output_pdf_path = process_pdf("temp_uploaded.pdf", output_folder)
99
  st.write("Download the processed PDF below:")
100
  with open(output_pdf_path, "rb") as f:
101
  st.download_button("Download PDF", f, file_name="processed_output.pdf")