Teapack1's picture
initial
99afe26
raw
history blame
1.93 kB
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import io
import os
def extract_images_from_pdf(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a JPEG file and return the JPEG data.

    Each page is rasterized with PyMuPDF, converted to JPEG with Pillow and
    written to ``<output_folder>/page_<N>.jpg`` where ``N`` starts at 1.

    Args:
        pdf_path: Path of the PDF document to open.
        output_folder: Directory that receives the page images. It is
            created when it does not exist yet.
        dpi: Target rendering resolution. The page is scaled by
            ``dpi / 72`` in both directions.

    Returns:
        A list with one ``bytes`` object per page, in page order, holding
        the same JPEG data that was written to disk.
    """
    # Create the target directory up front so the function is usable on its
    # own; an empty folder name means "current directory" and needs nothing.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # The scaling matrix is identical for every page, so build it once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even when rendering fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=matrix)

            # Pillow reads the PPM dump of the pixmap and re-encodes it as
            # JPEG. Encode once into memory and reuse those bytes for both
            # the file on disk and the returned list.
            jpeg_buffer = io.BytesIO()
            with Image.open(io.BytesIO(pix.tobytes("ppm"))) as image:
                image.save(jpeg_buffer, format="JPEG")
            jpeg_bytes = jpeg_buffer.getvalue()

            output_filename = os.path.join(output_folder, f"page_{page_num}.jpg")
            with open(output_filename, "wb") as out_file:
                out_file.write(jpeg_bytes)

            images.append(jpeg_bytes)

    print(f"Images saved to {output_folder}")
    return images
def ocr_images(images, lang='ces'):
    """Run Tesseract OCR over a sequence of encoded images.

    Args:
        images: Iterable of ``bytes`` objects, each holding one encoded
            image that Pillow can open.
        lang: Language code passed to ``pytesseract.image_to_string``.
            Defaults to ``'ces'``, the value this function previously
            hard-coded.

    Returns:
        A list of strings, one per input image and in the same order,
        containing the text returned by Tesseract. An empty input yields
        an empty list.
    """
    text_from_images = []
    for image_bytes in images:
        # Open inside a context manager so Pillow releases the image as soon
        # as OCR for this item is done.
        with Image.open(io.BytesIO(image_bytes)) as image:
            text_from_images.append(pytesseract.image_to_string(image, lang=lang))
    return text_from_images
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8.

    The file is opened in write mode, so an existing file at that path is
    overwritten.

    Args:
        text: String to store.
        file_path: Destination path of the text file.
    """
    with open(file_path, mode="w", encoding="utf-8") as out_handle:
        out_handle.write(text)
# Script configuration: the input PDF, the directory that receives the
# rendered page images, and the text file that receives the OCR output.
# NOTE(review): everything below runs at import time as well as when the file
# is executed directly; consider an `if __name__ == "__main__":` guard if this
# module is ever imported for its helper functions.
pdf_path = "norm.pdf"
output_folder = "extracted_images"
output_text_file = "ocr_results.txt"
# Create the image directory if needed; exist_ok=True keeps re-runs from
# failing when it is already there.
os.makedirs(output_folder, exist_ok=True)
# Render the PDF pages to JPEG files and keep the returned image data for OCR.
images = extract_images_from_pdf(pdf_path, output_folder)
# Run OCR over the rendered pages (ocr_images uses the Czech language data).
transcribed_texts = ocr_images(images)
# Join the per-image results into one string, separated by single newlines.
combined_text = "\n".join(transcribed_texts)
# Write the combined OCR text to the output file and report where it went.
save_text_to_file(combined_text, output_text_file)
print(f"OCR results saved to {output_text_file}")