Spaces:
Sleeping
Sleeping
import fitz # PyMuPDF | |
from PIL import Image | |
import pytesseract | |
import io | |
import os | |
def extract_images_from_pdf(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a JPEG file and return the JPEG bytes.

    Each page is rasterised, written to ``<output_folder>/page_<n>.jpg``
    (``n`` starts at 1) and also kept in memory so the caller can run OCR on
    it without re-reading the file from disk.

    Args:
        pdf_path: Path of the PDF document to open.
        output_folder: Directory that receives the page images. It is created
            if it does not exist yet.
        dpi: Rendering resolution. The page is scaled by ``dpi / 72`` in both
            directions before it is rasterised.

    Returns:
        A list with one JPEG-encoded ``bytes`` object per page, in page order.
    """
    # Do not rely on the caller having prepared the target directory.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # The scale factor is the same for every page, so build the matrix once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=matrix)

            # Convert the PPM rendering to JPEG exactly once; the same bytes
            # are written to disk and returned for OCR.
            with Image.open(io.BytesIO(pix.tobytes("ppm"))) as image:
                buffer = io.BytesIO()
                image.save(buffer, format="JPEG")
            jpeg_bytes = buffer.getvalue()

            output_filename = os.path.join(output_folder, f"page_{page_number}.jpg")
            with open(output_filename, "wb") as jpeg_file:
                jpeg_file.write(jpeg_bytes)

            images.append(jpeg_bytes)

    print(f"Images saved to {output_folder}")
    return images
def ocr_images(images, lang='ces'):
    """Run Tesseract OCR over a sequence of encoded images.

    Args:
        images: Iterable of encoded image data (``bytes``), one item per
            image, in a format PIL can open.
        lang: Tesseract language code passed to
            ``pytesseract.image_to_string``. Defaults to ``'ces'`` (Czech),
            which is what this function always used before the parameter
            existed.

    Returns:
        A list of recognised text strings, one per input image, in the same
        order as ``images``. An empty input yields an empty list.
    """
    text_from_images = []
    for image_bytes in images:
        # Close each decoded image as soon as its text has been extracted.
        with Image.open(io.BytesIO(image_bytes)) as image:
            text_from_images.append(
                pytesseract.image_to_string(image, lang=lang)
            )
    return text_from_images
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

    Args:
        text: The string to store.
        file_path: Destination path; the file is opened in write mode, so
            previous content is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as destination:
        destination.write(text)
# --- Configuration: the input PDF and where the results are written ---
pdf_path = "norm.pdf"
output_folder = "extracted_images"
output_text_file = "ocr_results.txt"

# The page images are written into this folder, so create it up front.
os.makedirs(output_folder, exist_ok=True)

# Step 1: render each PDF page to a JPEG on disk and keep the bytes in memory.
images = extract_images_from_pdf(pdf_path, output_folder)

# Step 2: run OCR on every page image (Czech language model).
transcribed_texts = ocr_images(images)

# Step 3: merge the per-page texts, one page after another, and store them.
combined_text = "\n".join(transcribed_texts)
save_text_to_file(combined_text, output_text_file)

print(f"OCR results saved to {output_text_file}")