Spaces:

Teapack1
/

RAG-Retrieve-Ingest-cz-eng

Sleeping

App Files Files Community

RAG-Retrieve-Ingest-cz-eng / support /images-text.py

Teapack1

initial

99afe26 12 months ago

raw

history blame

1.93 kB

	import fitz # PyMuPDF
	from PIL import Image
	import pytesseract
	import io
	import os

	def extract_images_from_pdf(pdf_path, output_folder, dpi=300):
	    """Render every page of a PDF to a JPEG file and return the JPEG bytes.

	    Each page is rasterised with PyMuPDF, converted to JPEG with PIL, written
	    to ``<output_folder>/page_<n>.jpg`` (``n`` starts at 1) and the same
	    encoded bytes are collected so they can be fed to OCR afterwards.

	    Args:
	        pdf_path: Path of the PDF document to open.
	        output_folder: Directory that receives the page images. It is
	            created if it does not exist yet; an empty string means the
	            current working directory.
	        dpi: Target rendering resolution. The page is scaled by ``dpi / 72``
	            on both axes (72 being the resolution PyMuPDF renders at when
	            no scaling matrix is given).

	    Returns:
	        A list with one ``bytes`` object (JPEG data) per page, in page order.
	    """
	    # Creating the folder here keeps the function usable on its own; it is a
	    # no-op when the caller has already created it.
	    if output_folder:
	        os.makedirs(output_folder, exist_ok=True)

	    # The scaling matrix is the same for every page, so build it once.
	    zoom = dpi / 72
	    matrix = fitz.Matrix(zoom, zoom)

	    images = []
	    # The context manager closes the document even if rendering a page fails.
	    with fitz.open(pdf_path) as doc:
	        for page_number, page in enumerate(doc, start=1):
	            pix = page.get_pixmap(matrix=matrix)
	            ppm_bytes = pix.tobytes("ppm")

	            # Convert PPM to JPEG using PIL. Encode once and reuse the bytes
	            # for both the file on disk and the in-memory copy for OCR.
	            jpeg_buffer = io.BytesIO()
	            with Image.open(io.BytesIO(ppm_bytes)) as image:
	                image.save(jpeg_buffer, format="JPEG")
	            jpeg_bytes = jpeg_buffer.getvalue()

	            output_filename = os.path.join(output_folder, f"page_{page_number}.jpg")
	            with open(output_filename, "wb") as jpeg_file:
	                jpeg_file.write(jpeg_bytes)

	            images.append(jpeg_bytes)

	    print(f"Images saved to {output_folder}")
	    return images

	def ocr_images(images, lang='ces'):
	    """Run Tesseract OCR over a sequence of encoded images.

	    Args:
	        images: Iterable of encoded image data (``bytes``), one item per
	            image, in a format PIL can open.
	        lang: Tesseract language code handed to
	            ``pytesseract.image_to_string``. Defaults to ``'ces'`` (Czech),
	            the value that used to be hard-coded; Tesseract also accepts
	            combined codes such as ``'ces+eng'``.

	    Returns:
	        A list of recognised text strings, one per input image, in input
	        order. An empty input yields an empty list.
	    """
	    text_from_images = []
	    for image_bytes in images:
	        # Close each decoded image as soon as it has been transcribed so
	        # large page renders do not pile up in memory.
	        with Image.open(io.BytesIO(image_bytes)) as image:
	            text_from_images.append(pytesseract.image_to_string(image, lang=lang))
	    return text_from_images

	def save_text_to_file(text, file_path):
	    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

	    Args:
	        text: The string to store.
	        file_path: Destination path; the file is created or truncated.
	    """
	    with open(file_path, mode='w', encoding='utf-8') as sink:
	        sink.write(text)

	# Path to your PDF file and output locations
	pdf_path = "norm.pdf"
	output_folder = "extracted_images"
	output_text_file = "ocr_results.txt"


	def main():
	    """Run the PDF -> page images -> OCR -> text file pipeline.

	    Uses the module-level ``pdf_path``, ``output_folder`` and
	    ``output_text_file`` settings.
	    """
	    # Ensure the output directory exists
	    os.makedirs(output_folder, exist_ok=True)

	    # Extract images from the PDF and save them as JPEG
	    images = extract_images_from_pdf(pdf_path, output_folder)

	    # Transcribe text from the extracted images (ocr_images picks the OCR language)
	    transcribed_texts = ocr_images(images)

	    # Combine all texts into a single string, one page per line break
	    combined_text = "\n".join(transcribed_texts)

	    # Save OCR results to a text file
	    save_text_to_file(combined_text, output_text_file)

	    print(f"OCR results saved to {output_text_file}")


	# Only run the pipeline when executed as a script, so the helper functions
	# can be loaded without opening the PDF or invoking Tesseract.
	if __name__ == "__main__":
	    main()