Spaces:

pierreguillou
/

arquiteturia

Sleeping

App Files Files Community

arquiteturia / helpers /text_extraction.py

pierreguillou

Create text_extraction.py

c1c25ed verified about 1 month ago

raw

history blame contribute delete

3.31 kB

	## PDF processing up to text extraction

	import os
	import shutil
	import fitz
	from PIL import Image
	import numpy as np
	import cv2
	import pytesseract
	from pytesseract import Output
	import zipfile
	from pdf2image import convert_from_path
	import json

	def convert_to_rgb(image_path):
	    """Open an image and return a copy of it converted to RGB mode.

	    Args:
	        image_path: Whatever ``PIL.Image.open`` accepts as its first
	            argument (this function passes it through unchanged).

	    Returns:
	        The result of ``convert("RGB")`` on the opened image.
	    """
	    # Image.open keeps the underlying file open after returning. The
	    # context manager releases it as soon as convert() has produced the
	    # converted copy, instead of leaving the handle to the garbage collector.
	    with Image.open(image_path) as img:
	        return img.convert("RGB")

	def preprocess_image(image):
	    """Binarise, denoise and upscale an image with OpenCV.

	    Steps, in order: grayscale conversion, Otsu binarisation, non-local
	    means denoising, and a 2x bicubic enlargement.

	    Args:
	        image: Input array for ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``
	            (presumably a 3-channel BGR image; confirm against callers).

	    Returns:
	        The enlarged single-channel image produced by ``cv2.resize``.
	    """
	    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

	    # With THRESH_OTSU the threshold argument (0) is a placeholder; only
	    # the thresholded image (second element of the result) is kept.
	    otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
	    binarised = cv2.threshold(grayscale, 0, 255, otsu_mode)[1]

	    cleaned = cv2.fastNlMeansDenoising(binarised, None, 30, 7, 21)

	    # fx=2 / fy=2 double both dimensions; no explicit output size is given.
	    return cv2.resize(cleaned, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

	def extract_vertical_blocks(image):
	    """OCR an image with Tesseract and group the words into vertical blocks.

	    Words are visited in the order ``pytesseract.image_to_data`` reports
	    them, and only entries whose confidence (truncated to an integer) is
	    positive are kept. A new block starts whenever a word's top edge lies
	    more than one line height below the bottom edge of the previously kept
	    word. The line height is 1.2 times the height of the first kept word
	    that has a non-zero height.

	    Args:
	        image: Anything ``np.array`` can convert to an image array
	            (``process_image`` in this file passes a PIL image).

	    Returns:
	        A list of dicts, one per block, each holding ``"text"`` (the words
	        of the block separated by single spaces, stripped) and ``"coords"``
	        (``[left, top, right, bottom]`` enclosing every word of the block).

	    Raises:
	        ValueError: If a confidence value cannot be parsed as a number.
	    """
	    image_np = np.array(image)
	    data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)

	    inf = float('inf')
	    blocks = []
	    words = []
	    box = [inf, inf, 0, 0]
	    last_bottom = -1
	    line_height = 0

	    rows = zip(
	        data['conf'], data['text'],
	        data['left'], data['top'], data['width'], data['height'],
	    )
	    for conf, text, x, y, w, h in rows:
	        # Confidence may arrive as an int or as a decimal string such as
	        # "96.54"; int() alone rejects the latter, so parse through float.
	        # Truncating keeps the same cut-off as before for integer values.
	        if int(float(conf)) <= 0:
	            continue

	        if line_height == 0:
	            line_height = h * 1.2

	        # A vertical gap larger than one line height closes the current block.
	        if y > last_bottom + line_height and words:
	            blocks.append({"text": " ".join(words).strip(), "coords": box})
	            words = []
	            box = [inf, inf, 0, 0]

	        words.append(text)
	        box[0] = min(box[0], x)
	        box[1] = min(box[1], y)
	        box[2] = max(box[2], x + w)
	        box[3] = max(box[3], y + h)

	        last_bottom = y + h

	    # Flush the block that was still being built when the words ran out.
	    if words:
	        blocks.append({"text": " ".join(words).strip(), "coords": box})

	    return blocks

	def draw_blocks_on_image(image_path, blocks, output_path):
	    """Outline every block on the image and write the result to disk.

	    Args:
	        image_path: Path of the image read with ``cv2.imread``.
	        blocks: Iterable of dicts whose ``'coords'`` entry is indexable as
	            ``[left, top, right, bottom]``.
	        output_path: Path the annotated image is written to.

	    Returns:
	        ``output_path``, unchanged.
	    """
	    canvas = cv2.imread(image_path)
	    outline_color = (0, 0, 255)  # red, given the BGR order of cv2.imread images
	    outline_width = 2

	    for entry in blocks:
	        box = entry['coords']
	        top_left = (box[0], box[1])
	        bottom_right = (box[2], box[3])
	        cv2.rectangle(canvas, top_left, bottom_right, outline_color, outline_width)

	    cv2.imwrite(output_path, canvas)
	    return output_path

	def process_image(image, output_folder, page_number):
	    """OCR one page image and save both the page and its annotated copy.

	    The page is converted to RGB, its text blocks are extracted, the RGB
	    page is saved as ``page_<page_number + 1>.png`` in ``output_folder``,
	    and a copy with the blocks outlined is saved next to it with an
	    ``annotated_`` prefix.

	    Args:
	        image: Passed straight to ``convert_to_rgb``.
	        output_folder: Directory both PNG files are written to.
	        page_number: Page index; 1 is added to it for the file name.

	    Returns:
	        A ``(blocks, annotated_image_path)`` tuple.
	    """
	    rgb_page = convert_to_rgb(image)
	    page_blocks = extract_vertical_blocks(rgb_page)

	    page_file = f'page_{page_number + 1}.png'
	    page_path = os.path.join(output_folder, page_file)
	    rgb_page.save(page_path)

	    annotated_path = draw_blocks_on_image(
	        page_path,
	        page_blocks,
	        os.path.join(output_folder, f'annotated_{page_file}'),
	    )
	    return page_blocks, annotated_path

	def save_extracted_text(blocks, page_number, output_folder):
	    """Append the text of one page to ``extracted_text.txt``.

	    The page is written as a ``[PAGE n]`` header line, one line per block,
	    then a ``[FIN DE PAGE]`` footer followed by a blank line. The file is
	    opened in append mode, so successive calls accumulate pages.

	    Args:
	        blocks: Iterable of dicts with a ``'text'`` string entry.
	        page_number: Value printed in the page header, as given.
	        output_folder: Directory containing ``extracted_text.txt``.

	    Returns:
	        The path of the text file.
	    """
	    destination = os.path.join(output_folder, 'extracted_text.txt')
	    with open(destination, 'a', encoding='utf-8') as out:
	        out.write(f"[PAGE {page_number}]\n")
	        # Generator keeps the writes lazy and in the same order as a loop.
	        out.writelines(entry['text'] + "\n" for entry in blocks)
	        out.write("[FIN DE PAGE]\n\n")
	    return destination