Spaces:

AnnasBlackHat
/

Supertmarket-Receipt-Extractor

Runtime error

Supertmarket-Receipt-Extractor / inference /ocr.py

Annas Dev

return json

ffd0bdc almost 2 years ago

2.06 kB

	import csv
	import os
	import subprocess

	import pandas as pd

	def is_image(img_path):
	    """Return True when *img_path* has a supported image extension.

	    Only the file name is inspected; the file itself is never opened.
	    Matching is case-insensitive and accepts ``.jpg``, ``.jpeg`` and
	    ``.png``. Rejected paths are reported on stdout.

	    Args:
	        img_path: Path (or bare file name) to check.

	    Returns:
	        bool: True for a supported extension, False otherwise.
	    """
	    ext = os.path.splitext(img_path)[1].lower()
	    result = ext in ('.jpg', '.jpeg', '.png')
	    if not result:
	        print('NOT IMAGE: ', img_path)
	    return result

	def run_tesseract_on_image(image_path):  # -> tsv output path
	    """Run the ``tesseract`` CLI on one image and return its TSV output path.

	    The output is written to ``tmp/<name>.tsv``, where ``<name>`` is the
	    image's base name truncated at its first dot.

	    Args:
	        image_path: Path of the image file handed to ``tesseract``.

	    Returns:
	        str: The relative path ``tmp/<name>.tsv``.

	    Raises:
	        ValueError: If ``tesseract`` cannot be started, the output directory
	            cannot be prepared, or ``tesseract`` exits with a non-zero status.
	    """
	    print('--- run tesseract on ', image_path)
	    # Keep everything before the first dot. Unlike slicing with str.find(),
	    # this does not chop the last character when the name has no dot.
	    image_name = os.path.basename(image_path).split('.', 1)[0]
	    output_base = f"tmp/{image_name}"
	    error_message = 'Tesseract OCR Error please verify image format PNG,JPG,JPEG'
	    try:
	        # Make sure the output directory exists before tesseract writes to it.
	        os.makedirs('tmp', exist_ok=True)
	        # An argument list (no shell) keeps quotes and shell metacharacters
	        # in the path from being interpreted as part of a command line.
	        completed = subprocess.run(
	            ['tesseract', image_path, output_base, '-l', 'eng', 'tsv'],
	            check=False,
	        )
	    except OSError as err:
	        # E.g. the tesseract binary is missing. Callers expect ValueError.
	        raise ValueError(error_message) from err
	    if completed.returncode != 0:
	        raise ValueError(error_message)
	    return f"{output_base}.tsv"


	def clean_tesseract_output(tsv_output_path):
	    """Load a Tesseract TSV file and return its non-empty words with boxes.

	    Rows with a missing value in any column and rows whose text is only
	    whitespace are discarded.

	    Args:
	        tsv_output_path: Path of a tab-separated file that has at least the
	            columns ``left``, ``top``, ``width``, ``height`` and ``text``.

	    Returns:
	        list[dict]: One dict per kept row, in file order, with keys
	        ``'word_text'`` (the recognised text) and ``'word_box'``
	        (``[left, top, left + width, top + height]``).
	    """
	    print('clean tesseract output for: ', tsv_output_path)
	    ocr_df = pd.read_csv(
	        tsv_output_path,
	        sep='\t',
	        # OCR text may contain a stray '"'; it must not start a quoted field.
	        quoting=csv.QUOTE_NONE,
	        # Only an empty field counts as missing, so recognised words such as
	        # "NA" or "null" are kept instead of being parsed as NaN and dropped.
	        keep_default_na=False,
	        na_values=[''],
	        # Keep the text as text even when every word looks like a number.
	        dtype={'text': str},
	    )
	    ocr_df = ocr_df.dropna()
	    ocr_df = ocr_df[ocr_df['text'].str.strip() != '']

	    columns = (
	        ocr_df['text'],
	        ocr_df['left'],
	        ocr_df['top'],
	        ocr_df['width'],
	        ocr_df['height'],
	    )
	    # int() guarantees plain Python integers in the returned boxes.
	    return [
	        {
	            'word_text': text,
	            'word_box': [
	                int(left),
	                int(top),
	                int(left) + int(width),
	                int(top) + int(height),
	            ],
	        }
	        for text, left, top, width, height in zip(*columns)
	    ]


	def prepare_batch_for_inference(image_paths):
	    """Run OCR on every image path and assemble the inference batch.

	    Paths rejected by ``is_image`` are skipped. Each remaining path is passed
	    to ``run_tesseract_on_image`` and the resulting TSV file to
	    ``clean_tesseract_output``.

	    Args:
	        image_paths: Iterable of image file paths.

	    Returns:
	        dict: Three parallel lists with one entry per processed image:
	        ``"image_path"`` (the paths that were actually processed),
	        ``"bboxes"`` (list of word boxes per image) and ``"words"``
	        (list of word texts per image).
	    """
	    # Filter once and return the filtered list, so "image_path" stays
	    # index-aligned with "bboxes" and "words" when some inputs are skipped.
	    valid_image_paths = [
	        image_path for image_path in image_paths if is_image(image_path)
	    ]

	    # tesseract_outputs is a list of TSV paths
	    tesseract_outputs = [
	        run_tesseract_on_image(image_path) for image_path in valid_image_paths
	    ]
	    print('tesseract has run on all images...')

	    # clean_outputs is a list of lists of word dicts
	    clean_outputs = [
	        clean_tesseract_output(tsv_path) for tsv_path in tesseract_outputs
	    ]
	    word_lists = [
	        [word['word_text'] for word in clean_output]
	        for clean_output in clean_outputs
	    ]
	    boxes_lists = [
	        [word['word_box'] for word in clean_output]
	        for clean_output in clean_outputs
	    ]
	    return {
	        "image_path": valid_image_paths,
	        "bboxes": boxes_lists,
	        "words": word_lists,
	    }