# Annas Dev - return json (commit ffd0bdc)
import csv
import os
import subprocess

import pandas as pd
def is_image(img_path):
    """Return True when *img_path* has a supported image file extension.

    The comparison is case-insensitive and accepts ``.jpg``, ``.jpeg`` and
    ``.png``. Paths that are rejected are reported on stdout.

    Args:
        img_path: Path (or file name) to check; only the extension is read.

    Returns:
        bool: True for a supported extension, False otherwise.
    """
    # Lower-case the extension so "PHOTO.JPG" is treated like "photo.jpg".
    ext = os.path.splitext(img_path)[1].lower()
    result = ext in ('.jpg', '.jpeg', '.png')
    if not result:
        print('NOT IMAGE: ', img_path)
    return result
def run_tesseract_on_image(image_path):  # -> tsv output path
    """Run the ``tesseract`` CLI on one image and return the TSV output path.

    The output is written to ``tmp/<image name without extension>.tsv``; the
    ``tmp`` directory is created if it does not exist yet.

    Args:
        image_path: Path of the image to OCR.

    Returns:
        str: Path of the TSV file produced by tesseract.

    Raises:
        ValueError: If tesseract cannot be started or exits with a non-zero
            status.
    """
    print('--- run tesseract on ', image_path)
    # splitext removes only the final extension and leaves dot-less names
    # intact (str.find('.') returned -1 there and chopped the last character).
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    output_base = f"tmp/{image_name}"
    os.makedirs("tmp", exist_ok=True)
    try:
        # Argument list, no shell: the path is never interpreted by a shell,
        # so quotes or metacharacters in file names cannot alter the command.
        completed = subprocess.run(
            ["tesseract", image_path, output_base, "-l", "eng", "tsv"],
            check=False,
        )
    except OSError as err:
        # Keep the exception type callers already handle when the binary is
        # missing or cannot be executed.
        raise ValueError(
            'Tesseract OCR Error please verify image format PNG,JPG,JPEG'
        ) from err
    if completed.returncode != 0:
        raise ValueError('Tesseract OCR Error please verify image format PNG,JPG,JPEG')
    return f"{output_base}.tsv"
def clean_tesseract_output(tsv_output_path):
    """Load a tesseract TSV file and return its non-empty words with boxes.

    Rows with missing values and rows whose text is blank are discarded.

    Args:
        tsv_output_path: Path of a tab-separated file with at least the
            columns ``left``, ``top``, ``width``, ``height`` and ``text``.

    Returns:
        list[dict]: One dict per remaining row, with ``word_text`` (the text)
        and ``word_box`` as ``[left, top, left + width, top + height]``.
    """
    print('clean tesseract output for: ', tsv_output_path)
    ocr_df = pd.read_csv(
        tsv_output_path,
        sep='\t',
        # OCR text may contain a lone '"'; with default quoting it would open
        # a quoted field and swallow the following rows.
        quoting=csv.QUOTE_NONE,
        # Keep words as strings even when every word looks numeric, so the
        # .str accessor below always works and "007" is not turned into 7.
        dtype={'text': str},
        # Only an empty field counts as missing; words such as "NA" or "null"
        # are real text and must not be dropped.
        keep_default_na=False,
        na_values=[''],
    )
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df[ocr_df['text'].str.strip() != '']

    words = []
    columns = (ocr_df['left'], ocr_df['top'], ocr_df['width'],
               ocr_df['height'], ocr_df['text'])
    for left, top, width, height, text in zip(*columns):
        # Plain Python ints keep the result JSON-serialisable.
        left, top = int(left), int(top)
        words.append({
            'word_text': text,
            'word_box': [left, top, left + int(width), top + int(height)],
        })
    return words
def prepare_batch_for_inference(image_paths):
    """OCR every image in *image_paths* and assemble an inference batch.

    Paths rejected by ``is_image`` are skipped. The three lists in the result
    are index-aligned: entry ``i`` of each describes the same image.

    Args:
        image_paths: Iterable of candidate image paths.

    Returns:
        dict: ``"image_path"`` (the paths that were actually processed),
        ``"bboxes"`` (one list of word boxes per image) and ``"words"``
        (one list of word strings per image).

    Raises:
        ValueError: Propagated from ``run_tesseract_on_image`` when OCR fails.
    """
    # Filter once and reuse the filtered list in the result; returning the
    # unfiltered input would misalign paths with their words/boxes whenever a
    # non-image path was skipped.
    valid_image_paths = [path for path in image_paths if is_image(path)]

    # tesseract_outputs is a list of TSV paths
    tesseract_outputs = [run_tesseract_on_image(path)
                         for path in valid_image_paths]
    print('tesseract has run on all images...')

    # clean_outputs is a list of lists of word dicts
    clean_outputs = [clean_tesseract_output(tsv_path)
                     for tsv_path in tesseract_outputs]
    word_lists = [[word['word_text'] for word in clean_output]
                  for clean_output in clean_outputs]
    boxes_lists = [[word['word_box'] for word in clean_output]
                   for clean_output in clean_outputs]

    return {
        "image_path": valid_image_paths,
        "bboxes": boxes_lists,
        "words": word_lists,
    }