Spaces:
Runtime error
Runtime error
import os | |
import pandas as pd | |
def is_image(img_path):
    """Return True when ``img_path`` has a supported image extension.

    The check looks only at the file extension, not at the file content.
    It is case-insensitive and accepts ``.jpg``, ``.jpeg`` and ``.png``,
    matching the formats named in the Tesseract error message of this module.

    Args:
        img_path: Path (or bare file name) to inspect.

    Returns:
        bool: True for a supported extension, False otherwise. A rejected
        path is also reported on stdout.
    """
    supported_extensions = ('.jpg', '.jpeg', '.png')
    # Lower-case the extension so that "SCAN.JPG" is treated like "scan.jpg".
    ext = os.path.splitext(img_path)[1].lower()
    result = ext in supported_extensions
    if not result:
        print('NOT IMAGE: ', img_path)
    return result
def run_tesseract_on_image(image_path):  # -> tsv output path
    """Run the ``tesseract`` CLI on one image and return the TSV output path.

    The command executed is equivalent to
    ``tesseract <image_path> tmp/<image_name> -l eng tsv`` where
    ``<image_name>`` is the image's base name without its final extension.

    Args:
        image_path: Path of the image to OCR.

    Returns:
        str: ``"tmp/<image_name>.tsv"``, the output path requested from
        tesseract.

    Raises:
        ValueError: If tesseract cannot be started or exits with a non-zero
            status.
    """
    # Imported locally so this function does not depend on the module's
    # import section being changed.
    import subprocess

    print('--- run tesseract on ', image_path)
    # splitext removes only the last extension, so "a.b.png" -> "a.b" rather
    # than being cut at the first dot (which made "a.b.png" and "a.c.png"
    # collide on the same output file).
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    output_base = f"tmp/{image_name}"
    # The output directory must exist before tesseract can write into it.
    os.makedirs("tmp", exist_ok=True)

    error_message = 'Tesseract OCR Error please verify image format PNG,JPG,JPEG'
    try:
        # An argument list (no shell) keeps paths containing quotes, spaces or
        # shell metacharacters from breaking or injecting into the command.
        completed = subprocess.run(
            ["tesseract", image_path, output_base, "-l", "eng", "tsv"],
            check=False,
        )
    except OSError as err:
        # E.g. the tesseract executable is missing; keep the exception type
        # callers already handle.
        raise ValueError(error_message) from err

    if completed.returncode != 0:
        raise ValueError(error_message)
    return f"{output_base}.tsv"
def clean_tesseract_output(tsv_output_path):
    """Parse a Tesseract TSV file into a list of words with bounding boxes.

    Rows with missing values and rows whose ``text`` is empty or only
    whitespace are discarded. Each remaining row becomes one word entry.

    Args:
        tsv_output_path: Path of a tab-separated file that has at least the
            columns ``left``, ``top``, ``width``, ``height`` and ``text``.

    Returns:
        list[dict]: One dict per kept row, in file order, with keys
        ``'word_text'`` (the ``text`` value as a string) and ``'word_box'``
        (``[left, top, left + width, top + height]``).
    """
    # Imported locally so this function does not depend on the module's
    # import section being changed.
    import csv

    print('clean tesseract output for: ', tsv_output_path)
    ocr_df = pd.read_csv(
        tsv_output_path,
        sep='\t',
        # OCR tokens may contain a bare double quote; with default quoting the
        # parser would treat it as the start of a quoted field and swallow the
        # following rows.
        quoting=csv.QUOTE_NONE,
        # Keep tokens such as "2023" as text instead of letting the column be
        # inferred as numeric (which would break the .str accessor below).
        dtype={'text': str},
        # Keep tokens such as "NA" or "null" instead of turning them into NaN
        # and silently dropping real words.
        keep_default_na=False,
    )
    # Drops rows that are still incomplete (e.g. truncated lines).
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df[ocr_df['text'].str.strip() != '']

    # tolist() yields plain Python scalars, same as row-wise iteration would.
    lefts = ocr_df['left'].tolist()
    tops = ocr_df['top'].tolist()
    widths = ocr_df['width'].tolist()
    heights = ocr_df['height'].tolist()
    texts = ocr_df['text'].tolist()

    return [
        {
            'word_text': text,
            'word_box': [left, top, left + width, top + height],
        }
        for text, left, top, width, height in zip(texts, lefts, tops, widths, heights)
    ]
def prepare_batch_for_inference(image_paths):
    """OCR every image in ``image_paths`` and assemble an inference batch.

    Paths rejected by ``is_image`` are skipped. Each remaining path is run
    through ``run_tesseract_on_image`` and the resulting TSV is parsed with
    ``clean_tesseract_output``.

    Args:
        image_paths: Iterable of candidate image paths.

    Returns:
        dict: Three parallel lists with one entry per processed image:
        ``"image_path"`` (the paths that were OCR'd), ``"bboxes"`` (a list of
        word boxes per image) and ``"words"`` (a list of word strings per
        image).

    Raises:
        ValueError: Propagated from ``run_tesseract_on_image`` when OCR fails
            for an image.
    """
    # Filter once and reuse the filtered list in the result. Returning the
    # unfiltered input would leave "image_path" longer than "bboxes"/"words"
    # (and misaligned with them) whenever a non-image path is skipped.
    valid_image_paths = [
        image_path for image_path in image_paths if is_image(image_path)
    ]

    # tesseract_outputs is a list of TSV paths, one per valid image
    tesseract_outputs = [
        run_tesseract_on_image(image_path) for image_path in valid_image_paths
    ]
    print('tesseract has run on all images...')

    # clean_outputs is a list of lists of word dicts
    clean_outputs = [
        clean_tesseract_output(tsv_path) for tsv_path in tesseract_outputs
    ]
    word_lists = [
        [word['word_text'] for word in clean_output]
        for clean_output in clean_outputs
    ]
    boxes_lists = [
        [word['word_box'] for word in clean_output]
        for clean_output in clean_outputs
    ]

    return {
        "image_path": valid_image_paths,
        "bboxes": boxes_lists,
        "words": word_lists,
    }