import csv
import os
import subprocess

import pandas as pd

def is_image(img_path):
  """Return True when *img_path* has a supported image file extension.

  The comparison is case-insensitive, and ``.jpeg`` is accepted alongside
  ``.jpg`` and ``.png``. Only the extension is inspected; the file itself is
  never opened.

  Args:
    img_path: Path (or bare file name) to check.

  Returns:
    True if the extension is ``.jpg``, ``.jpeg`` or ``.png``; False
    otherwise. A rejected path is also reported on stdout.
  """
  # Lower-case the extension so that e.g. "SCAN.JPG" is not rejected.
  ext = os.path.splitext(img_path)[1].lower()
  result = ext in ('.jpg', '.jpeg', '.png')
  if not result:
    print('NOT IMAGE: ', img_path)
  return result

def run_tesseract_on_image(image_path, output_dir="/content"):
  """Run the Tesseract CLI on one image and return the TSV output path.

  Tesseract is invoked as
  ``tesseract <image_path> <output_dir>/<name> -l eng tsv`` where ``<name>``
  is the image file name without its final extension. Tesseract appends
  ``.tsv`` to the output base itself.

  Args:
    image_path: Path of the image to OCR.
    output_dir: Directory that receives the TSV file. Defaults to
      ``/content``, the location that was previously hard-coded.

  Returns:
    The path of the TSV file, ``<output_dir>/<name>.tsv``.

  Raises:
    ValueError: If the ``tesseract`` executable cannot be started or exits
      with a non-zero status.
  """
  print('--- run tesseract on ', image_path)
  # splitext removes only the last extension: names without a dot are kept
  # whole, and "a.v1.png" / "a.v2.png" no longer map to the same output.
  image_name = os.path.splitext(os.path.basename(image_path))[0]
  output_base = f"{output_dir}/{image_name}"
  try:
    # An argument list (no shell) keeps quotes, spaces and shell
    # metacharacters in the path from being interpreted as commands.
    completed = subprocess.run(
        ["tesseract", image_path, output_base, "-l", "eng", "tsv"],
        check=False,
    )
  except OSError as err:
    # A missing executable must surface as the same exception type that a
    # failed OCR run raises, so callers need only one except clause.
    raise ValueError(
        'Tesseract OCR Error please verify image format PNG,JPG,JPEG') from err
  if completed.returncode != 0:
    raise ValueError('Tesseract OCR Error please verify image format PNG,JPG,JPEG')
  return f"{output_base}.tsv"


def clean_tesseract_output(tsv_output_path):
  """Parse a Tesseract TSV file into a list of words with bounding boxes.

  Rows that have a missing or empty cell in any column, or whose text is
  only whitespace, are discarded. The remaining rows are returned in file
  order.

  Args:
    tsv_output_path: Path of a TSV file with at least the columns ``left``,
      ``top``, ``width``, ``height`` and ``text``.

  Returns:
    A list of dicts, one per kept row, with keys ``word_text`` (the text as
    a string) and ``word_box`` (``[left, top, left + width, top + height]``).

  Raises:
    ValueError: If a box column of a kept row is not numeric.
  """
  print('clean tesseract output for: ', tsv_output_path)
  ocr_df = pd.read_csv(
      tsv_output_path,
      sep='\t',
      # OCR text may contain a bare '"'; it must not start a quoted field.
      quoting=csv.QUOTE_NONE,
      # Read every cell as a string so numeric-looking words ("007") keep
      # their exact spelling and the .str accessor below always works.
      dtype=str,
      # Words such as "NA", "null" or "None" are text, not missing values.
      keep_default_na=False,
  )
  # Drop rows with a missing (short row) or empty cell in any column.
  missing = ocr_df.isna() | (ocr_df == '')
  ocr_df = ocr_df.loc[~missing.any(axis=1)]
  ocr_df = ocr_df.loc[ocr_df['text'].str.strip() != '']

  # Box columns were read as strings; convert them before doing arithmetic.
  lefts = pd.to_numeric(ocr_df['left'])
  tops = pd.to_numeric(ocr_df['top'])
  rights = lefts + pd.to_numeric(ocr_df['width'])
  bottoms = tops + pd.to_numeric(ocr_df['height'])

  # tolist() yields plain Python scalars rather than numpy types.
  return [
      {'word_text': text, 'word_box': [left, top, right, bottom]}
      for text, left, top, right, bottom in zip(
          ocr_df['text'].tolist(),
          lefts.tolist(),
          tops.tolist(),
          rights.tolist(),
          bottoms.tolist(),
      )
  ]


def prepare_batch_for_inference(image_paths):
  """OCR every image in *image_paths* and assemble an inference batch.

  Paths rejected by ``is_image`` are skipped. The three lists in the
  returned dict are index-aligned: entry ``i`` of each one describes the
  same image.

  Args:
    image_paths: Iterable of candidate image paths.

  Returns:
    A dict with keys ``image_path`` (the paths that were processed),
    ``bboxes`` (one list of word boxes per image) and ``words`` (one list
    of word strings per image).
  """
  # Filter first so the returned paths stay aligned with bboxes/words;
  # returning the unfiltered input would shift every entry after a
  # skipped non-image path.
  valid_image_paths = [
      image_path for image_path in image_paths if is_image(image_path)]

  # tesseract_outputs is a list of TSV paths, one per valid image
  tesseract_outputs = [run_tesseract_on_image(image_path)
                       for image_path in valid_image_paths]
  print('tesseract has run on all images...')

  # clean_outputs is a list of lists of word dicts
  clean_outputs = [clean_tesseract_output(tsv_path)
                   for tsv_path in tesseract_outputs]
  word_lists = [[word['word_text'] for word in clean_output]
                for clean_output in clean_outputs]
  boxes_lists = [[word['word_box'] for word in clean_output]
                 for clean_output in clean_outputs]

  inference_batch = {
      "image_path": valid_image_paths,
      "bboxes": boxes_lists,
      "words": word_lists,
  }
  print('inference_batch:', inference_batch)
  return inference_batch