import os
import json

import numpy as np
import pytesseract
from PIL import Image

PAD_TOKEN_BOX = [0, 0, 0, 0]
MAX_SEQ_LEN = 512
## Function: 1
## Purpose: Resize and align a bounding box to match a resized image
def resize_align_bbox(bbox, orig_w, orig_h, target_w, target_h):
    x_scale = target_w / orig_w
    y_scale = target_h / orig_h
    orig_left, orig_top, orig_right, orig_bottom = bbox
    x = int(np.round(orig_left * x_scale))
    y = int(np.round(orig_top * y_scale))
    xmax = int(np.round(orig_right * x_scale))
    ymax = int(np.round(orig_bottom * y_scale))
    return [x, y, xmax, ymax]
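# Worked example (illustrative numbers, not from this repo): scaling a box from a
# 200x100 page onto the 1000x1000 target used later in create_features:
#   resize_align_bbox([20, 10, 60, 30], 200, 100, 1000, 1000)  # -> [100, 100, 300, 300]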
## Function: 2
## Purpose: Read a JSON file from the given path and return it as a dictionary
def load_json_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data
## Function: 3
## Purpose: Get the path of the first file with a specific extension, e.g. .pdf, .tif, etc.
def get_specific_file(path, last_entry='tif'):
    for i in os.listdir(path):
        if i.endswith(last_entry):
            return os.path.join(path, i)
    return '-1'
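# Usage sketch (the directory name is hypothetical):
#   get_specific_file('/data/doc_001', 'tif')
# returns the path of the first file ending in 'tif', or the sentinel '-1' if none exists.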
## Function: 4
## Purpose: Tokenize each word and repeat its bounding box once per sub-word token
def get_tokens_with_boxes(unnormalized_word_boxes, list_of_words, tokenizer,
                          pad_token_id=0, pad_token_box=[0, 0, 0, 0], max_seq_len=512):
    '''
    This function returns two items:
    1. unnormalized_token_boxes -> a list of length max_seq_len containing the box for each
       token; a box repeats whenever the tokenizer splits a word into several sub-word tokens
    2. tokenized_words -> the token ids produced by the tokenizer for list_of_words
    '''
    assert len(unnormalized_word_boxes) == len(list_of_words), "Bounding box length != total words length"
    unnormalized_token_boxes = []
    tokenized_words = []
    for box, word in zip(unnormalized_word_boxes, list_of_words):
        current_tokens = tokenizer(word, add_special_tokens=False).input_ids
        unnormalized_token_boxes.extend([box] * len(current_tokens))
        tokenized_words.extend(current_tokens)
    # Pad both lists up to max_seq_len, then truncate in case the document is longer
    if len(unnormalized_token_boxes) < max_seq_len:
        unnormalized_token_boxes.extend([pad_token_box] * (max_seq_len - len(unnormalized_token_boxes)))
    if len(tokenized_words) < max_seq_len:
        tokenized_words.extend([pad_token_id] * (max_seq_len - len(tokenized_words)))
    return unnormalized_token_boxes[:max_seq_len], tokenized_words[:max_seq_len]
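# Usage sketch (assumes a Hugging Face tokenizer; the checkpoint name is illustrative,
# nothing in this file pins a particular model):
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained('bert-base-uncased')
#   boxes, ids = get_tokens_with_boxes([[0, 0, 10, 10], [12, 0, 30, 10]],
#                                      ['hello', 'tokenization'], tok)
#   # if 'tokenization' splits into two sub-word tokens, its box appears twice;
#   # both outputs are padded/truncated to length 512.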
## Function: 5
## Purpose: Helper used only by apply_ocr below; converts a Tesseract row to corner coordinates
def get_topleft_bottomright_coordinates(df_row):
    left, top, width, height = df_row["left"], df_row["top"], df_row["width"], df_row["height"]
    return [left, top, left + width, top + height]
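# Example: a row with left=5, top=10, width=20, height=15 maps to [5, 10, 25, 25],
# i.e. [x_min, y_min, x_max, y_max].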
## Function: 6
## Purpose: Extract the OCR with Tesseract when no OCR annotations are provided
def apply_ocr(tif_path):
    """
    Returns the words and their bounding boxes from an image
    """
    img = Image.open(tif_path).convert("RGB")
    ocr_df = pytesseract.image_to_data(img, output_type="data.frame")
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    float_cols = ocr_df.select_dtypes("float").columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    # Tesseract emits rows whose text is only whitespace; turn them into NaN and drop them
    ocr_df = ocr_df.replace(r"^\s*$", np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    words = list(ocr_df.text.apply(lambda x: str(x).strip()))
    actual_bboxes = ocr_df.apply(get_topleft_bottomright_coordinates, axis=1).values.tolist()
    # every word must come with exactly one box
    assert len(words) == len(actual_bboxes)
    return {"words": words, "bbox": actual_bboxes}
## Function: 7
## Purpose: Combine the functions above to produce the resized image, the token boxes and the token ids
def create_features(
    image_path,
    tokenizer,
    target_size=(1000, 1000),
    max_seq_length=512,
    use_ocr=False,
    bounding_box=None,
    words=None
):
    '''
    We assume the bounding boxes are given on the original image scale (i.e. not normalized),
    so they only need to be rescaled by the resize ratio.
    '''
    img = Image.open(image_path).convert("RGB")
    width_old, height_old = img.size
    img = img.resize(target_size)
    width, height = img.size

    if not use_ocr and (bounding_box is None or words is None):
        raise ValueError('Please provide the bounding boxes and words, or pass use_ocr=True')
    if use_ocr:
        entries = apply_ocr(image_path)
        bounding_box = entries["bbox"]
        words = entries["words"]

    ## Rescaling the bounding boxes as per the new image size
    bounding_box = list(map(lambda x: resize_align_bbox(x, width_old, height_old, width, height), bounding_box))
    boxes, tokenized_words = get_tokens_with_boxes(unnormalized_word_boxes=bounding_box,
                                                   list_of_words=words,
                                                   tokenizer=tokenizer,
                                                   pad_token_id=0,
                                                   pad_token_box=PAD_TOKEN_BOX,
                                                   max_seq_len=max_seq_length)
    return img, boxes, tokenized_words
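# End-to-end usage sketch (the image path and tokenizer checkpoint are illustrative;
# with use_ocr=True the Tesseract binary must be installed):
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained('bert-base-uncased')
#   img, boxes, token_ids = create_features('sample_page.tif', tok, use_ocr=True)
#   # img is the 1000x1000 RGB image; boxes and token_ids both have length 512.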