################# cnocr ##################
try:
    from cnocr import CnOcr
except ImportError:
    # The OCR backend is only required by get_info_from_bank(); the text
    # helpers below stay importable and usable without it.
    CnOcr = None


def validate(text):
    """Return *text* with every space and comma removed."""
    return text.replace(' ', '').replace(',', '')


def check_bank(text):
    """Return the first known bank key contained in *text*, else ``False``.

    Spaces are removed from *text* before matching. Matching is
    case-sensitive, so callers are expected to pass lower-cased text.

    NOTE(review): 'sc' is a plain substring test, so any text that contains
    those two letters (e.g. "description") is reported as 'sc' -- confirm
    whether a stricter token is wanted.
    """
    compact = text.replace(' ', '')
    for bank in ('bankofchina', 'hangseng', 'hsbc', 'sc'):
        if bank in compact:
            return bank
    # Only report "no bank" after every candidate has been tried.
    return False


def check_bank_name(img_path):
    """Guess the bank key from a statement file name or path.

    Known statement titles:
        BOCH               - "Consolidated Statement 2023-01-01"
        HangSeng           - "Statement of Prestige Banking 2023-03-0" or
                             "Statement of Preferred Banking 2023-03-07"
        HSBC               - "Statement - HSBC One Account 2023-02-10"
        Standard Chartered - "statementOfAccount 2023-02-01"

    Returns the matching key ('boch', 'hangseng', 'hsbc' or 'sc'), or
    ``None`` when neither a key nor a title occurs in ``str(img_path)``.

    NOTE(review): the key for Bank of China here is 'boch' while
    check_bank() uses 'bankofchina' -- confirm which one consumers expect.
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    path_text = str(img_path)
    for bank_name, title in standard_names.items():
        if bank_name in path_text or title in path_text:
            return bank_name
    return None


def check_mr(text):
    """Strip a leading title (mr/ms/miss/mrs) from *text*.

    When the first whitespace-separated word (case-insensitive) is a title,
    the remaining words are returned lower-cased and joined WITHOUT
    separators; otherwise *text* is returned unchanged.

    NOTE(review): joining without spaces looks deliberate (the caller also
    concatenates name fragments without spaces) -- confirm before changing.
    """
    openings = ('mr', 'ms', 'miss', 'mrs')
    words = text.lower().split()
    if words and words[0] in openings:
        return ''.join(words[1:])
    return text


def _inside(positions, left, top, right, bottom):
    """Return True when an OCR box lies within the given pixel rectangle.

    ``positions[0]`` and ``positions[2]`` are compared against the
    (left, top) and (right, bottom) bounds respectively -- presumably the
    top-left and bottom-right corners of the detected box; verify against
    the OCR library's output format.
    """
    return bool(
        positions[0][0] >= left
        and positions[0][1] >= top
        and positions[2][0] <= right
        and positions[2][1] <= bottom
    )


def _liability_amount(text):
    """Return a lower-cased liability string without 'dr', spaces or commas."""
    return validate(text.replace('dr', ''))


def get_info_from_bank(img_path, file_name):
    """Run OCR on a bank statement image and collect the fields of interest.

    Each recognised text item is lower-cased and assigned to a field by the
    fixed pixel region its box falls in (name, address, statement date,
    asset amounts, liabilities). Items outside every region are scanned for
    a bank name.

    Args:
        img_path: image passed to ``CnOcr.ocr``.
        file_name: statement file name, used for the initial bank guess via
            check_bank_name().

    Returns:
        dict with keys "name_on_bs", "address", "bank", "date" (strings, or
        ``None``/``False``-free bank key), "asset" (float sum of the asset
        rows) and "liabilities" (string).

    Raises:
        ImportError: if the ``cnocr`` package is not installed.
        ValueError: if a text item in the asset region is not a number.
    """
    if CnOcr is None:
        raise ImportError(
            "cnocr must be installed to extract data from a bank statement"
        )

    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    # Data
    bank_data = {
        "name_on_bs": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": "",
    }

    # Vertical windows (pixels) of the asset and liabilities rows. They are
    # widened/shifted after every asset row that is consumed.
    asset_y = [722, 747]
    liabilities_y = [747, 800]
    asset_iterations = 2
    count = 0

    for item in out:
        detected_text = item['text']
        # Guard before calling .lower(): a missing text must be skipped,
        # not dereferenced.
        if detected_text is None:
            continue
        text = detected_text.lower()
        if text in ('', ' ', ','):
            continue
        positions = item['position']

        # The region order matters: the name and address boxes overlap, and
        # so do the asset and liabilities windows once they have grown.
        if _inside(positions, 147, 265, 400, 295):
            # name
            bank_data["name_on_bs"] = check_mr(bank_data["name_on_bs"] + text)
        elif _inside(positions, 113, 291, 500, 381):
            # address
            bank_data["address"] += text + ' '
        elif _inside(positions, 996, 289, 1083, 314):
            # statement date
            bank_data["date"] += text
        elif _inside(positions, 900, asset_y[0], 1120, asset_y[1]):
            # The text is lower-cased, so the debit marker must be matched
            # as 'dr'; a debit amount is a liability, never an asset.
            if 'dr' in text:
                bank_data["liabilities"] = _liability_amount(text)
            elif count <= asset_iterations:
                # asset
                bank_data["asset"] += float(text.replace(',', ''))
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _inside(positions, 900, liabilities_y[0], 1130, liabilities_y[1]):
            # liabilities
            if 'dr' in text:
                bank_data["liabilities"] = _liability_amount(text)
        else:
            # bank
            bank = check_bank(text)
            if bank:
                bank_data["bank"] = bank

    return bank_data
########## Posting data through API ############
import requests
import data_encryption


# POST /api/v1/users HTTP/1.1
def post_data(bank, name, address, asset, liabilities, date, *,
              request_id='request_1234', user_id='user1'):
    """Build and encrypt the statement-verification payload.

    The HTTP POST itself is currently disabled (see the commented-out
    request below), so nothing is sent over the network; the encrypted
    payload is returned so the caller can use it.

    Args:
        bank: bank key of the statement.
        name: account holder name as read from the statement.
        address: address as read from the statement.
        asset: total assets; converted with ``str()`` for the payload.
        liabilities: liabilities value, passed through unchanged.
        date: statement date, passed through unchanged.
        request_id: value for the "requestId" field.
        user_id: value for the "userId" field.

    Returns:
        Whatever ``data_encryption.encrypt`` returns for the payload dict.
    """
    # endpoint = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    data = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": request_id,
        "userId": user_id,
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    }
    encrypted_data = data_encryption.encrypt(data)
    # request = requests.post(url=endpoint, data=encrypted_data)
    return encrypted_data


# def extract_pdf_data(img_path='hangseng_page-0001.jpg'):
#     page_number = 1
#     images = f'hangseng_page-000{page_number}.jpg'
#     get_info_from_bank(img_path)