# hf-similarity-check / extract_pdf.py
# Mitul Mohammad Abdullah Al Mukit
# update
# 9312707
################# cnocr ##################
from cnocr import CnOcr
def validate(text):
    """Return *text* with every space and comma removed.

    Used to normalise amounts such as ``"1, 234.50"`` into ``"1234.50"``.
    """
    dropped = (' ', ',')
    return ''.join(ch for ch in text if ch not in dropped)
def check_bank(text):
    """Return the first known bank keyword contained in *text*, else False.

    Spaces are removed from *text* before matching, so ``"hang seng"``
    matches the keyword ``"hangseng"``. Matching is case-sensitive; the
    keywords are all lower case.

    Args:
        text: The string to search.

    Returns:
        One of ``'bankofchina'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'``
        (the first one found, in that order), or ``False`` when none of
        them occurs in *text*.
    """
    compact = text.replace(' ', '')
    # NOTE(review): 'sc' is a plain substring test, so it also matches
    # unrelated words that contain it (e.g. "description").
    for bank in ('bankofchina', 'hangseng', 'hsbc', 'sc'):
        if bank in compact:
            return bank
    # Give up only after every keyword has been tried, not after the
    # first one fails to match.
    return False
def check_bank_name(img_path):
    """Guess the bank from a statement's file name or path.

    Each bank is recognised either by its short code or by the title its
    statements are saved under, for example:

    * boch     - "Consolidated Statement 2023-01-01"
    * hangseng - "Statement of Prestige Banking ..." or
                 "Statement of Preferred Banking 2023-03-07"
    * hsbc     - "Statement - HSBC One Account 2023-02-10"
    * sc       - "statementOfAccount 2023-02-01" (Standard Chartered)

    Args:
        img_path: File name or path; it is converted with ``str()``.

    Returns:
        The first matching code (``'boch'``, ``'hangseng'``, ``'hsbc'`` or
        ``'sc'``, tested in that order), or ``None`` when nothing matches.
    """
    # NOTE(review): this function reports Bank of China as 'boch', while
    # check_bank() reports it as 'bankofchina'.
    path_text = str(img_path)
    title_by_code = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    for code, title in title_by_code.items():
        if code in path_text or title in path_text:
            return code
    return None
def check_mr(text):
    """Strip a leading courtesy title (mr / ms / miss / mrs) from a name.

    When the first whitespace-separated word of *text* (compared in lower
    case) is one of the titles, the remaining words are returned lower-cased
    and concatenated with no separator. Otherwise *text* is returned
    unchanged.
    """
    tokens = text.lower().split()
    if not tokens or tokens[0] not in ('mr', 'ms', 'miss', 'mrs'):
        return text
    return ''.join(tokens[1:])
def _box_within(positions, left, top, right, bottom):
    """Return True when a detected box lies inside the given pixel rectangle.

    ``positions[0]`` and ``positions[2]`` are used as the two opposite
    corners of the box, with x at index 0 and y at index 1.
    """
    return (positions[0][0] >= left and positions[0][1] >= top
            and positions[2][0] <= right and positions[2][1] <= bottom)


def _strip_debit_marker(text):
    """Remove the 'dr' marker, spaces and commas from a debit amount."""
    return validate(text.replace('dr', ''))


def get_info_from_bank(img_path, file_name):
    """Extract name, address, date, assets and liabilities from a statement.

    The image is run through CnOcr and every detected text item is assigned
    to a field according to the fixed pixel rectangle its box falls into.
    All detected text is lower-cased before it is used.

    Args:
        img_path: Image handed to ``CnOcr.ocr``.
        file_name: File name handed to ``check_bank_name`` to pre-fill the
            bank; OCR text that matches ``check_bank`` overrides it.

    Returns:
        dict with the keys ``nameStatement`` (str), ``address`` (str),
        ``bank``, ``date`` (str), ``asset`` (float, sum of the amounts read
        from the asset rows) and ``liabilities`` (str, digits without the
        ``dr`` marker). The result is only returned; nothing is posted.
    """
    # NOTE(review): the recogniser is constructed on every call; consider
    # reusing one instance if this is called repeatedly.
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    bank_data = {
        "nameStatement": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": "",
    }

    # y-ranges of the asset and liabilities windows; both are adjusted
    # each time an asset amount is accepted.
    asset_y = [722, 747]
    liabilities_y = [747, 800]
    # NOTE(review): the `count <= asset_iterations` test below admits up to
    # three amounts (count 0, 1 and 2) - confirm that is intended.
    asset_iterations = 2
    count = 0
    invalid_list = ['', ' ', ',']

    for item in out:
        detected_text = item['text']
        positions = item['position']
        # Check for None *before* lower-casing; afterwards it is too late.
        if detected_text is None:
            continue
        raw_detected_text = detected_text.lower()
        if raw_detected_text in invalid_list:
            continue

        if _box_within(positions, 147, 265, 400, 295):  # name
            bank_data["nameStatement"] = check_mr(
                bank_data["nameStatement"] + raw_detected_text)
        elif _box_within(positions, 113, 291, 500, 381):  # address
            bank_data["address"] += raw_detected_text + ' '
        elif _box_within(positions, 996, 289, 1083, 314):  # statement date
            bank_data["date"] += raw_detected_text
        elif _box_within(positions, 900, asset_y[0], 1120, asset_y[1]):
            # The text is already lower case, so the debit marker must be
            # matched as 'dr'; matching 'DR' could never succeed and sent
            # debit amounts into float(), which raised ValueError.
            if 'dr' in raw_detected_text:
                bank_data["liabilities"] = _strip_debit_marker(raw_detected_text)
            elif count <= asset_iterations:  # asset
                try:
                    asset_float = float(raw_detected_text.replace(',', ''))
                except ValueError:
                    # Non-numeric OCR noise inside the asset window: skip
                    # this item instead of aborting the whole extraction.
                    continue
                bank_data["asset"] += asset_float
                # Move the asset window on to the next row and stretch the
                # liabilities window by the same amount.
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _box_within(positions, 900, liabilities_y[0], 1130, liabilities_y[1]):
            if 'dr' in raw_detected_text:  # liabilities
                bank_data["liabilities"] = _strip_debit_marker(raw_detected_text)
        else:
            bank = check_bank(raw_detected_text)
            if bank:  # bank
                bank_data["bank"] = bank

    return bank_data
########## Posting data through API ############
import requests
import data_encryption
# POST /api/v1/users HTTP/1.1
def post_data(bank, name, address, asset, liabilities, date):
    """Build the statement-verification payload and encrypt it.

    The payload combines fixed request metadata with the statement fields
    (``asset`` is converted with ``str()``) and is passed to
    ``data_encryption.encrypt``.

    NOTE(review): the encrypted payload is neither sent nor returned; the
    HTTP POST is currently disabled.
    """
    payload = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": 'request_1234',
        "userId": 'user1',
    }
    # Statement fields follow the request metadata, in this key order.
    payload.update({
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    })
    encrypted_data = data_encryption.encrypt(payload)
    # Disabled for now:
    # endpoint = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    # request = requests.post(url=endpoint, data=encrypted_data)
# def extract_pdf_data(img_path='hangseng_page-0001.jpg'):
# page_number = 1
# images = f'hangseng_page-000{page_number}.jpg'
# get_info_from_bank(img_path)