File size: 5,755 Bytes
1f72938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
################# cnocr ##################
from cnocr import CnOcr

def validate(text):
    """Return *text* with every space and comma removed."""
    without_spaces = text.replace(' ', '')
    return without_spaces.replace(',', '')

def check_bank(text):
    """Return the first known bank keyword contained in *text*, else ``False``.

    Spaces are removed from *text* before matching, so ``"hang seng"`` matches
    the keyword ``"hangseng"``. The keywords are lowercase and the comparison
    is case-sensitive.

    Args:
        text: The string to search.

    Returns:
        The matching keyword (``'bankofchina'``, ``'hangseng'``, ``'hsbc'`` or
        ``'sc'``), or ``False`` when none of them occurs in *text*.
    """
    compact = text.replace(' ', '')
    # Order matters: 'sc' is a substring of 'hsbc', so 'hsbc' is tried first.
    # NOTE(review): 'sc' is short and will match any text containing it
    # (e.g. "description") - consider a stricter rule if that causes noise.
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in compact:
            return bank
    # Give up only after every keyword has been tried.
    return False
        
def check_bank_name(img_path):
    """Guess the bank key from a statement's file name or path.

    Each bank is recognised either by its short key or by the title prefix
    that appears in its statement file names, for example:

    * boch     - "Consolidated Statement 2023-01-01"
    * hangseng - "Statement of Prestige Banking ..." / "Statement of Preferred Banking ..."
    * hsbc     - "Statement - HSBC One Account 2023-02-10"
    * sc       - "statementOfAccount 2023-02-01"

    Returns the first matching key (checked in the order above), or ``None``
    when nothing matches. The comparison is case-sensitive.
    """
    path_text = str(img_path)
    known_banks = (
        ('boch', "Consolidated Statement"),
        ('hangseng', "Statement of"),
        ('hsbc', "Statement - HSBC One Account"),
        ('sc', "statementOfAccount"),
    )
    for bank_key, title_prefix in known_banks:
        if bank_key in path_text or title_prefix in path_text:
            return bank_key
    return None
        
def check_mr(text):
    """Strip a leading honorific (mr / ms / miss / mrs) from *text*.

    When the first whitespace-separated word, compared case-insensitively, is
    one of the honorifics, the remaining words are returned lowercased and
    concatenated without separators. Otherwise *text* is returned unchanged.
    """
    tokens = text.lower().split()
    if not tokens or tokens[0] not in ('mr', 'ms', 'miss', 'mrs'):
        return text
    return ''.join(tokens[1:])

def _box_within(positions, left, top, right, bottom):
    """Return whether a detection box lies inside the given pixel rectangle.

    ``positions[0]`` and ``positions[2]`` are used as opposite corners
    (presumably top-left and bottom-right - confirm against cnocr's output).
    """
    return (positions[0][0] >= left and positions[0][1] >= top
            and positions[2][0] <= right and positions[2][1] <= bottom)


def get_info_from_bank(img_path, file_name):
    """Run OCR on a bank-statement image and collect its key fields.

    Every detected text fragment is lowercased and assigned to a field
    according to the fixed pixel rectangle its bounding box falls into.
    NOTE(review): the rectangles look tuned to one statement layout and
    image resolution - confirm before using other inputs.

    Args:
        img_path: Image passed to ``CnOcr.ocr``.
        file_name: Name passed to ``check_bank_name`` for the initial bank guess.

    Returns:
        A dict with the keys ``name_on_bs``, ``address``, ``bank``, ``date``
        (strings), ``asset`` (float sum of the asset rows) and
        ``liabilities`` (string, ``""`` when none was found).

    Raises:
        ValueError: If a non-debit fragment in the asset area is not a number
            once its commas are removed.
    """
    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    bank_data = {
        "name_on_bs": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": ""
    }

    # Vertical windows for the asset and liabilities rows; they move down
    # each time an asset row is consumed.
    asset_y = [722, 747]
    asset_iterations = 2
    liabilities_y = [747, 800]
    count = 0
    invalid_list = ['', ' ', ',']

    for item in out:
        raw_detected_text = item['text'].lower()
        positions = item['position']

        if raw_detected_text in invalid_list:
            continue

        if _box_within(positions, 147, 265, 400, 295):  # name
            bank_data["name_on_bs"] += raw_detected_text
            bank_data["name_on_bs"] = check_mr(bank_data["name_on_bs"])
        elif _box_within(positions, 113, 291, 500, 381):  # address
            bank_data["address"] += raw_detected_text
            bank_data["address"] += ' '
        elif _box_within(positions, 996, 289, 1083, 314):  # statement date
            bank_data["date"] += raw_detected_text
        elif _box_within(positions, 900, asset_y[0], 1120, asset_y[1]):
            # The text is already lowercased, so the debit marker must be
            # matched as 'dr'. Matching 'DR' never succeeded and let debit
            # amounts reach float(), which raised.
            if 'dr' in raw_detected_text:
                bank_data["liabilities"] = validate(
                    raw_detected_text.replace('dr', ''))
            elif count <= asset_iterations:  # asset
                bank_data["asset"] += float(raw_detected_text.replace(',', ''))
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _box_within(positions, 900, liabilities_y[0], 1130, liabilities_y[1]):
            if 'dr' in raw_detected_text:  # liabilities
                bank_data["liabilities"] = validate(
                    raw_detected_text.replace('dr', ''))
        else:
            # Text outside every known box may still name the bank.
            detected_bank = check_bank(raw_detected_text)
            if detected_bank:
                bank_data["bank"] = detected_bank

    return bank_data

########## Posting data through API ############
import requests
import data_encryption
# POST /api/v1/users HTTP/1.1

def post_data(bank, name, address, asset, liabilities, date):
    """Build the statement-verification payload and encrypt it.

    The fields are packed into a dict (``asset`` is converted to ``str``)
    and passed to ``data_encryption.encrypt``. The HTTP request itself is
    currently disabled, so nothing is sent and the function returns ``None``.
    """
    # Fixed request envelope.
    payload = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": 'request_1234',
        "userId": 'user1',
    }
    # Values extracted from the statement.
    payload.update({
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    })

    encrypted_payload = data_encryption.encrypt(payload)

    # Sending is disabled. The original call was:
    # endpoint = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    # request = requests.post(url=endpoint, data=encrypted_payload)

# def extract_pdf_data(img_path='hangseng_page-0001.jpg'):
#     page_number = 1
#     images = f'hangseng_page-000{page_number}.jpg'
#     get_info_from_bank(img_path)