# NOTE: page-header residue from the hosting site ("Spaces: Sleeping"); not part of the program.
################# cnocr ##################
import os

import openai
from cnocr import CnOcr
from pdfquery import PDFQuery
def validate(text: str) -> str:
    """Return *text* with every space and comma removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` that contains no ``' '`` and no ``','`` characters.
        All other characters (including tabs and newlines) are left untouched.
    """
    # One pass that deletes both characters, instead of chained replace() calls.
    return text.translate(str.maketrans('', '', ' ,'))
def check_bank(text: str):
    """Find the first known bank identifier contained in *text*.

    Spaces are stripped from ``text`` before matching, so ``"hang seng"``
    matches the identifier ``"hangseng"``. Matching is a case-sensitive
    substring test against lowercase identifiers.

    Args:
        text: Text to search.

    Returns:
        The first identifier from ``bank_list`` (in list order) that occurs
        in the space-stripped text, or ``False`` when none of them occurs.
    """
    text = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in text:
            return bank
    # Report "no match" only after every candidate has been tried; returning
    # False from inside the loop would stop after the first candidate.
    return False
def check_bank_name(img_path):
    """Guess the issuing bank from a statement's file name or path.

    Example statement names used to derive the patterns below:
      * BOCH - "Consolidated Statement 2023-01-01"
      * HangSeng - "Statement of Prestige Banking 2023-03-0" or
        "Statement of Preferred Banking 2023-03-07"
      * HSBC - "Statement - HSBC One Account 2023-02-10"
      * Standard Chartered - "statementOfAccount 2023-02-01"

    Args:
        img_path: Path of the statement file; it is converted with ``str()``
            before matching, so any object with a useful ``str()`` works.

    Returns:
        The first key of ``standard_names`` (in insertion order) for which
        either the key itself or its associated title fragment is a
        case-sensitive substring of the path, or ``None`` when nothing matches.
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    path_text = str(img_path)  # convert once rather than on every comparison
    for bank_name, title_fragment in standard_names.items():
        if bank_name in path_text or title_fragment in path_text:
            return bank_name
    return None
def check_mr(text: str) -> str:
    """Strip a leading courtesy title (mr / ms / miss / mrs) from *text*.

    Args:
        text: A name, possibly prefixed by a title.

    Returns:
        If the first whitespace-separated word, compared case-insensitively,
        is one of the known titles: the remaining words, lowercased and
        concatenated with no separator (``"MR CHIU CHUNG YIN"`` becomes
        ``"chiuchungyin"``). Otherwise ``text`` is returned unchanged,
        including its original case and spacing.
    """
    openings = ['mr', 'ms', 'miss', 'mrs']
    words = text.lower().split()
    # Guard clause: empty input or no recognised title leaves the text as-is.
    if not words or words[0] not in openings:
        return text
    return ''.join(words[1:])
def get_info_from_bank(img_path, pdf_path):
    """Extract key bank-statement fields from an image and its PDF via an LLM.

    The statement image is read with CnOcr, the text of the first PDF page is
    read with PDFQuery, and both texts are sent to the OpenAI chat completion
    API with a prompt asking for: customer name, Hong Kong address, bank name,
    statement issue date, total asset and total liability.

    Args:
        img_path: Path to an image of the statement page, passed to
            ``CnOcr.ocr``.
        pdf_path: Path to the statement PDF; only page index 0 is loaded.

    Returns:
        The model's reply as a string (the ``content`` of the first choice).
        The prompt asks for a dictionary-like listing, but the reply is
        returned as-is and is not parsed or validated here.

    Raises:
        RuntimeError: If no OpenAI API key is configured, either already set
            on ``openai.api_key`` or available as ``OPENAI_API_KEY``.
    """
    # SECURITY: the API key must come from the environment, never from source.
    # A key that was ever committed here is compromised and should be revoked.
    if not getattr(openai, "api_key", None):
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "OpenAI API key not configured; set the OPENAI_API_KEY "
                "environment variable."
            )
        openai.api_key = api_key

    # Run the OCR model on the statement image.
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    # Drop OCR fragments that consist solely of a space or a comma.
    invalid_list = [' ', ',']
    data_set_1 = [item['text'] for item in out if item['text'] not in invalid_list]

    # Read the text lines of the first PDF page. The file is opened here so
    # the handle is closed deterministically once the text has been extracted.
    with open(pdf_path, "rb") as pdf_file:
        pdf = PDFQuery(pdf_file)
        pdf.load(0)
        text_elements = pdf.pq('LTTextLineHorizontal').text()
    text_elements = text_elements.replace("cid:", "")

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=[
            {"role": "system", "content": "You are an AI assistant for extracting data from bank statements. Uppercase and lowercase letters are the same. List results in a dictionary format."},
            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1} and {text_elements}. (1.) Data that locate in the front part of the text: customer full name, address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
        ],
    )
    bs_data = completion['choices'][0]['message']['content']
    print(bs_data)
    return bs_data
# Run the sample extraction only when this file is executed as a script, so
# that importing the module does not trigger OCR and a paid API request.
if __name__ == "__main__":
    # Other sample inputs kept for manual testing:
    # get_info_from_bank('hangseng_page-0001.jpg','hangseng.pdf')
    # get_info_from_bank('hsbc_one_account_page-0001.jpg','hsbc_one_account.pdf')
    # get_info_from_bank('boch_consolidated.jpg','boch_consolidated.pdf')
    get_info_from_bank('hsbc_one_account_page-10001.jpg','hsbc_one_account_page-10001.pdf')