# hf-similarity-check / extraction_data.py
# Author: Mitul Mohammad Abdullah Al Mukit
# Commit: "first commit" (1f72938)
# File size: 3.88 kB
# (Header captured from the repository web view: raw / history / blame.)
################# cnocr ##################
from cnocr import CnOcr
from pdfquery import PDFQuery
import openai
def validate(text):
    """Return *text* with every space and comma removed.

    Useful for normalizing values such as ``"7,265.80"`` before they are
    compared or converted.
    """
    without_spaces = text.replace(' ', '')
    return without_spaces.replace(',', '')
def check_bank(text):
    """Return the first known bank identifier found in *text*.

    Spaces are removed from *text* before matching, so ``"hang seng"``
    matches ``"hangseng"``. Matching is a case-sensitive substring test
    against lowercase identifiers; presumably callers pass lowercased
    text (TODO confirm against callers).

    Args:
        text: Free text that may mention a bank.

    Returns:
        One of ``'bankofchina'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'``
        (the first one, in that order, that occurs in the text), or
        ``False`` when none of them occurs.
    """
    compact = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in compact:
            return bank
    # Only report "no bank" after every candidate has been tried; bailing
    # out inside the loop would make every bank but the first undetectable.
    return False
def check_bank_name(img_path):
    """Guess which bank a statement file belongs to from its path.

    Each bank is recognised either by its short key or by the title its
    statements usually carry in the file name:

    * BOCH - "Consolidated Statement 2023-01-01"
    * HangSeng - "Statement of Prestige Banking 2023-03-0" or
      "Statement of Preferred Banking 2023-03-07"
    * HSBC - "Statement - HSBC One Account 2023-02-10"
    * Standard Chartered - "statementOfAccount 2023-02-01"

    Args:
        img_path: Path (or any object whose ``str()`` is the path) of the
            statement file.

    Returns:
        ``'boch'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'`` for the first
        bank, in that order, whose key or title occurs in the path;
        ``None`` if nothing matches.
    """
    path_text = str(img_path)
    # (short key, statement title) pairs, checked in this order.
    known_banks = (
        ('boch', "Consolidated Statement"),
        ('hangseng', "Statement of"),
        ('hsbc', "Statement - HSBC One Account"),
        ('sc', "statementOfAccount"),
    )
    for key, title in known_banks:
        if key in path_text or title in path_text:
            return key
    return None
def check_mr(text):
    """Strip a leading courtesy title (mr, ms, miss, mrs) from a name.

    When the first whitespace-separated word is one of those titles
    (compared case-insensitively), the remaining words are returned
    lowercased and concatenated without spaces. Otherwise *text* is
    returned exactly as given -- note that this path does not lowercase
    or remove spaces.
    """
    tokens = text.lower().split()
    has_title = bool(tokens) and tokens[0] in ('mr', 'ms', 'miss', 'mrs')
    if not has_title:
        return text
    return ''.join(tokens[1:])
def get_info_from_bank(img_path, pdf_path):
    """Extract key fields from a bank statement via OCR, PDF text and ChatGPT.

    The statement image is read with CnOcr and page 0 of the PDF is read
    with PDFQuery. Both texts are sent to the OpenAI chat completion API,
    which is asked for the customer name, address, bank name, statement
    issue date, total asset and total liability.

    The original author's notes give this example of the data being
    sought (the model's reply is free text and is not validated here)::

        {
            "Customer Name": "MR CHIU CHUNG YIN",
            "Address": "FLAT 13,8/F,OILOK HOUSE, YAU OI ESTATE, TUEN MUN NT",
            "Bank Name": "HSBC",
            "Statement Issue Date": "10 January 2023",
            "Total Asset": "7,265.80",
            "Total Liability": "7,265.80"
        }

    Args:
        img_path: Image of the statement, passed to ``CnOcr.ocr``.
        pdf_path: PDF of the statement, passed to ``PDFQuery``.

    Returns:
        The message content of the first completion choice. It is also
        printed to stdout.

    Raises:
        RuntimeError: If no OpenAI API key is configured.
    """
    import os  # local import: only needed to read the key from the environment

    # Credentials must not live in source control. Honour a key the
    # application has already configured, otherwise fall back to the
    # OPENAI_API_KEY environment variable.
    # NOTE(review): the key that used to be hard-coded here has been
    # published with the repository and should be revoked.
    if not getattr(openai, "api_key", None):
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            # Fail before the expensive OCR step rather than after it.
            raise RuntimeError(
                "OpenAI API key is not configured; set the OPENAI_API_KEY "
                "environment variable."
            )
        openai.api_key = api_key

    # Text set 1: OCR of the statement image.
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    ocr_items = ocr.ocr(img_path)
    # Drop items whose whole text is just a lone space or comma.
    noise = (' ', ',')
    data_set_1 = [item['text'] for item in ocr_items if item['text'] not in noise]

    # Text set 2: text lines of PDF page 0, with "cid:" markers removed.
    pdf = PDFQuery(pdf_path)
    pdf.load(0)
    text_elements = pdf.pq('LTTextLineHorizontal').text().replace("cid:", "")

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=[
            {"role": "system", "content": "You are an AI assistant for extracting data from bank statements. Uppercase and lowercase letters are the same. List results in a dictionary format."},
            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1} and {text_elements}. (1.) Data that locate in the front part of the text: customer full name, address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
        ],
    )
    bs_data = completion['choices'][0]['message']['content']
    print(bs_data)
    return bs_data
# Manual run against a sample statement. Guarded so that importing this
# module for its helpers does not trigger OCR and an OpenAI API call.
if __name__ == "__main__":
    # Other sample statements:
    # get_info_from_bank('hangseng_page-0001.jpg','hangseng.pdf')
    # get_info_from_bank('hsbc_one_account_page-0001.jpg','hsbc_one_account.pdf')
    # get_info_from_bank('boch_consolidated.jpg','boch_consolidated.pdf')
    get_info_from_bank('hsbc_one_account_page-10001.jpg','hsbc_one_account_page-10001.pdf')