################# cnocr ##################
"""Extract key fields from a bank statement with CnOcr, PDFQuery and OpenAI.

The small string helpers (`validate`, `check_bank`, `check_bank_name`,
`check_mr`) are pure functions; `get_info_from_bank` runs OCR on an image,
reads text from a PDF and asks an OpenAI chat model to pull out the fields.
"""
import os
from typing import Optional, Union

import openai
from cnocr import CnOcr
from pdfquery import PDFQuery


def validate(text: str) -> str:
    """Return *text* with every space and comma removed."""
    invalid_list = [' ', ',']
    for char in invalid_list:
        text = text.replace(char, '')
    return text


def check_bank(text: str) -> Union[str, bool]:
    """Return the first known bank key found in *text*, or ``False``.

    Spaces are stripped from *text* before matching. Matching is
    case-sensitive and the keys are lowercase, so callers are presumably
    expected to pass lowercased text -- TODO confirm with callers.
    """
    compact = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in compact:
            return bank
    # Only report "no match" after every candidate has been tried; returning
    # from inside the loop would stop after testing just the first bank.
    return False


def check_bank_name(img_path) -> Optional[str]:
    """Guess the bank from a statement's file name.

    Returns the first key of the table below whose key or statement title
    occurs in ``str(img_path)``, or ``None`` when nothing matches.

    Example file names per bank:
      BOCH               - "Consolidated Statement 2023-01-01"
      HangSeng           - "Statement of Prestige Banking 2023-03-07" or
                           "Statement of Preferred Banking 2023-03-07"
      HSBC               - "Statement - HSBC One Account 2023-02-10"
      Standard Chartered - "statementOfAccount 2023-02-01"

    Note that the keys here ('boch', ...) differ from the ones returned by
    `check_bank` ('bankofchina', ...).
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    path_text = str(img_path)
    for bank_name, title in standard_names.items():
        if bank_name in path_text or title in path_text:
            return bank_name
    return None


def check_mr(text: str) -> str:
    """Strip a leading courtesy title (mr/ms/miss/mrs) from a name.

    When the first whitespace-separated word is a title (case-insensitive),
    the remaining words are returned lowercased and concatenated with no
    separator. Otherwise *text* is returned unchanged.
    """
    openings = ['mr', 'ms', 'miss', 'mrs']
    words = text.lower().split()
    if words and words[0] in openings:
        return ''.join(words[1:])
    return text


def _configure_openai_key() -> None:
    """Ensure ``openai.api_key`` is set, falling back to ``OPENAI_API_KEY``.

    Secrets must never live in source control; any key that was previously
    committed to this file should be treated as leaked and revoked.

    Raises:
        RuntimeError: if no key is configured anywhere.
    """
    if not getattr(openai, "api_key", None):
        openai.api_key = os.environ.get("OPENAI_API_KEY")
    if not openai.api_key:
        raise RuntimeError(
            "OpenAI API key is not configured; set the OPENAI_API_KEY "
            "environment variable."
        )


def get_info_from_bank(img_path, pdf_path):
    """Extract customer and balance details from a bank statement.

    The statement image is OCR'd with CnOcr, the text lines of the PDF are
    read with PDFQuery (``pdf.load(0)``), and both texts are sent to the
    ``gpt-3.5-turbo`` chat model with an extraction prompt.

    Args:
        img_path: image of the statement page, passed to ``CnOcr.ocr``.
        pdf_path: PDF of the statement, passed to ``PDFQuery``.

    Returns:
        The raw text content of the model's first choice. The prompt asks
        for a dictionary-style listing, but the reply is not parsed here.

    Raises:
        RuntimeError: if no OpenAI API key is configured.
    """
    # Fail fast on a missing key before paying for OCR and PDF parsing.
    _configure_openai_key()

    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    # Drop OCR fragments that are nothing but a lone space or comma.
    invalid_list = [' ', ',']
    data_set_1 = [item['text'] for item in out if item['text'] not in invalid_list]

    pdf = PDFQuery(pdf_path)
    pdf.load(0)
    text_elements = pdf.pq('LTTextLineHorizontal').text()
    text_elements = text_elements.replace("cid:", "")

    # Example of the reply shape the prompt aims for (not enforced):
    # {
    #     "Customer Name": "...",
    #     "Address": "...",
    #     "Bank Name": "HSBC",
    #     "Statement Issue Date": "10 January 2023",
    #     "Total Asset": "7,265.80",
    #     "Total Liability": "7,265.80"
    # }
    # NOTE: the prompt text below is kept verbatim; rewording it can change
    # the model's output.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an AI assistant for extracting data from bank statements. "
                    "Uppercase and lowercase letters are the same. "
                    "List results in a dictionary format."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Extract data from the following 2 sets of text: {data_set_1} and {text_elements}. "
                    "(1.) Data that locate in the front part of the text: customer full name, "
                    "address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), "
                    "bank name, bank statement issue date (verly likely to be within 1-2 years), "
                    "(2.) Data that mainly locate in the other part of the text: total asset "
                    "(including investments and deposits) and total liability (often contains DR "
                    "and includes credit card but might be zero) of the current month."
                ),
            },
        ],
    )

    bs_data = completion['choices'][0]['message']['content']
    print(bs_data)
    return bs_data


if __name__ == "__main__":
    # Guarded so that importing this module does not trigger OCR and a paid
    # API call. Other sample inputs used during development:
    # get_info_from_bank('hangseng_page-0001.jpg', 'hangseng.pdf')
    # get_info_from_bank('hsbc_one_account_page-0001.jpg', 'hsbc_one_account.pdf')
    # get_info_from_bank('boch_consolidated.jpg', 'boch_consolidated.pdf')
    get_info_from_bank('hsbc_one_account_page-10001.jpg', 'hsbc_one_account_page-10001.pdf')