Spaces:
Running
Running
updated model to extract bank_name and cheque_date
Browse files- predict_cheque_parser.py +25 -23
predict_cheque_parser.py
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
|
|
|
|
2 |
from word2number import w2n
|
3 |
from dateutil import relativedelta
|
4 |
from datetime import datetime
|
5 |
from word2number import w2n
|
6 |
-
from textblob import Word
|
7 |
from PIL import Image
|
8 |
import torch
|
9 |
import re
|
10 |
|
11 |
-
CHEQUE_PARSER_MODEL = "shivi/donut-
|
12 |
-
TASK_PROMPT = "<
|
13 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
14 |
|
15 |
def load_donut_model_and_processor():
|
@@ -21,7 +22,6 @@ def load_donut_model_and_processor():
|
|
21 |
def prepare_data_using_processor(donut_processor,image_path):
|
22 |
## Pass image through donut processor's feature extractor and retrieve image tensor
|
23 |
image = load_image(image_path)
|
24 |
-
print("type image:", type(image))
|
25 |
pixel_values = donut_processor(image, return_tensors="pt").pixel_values
|
26 |
pixel_values = pixel_values.to(device)
|
27 |
|
@@ -70,28 +70,31 @@ def parse_cheque_with_donut(input_image_path):
|
|
70 |
|
71 |
payee_name = cheque_details_json['cheque_details'][2]['payee_name']
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
stale_cheque = check_if_cheque_is_stale(cheque_date)
|
77 |
|
78 |
-
return payee_name,amt_in_words,amt_in_figures,cheque_date,macthing_amts,stale_cheque
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
words = [word.lower() for word in words]
|
84 |
-
for word in words:
|
85 |
-
word = Word(word)
|
86 |
-
corrected_word = word.correct()+' '
|
87 |
-
corrected_amt_in_words += corrected_word
|
88 |
-
return corrected_amt_in_words
|
89 |
|
90 |
def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
91 |
macthing_amts = False
|
92 |
if len(legal_amount) == 0:
|
93 |
return macthing_amts
|
94 |
-
|
|
|
95 |
print("corrected_amt_in_words:",corrected_amt_in_words)
|
96 |
|
97 |
numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
|
@@ -102,13 +105,12 @@ def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
|
102 |
|
103 |
def check_if_cheque_is_stale(cheque_issue_date):
|
104 |
stale_check = False
|
105 |
-
current_date = datetime.now().strftime('%d/%m/%
|
106 |
-
current_date_ = datetime.strptime(current_date, "%d/%m/%
|
107 |
-
cheque_issue_date_ = datetime.strptime(cheque_issue_date, "%d/%m/%
|
108 |
relative_diff = relativedelta.relativedelta(current_date_, cheque_issue_date_)
|
109 |
months_difference = (relative_diff.years * 12) + relative_diff.months
|
110 |
print("months_difference:",months_difference)
|
111 |
if months_difference > 3:
|
112 |
stale_check = True
|
113 |
-
return stale_check
|
114 |
-
|
|
|
1 |
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
2 |
+
import pkg_resources
|
3 |
+
from symspellpy import SymSpell
|
4 |
from word2number import w2n
|
5 |
from dateutil import relativedelta
|
6 |
from datetime import datetime
|
7 |
from word2number import w2n
|
|
|
8 |
from PIL import Image
|
9 |
import torch
|
10 |
import re
|
11 |
|
12 |
+
CHEQUE_PARSER_MODEL = "shivi/donut-cheque-parser"
|
13 |
+
TASK_PROMPT = "<parse-cheque>"
|
14 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
15 |
|
16 |
def load_donut_model_and_processor():
|
|
|
22 |
def prepare_data_using_processor(donut_processor,image_path):
|
23 |
## Pass image through donut processor's feature extractor and retrieve image tensor
|
24 |
image = load_image(image_path)
|
|
|
25 |
pixel_values = donut_processor(image, return_tensors="pt").pixel_values
|
26 |
pixel_values = pixel_values.to(device)
|
27 |
|
|
|
70 |
|
71 |
payee_name = cheque_details_json['cheque_details'][2]['payee_name']
|
72 |
|
73 |
+
bank_name = cheque_details_json['cheque_details'][3]['bank_name']
|
74 |
+
cheque_date = cheque_details_json['cheque_details'][4]['cheque_date']
|
75 |
+
|
76 |
stale_cheque = check_if_cheque_is_stale(cheque_date)
|
77 |
|
78 |
+
return payee_name,amt_in_words,amt_in_figures,bank_name,cheque_date,macthing_amts,stale_cheque
|
79 |
+
|
80 |
+
# Lazily-built SymSpell instance reused by every spell_check() call so the
# frequency dictionaries are read from disk once instead of on each call.
_SYM_SPELL_INSTANCE = None


def _get_sym_spell():
    """Return the shared SymSpell instance, loading its dictionaries on first use."""
    global _SYM_SPELL_INSTANCE
    if _SYM_SPELL_INSTANCE is None:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # Both dictionary files are resolved inside the installed symspellpy package.
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
        # Store the instance only after both loads ran, so a failure part-way
        # through does not leave a half-initialised object cached.
        _SYM_SPELL_INSTANCE = sym_spell
    return _SYM_SPELL_INSTANCE


def spell_check(amt_in_words):
    """Spell-correct the amount-in-words text with SymSpell compound lookup.

    Args:
        amt_in_words: The text to correct (the caller in this file passes the
            cheque's legal amount, i.e. the amount written in words).

    Returns:
        The ``term`` of the first suggestion returned by
        ``SymSpell.lookup_compound`` with a maximum edit distance of 2.
    """
    suggestions = _get_sym_spell().lookup_compound(amt_in_words, max_edit_distance=2)
    return suggestions[0].term
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
93 |
macthing_amts = False
|
94 |
if len(legal_amount) == 0:
|
95 |
return macthing_amts
|
96 |
+
|
97 |
+
corrected_amt_in_words = spell_check(legal_amount)
|
98 |
print("corrected_amt_in_words:",corrected_amt_in_words)
|
99 |
|
100 |
numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
|
|
|
105 |
|
106 |
def check_if_cheque_is_stale(cheque_issue_date):
    """Report whether a cheque is more than three months old as of today.

    Args:
        cheque_issue_date: Issue date as a string in ``%d/%m/%y`` form
            (day/month/two-digit year).

    Returns:
        True when more than three calendar months separate the issue date
        from today's date, otherwise False. Dates in the future are never
        reported as stale.

    Raises:
        ValueError: If ``cheque_issue_date`` does not match ``%d/%m/%y``.
    """
    # Compare whole days only: truncate "now" to midnight directly rather than
    # round-tripping it through a two-digit-year string.
    current_date_ = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    cheque_issue_date_ = datetime.strptime(cheque_issue_date, "%d/%m/%y")
    relative_diff = relativedelta.relativedelta(current_date_, cheque_issue_date_)
    months_difference = (relative_diff.years * 12) + relative_diff.months
    print("months_difference:", months_difference)
    if months_difference > 3:
        return True
    # Exactly three full months plus at least one extra day is also past the
    # three-month window; the month count alone would miss it.
    return months_difference == 3 and relative_diff.days > 0
|
|