Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,12 +7,12 @@ import torch
|
|
7 |
# Wczytanie modelu LayoutLMv3
|
8 |
model_name = "kryman27/layoutlmv3-finetuned"
|
9 |
model = LayoutLMForTokenClassification.from_pretrained(model_name)
|
10 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
11 |
|
12 |
# Reguły do wykrywania NIP, kwot, dat
|
13 |
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
|
14 |
-
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\s?(PLN|zł|EUR|USD)?\b') #
|
15 |
-
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
16 |
payment_keywords = ["data płatności", "termin płatności", "zapłata", "zapłacono", "płatność"]
|
17 |
seller_keywords = ["sprzedawca", "faktura wystawiona przez", "wystawca", "nazwa firmy"]
|
18 |
|
@@ -23,17 +23,17 @@ def extract_invoice_data(pdf_file):
|
|
23 |
for page in pdf.pages:
|
24 |
extracted_words = page.extract_words()
|
25 |
for word in extracted_words:
|
26 |
-
words.append(word['text'])
|
27 |
-
bbox = [int(word['x0']), int(word['top']), int(word['x1']), int(word['bottom'])]
|
28 |
-
boxes.append(bbox)
|
29 |
|
30 |
page_text = page.extract_text()
|
31 |
if page_text:
|
32 |
full_text.append(page_text.lower())
|
33 |
|
34 |
-
full_text = "\n".join(full_text)
|
35 |
|
36 |
-
# Tokenizacja
|
37 |
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
38 |
|
39 |
# Predykcja modelu
|
@@ -44,13 +44,12 @@ def extract_invoice_data(pdf_file):
|
|
44 |
# Przetwarzanie wyników
|
45 |
entities = []
|
46 |
for token, pred in zip(words, predictions):
|
47 |
-
if pred > 0:
|
48 |
entities.append((token, model.config.id2label[pred]))
|
49 |
|
50 |
# 🏢 Wyszukiwanie nazwy sprzedawcy
|
51 |
seller_name = [token for token, label in entities if "ORG" in label]
|
52 |
|
53 |
-
# Jeśli model nie znalazł, szukamy w tekście
|
54 |
if not seller_name:
|
55 |
for line in full_text.split("\n"):
|
56 |
if any(keyword in line for keyword in seller_keywords):
|
@@ -62,7 +61,7 @@ def extract_invoice_data(pdf_file):
|
|
62 |
|
63 |
# 💰 Wyszukiwanie kwoty całkowitej (największa kwota z walutą)
|
64 |
kwoty = kwota_pattern.findall(full_text)
|
65 |
-
kwoty = [k
|
66 |
total_amount = max(map(float, kwoty), default=None) if kwoty else None
|
67 |
|
68 |
# 📆 Wyszukiwanie daty płatności
|
|
|
7 |
# Wczytanie modelu LayoutLMv3
|
8 |
model_name = "kryman27/layoutlmv3-finetuned"
|
9 |
model = LayoutLMForTokenClassification.from_pretrained(model_name)
|
10 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
11 |
|
12 |
# Reguły do wykrywania NIP, kwot, dat
|
13 |
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
|
14 |
+
# Amounts with an optional currency suffix.
# Only the numeric part is captured: re.findall() returns the contents of the
# capturing group when a pattern has exactly one, so the currency alternation
# must be non-capturing or findall() yields currency strings (or '') instead
# of the amounts. IGNORECASE is needed because the text searched with this
# pattern is lower-cased before matching, so "PLN"/"EUR"/"USD" would otherwise
# never match.
kwota_pattern = re.compile(
    r'\b(\d+(?:[.,]\d+)?)\s?(?:PLN|zł|EUR|USD)?\b',
    re.IGNORECASE,
)
|
15 |
+
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
16 |
payment_keywords = ["data płatności", "termin płatności", "zapłata", "zapłacono", "płatność"]
|
17 |
seller_keywords = ["sprzedawca", "faktura wystawiona przez", "wystawca", "nazwa firmy"]
|
18 |
|
|
|
23 |
for page in pdf.pages:
|
24 |
extracted_words = page.extract_words()
|
25 |
for word in extracted_words:
|
26 |
+
words.append(word['text'])
|
27 |
+
bbox = [int(word['x0']), int(word['top']), int(word['x1']), int(word['bottom'])]
|
28 |
+
boxes.append(bbox)
|
29 |
|
30 |
page_text = page.extract_text()
|
31 |
if page_text:
|
32 |
full_text.append(page_text.lower())
|
33 |
|
34 |
+
full_text = "\n".join(full_text)
|
35 |
|
36 |
+
# Tokenizacja + bounding boxes
|
37 |
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
38 |
|
39 |
# Predykcja modelu
|
|
|
44 |
# Przetwarzanie wyników
|
45 |
entities = []
|
46 |
for token, pred in zip(words, predictions):
|
47 |
+
if pred > 0:
|
48 |
entities.append((token, model.config.id2label[pred]))
|
49 |
|
50 |
# 🏢 Wyszukiwanie nazwy sprzedawcy
|
51 |
seller_name = [token for token, label in entities if "ORG" in label]
|
52 |
|
|
|
53 |
if not seller_name:
|
54 |
for line in full_text.split("\n"):
|
55 |
if any(keyword in line for keyword in seller_keywords):
|
|
|
61 |
|
62 |
# 💰 Wyszukiwanie kwoty całkowitej (największa kwota z walutą)
|
63 |
kwoty = kwota_pattern.findall(full_text)
|
64 |
+
kwoty = [k.replace(",", ".") for k in kwoty if isinstance(k, str) and k.replace(",", ".").replace(".", "").isdigit()] # ✅ Poprawiona linia
|
65 |
total_amount = max(map(float, kwoty), default=None) if kwoty else None
|
66 |
|
67 |
# 📆 Wyszukiwanie daty płatności
|