szczotar committed
Commit 1125d95 · 1 Parent(s): fe3447a
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,195 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "20356e27-98f6-4a19-b0ec-d1d2e92029f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n",
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "from handler import EndpointHandler\n",
+ " \n",
+ "# init handler\n",
+ "my_handler = EndpointHandler(path=\"Szczotar93/Layoutlm_Inkaso_2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "63a53fa9-c2ae-425c-9a8a-ec2415753630",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor, LayoutLMTokenizer, AutoModelForTokenClassification, AutoProcessor\n",
+ "\n",
+ "from PIL import Image, ImageDraw, ImageFont\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "import pytesseract\n",
+ "\n",
+ "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "684076e3-9ec2-4c99-af1e-6860a3e355e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from PIL import Image\n",
+ "filename = r\"C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\documentsImages\\test\\2.png\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "660f4096-9122-41fc-b38a-fd5299a16df5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "img = Image.open(filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "04ee0572-093c-4e30-872c-24216c807e4c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "096b6ffb-767e-45a2-bf4b-1f6d3f67f3a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'predictions': [[{'word': 'km 1792 /', 'label': 'doc id', 'score': '0.94'},\n",
+ " {'word': 'wezwanie do dokonywania potraceh ztur',\n",
+ " 'label': 'title',\n",
+ " 'score': '0.98'},\n",
+ " {'word': 'kredyt inkaso s. a', 'label': 'creditor name', 'score': '0.95'},\n",
+ " {'word': '02 - 672 warszawa domaniewska 39',\n",
+ " 'label': 'creditor address',\n",
+ " 'score': '0.97'},\n",
+ " {'word': '##ter mateusz garbula kanaria. -',\n",
+ " 'label': 'creditor proxy',\n",
+ " 'score': '0.92'}]]}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "my_handler({\"inputs\": img})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "52580570-51a6-4d73-aab0-ba3bf2af41f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "04c995ad-634b-4057-92ad-25d329371911",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "object of type 'PngImageFile' has no len()",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[1;32m----> 3\u001b[0m dataloader \u001b[38;5;241m=\u001b[39m \u001b[43mDataLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:350\u001b[0m, in \u001b[0;36mDataLoader.__init__\u001b[1;34m(self, dataset, batch_size, shuffle, sampler, batch_sampler, num_workers, collate_fn, pin_memory, drop_last, timeout, worker_init_fn, multiprocessing_context, generator, prefetch_factor, persistent_workers, pin_memory_device)\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# map-style\u001b[39;00m\n\u001b[0;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle:\n\u001b[1;32m--> 350\u001b[0m sampler \u001b[38;5;241m=\u001b[39m \u001b[43mRandomSampler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m 351\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 352\u001b[0m sampler \u001b[38;5;241m=\u001b[39m SequentialSampler(dataset) \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:142\u001b[0m, in \u001b[0;36mRandomSampler.__init__\u001b[1;34m(self, data_source, replacement, num_samples, generator)\u001b[0m\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement, \u001b[38;5;28mbool\u001b[39m):\n\u001b[0;32m 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreplacement should be a boolean value, but got replacement=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_samples\u001b[49m, \u001b[38;5;28mint\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_samples should be a positive integer value, but got num_samples=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:149\u001b[0m, in \u001b[0;36mRandomSampler.num_samples\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 145\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnum_samples\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mint\u001b[39m:\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# dataset size might change at runtime\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_source\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples\n",
+ "\u001b[1;31mTypeError\u001b[0m: object of type 'PngImageFile' has no len()"
+ ]
+ }
+ ],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "dataloader = DataLoader(img, batch_size=1, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e604a2bd-a068-46bb-82d8-4fba7fc6212b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "du_env",
+ "language": "python",
+ "name": "du_env"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/handler-checkpoint.py ADDED
@@ -0,0 +1,146 @@
+ from typing import Dict, List, Any
+ from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor
+ from PIL import Image
+ import torch
+ from subprocess import run
+
+ # install tesseract-ocr and pytesseract
+ # run("apt install -y tesseract-ocr", shell=True, check=True)
+ run("pip install pytesseract", shell=True, check=True)
+
+ # helper to unnormalize bboxes for drawing onto the image:
+ # LayoutLM works with boxes normalized to a 0-1000 grid, so scale them
+ # back to pixel coordinates
+ def unnormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+
+
+ def predict(image, processor, model):
+     """Process a document image and run LayoutLM token classification.
+
+     Args:
+         image (PIL.Image.Image): Document page to run inference on
+     Returns:
+         (List[List[Dict]], BatchEncoding): Extracted entities and the encoding
+     """
+     encoding = processor(
+         images=image,
+         return_tensors="pt",
+         padding="max_length",
+         truncation=True,
+     )
+     del encoding["image"]  # LayoutLM (v1) doesn't take the image tensor
+     outputs = model(**encoding)
+     results = process_outputs(
+         outputs, encoding=encoding,
+         images=image, model=model,
+         processor=processor,
+         threshold=0.75,
+     )
+     return results, encoding
+
+
+ def get_uniqueLabelList(labels):
+     # collapse B-/I- prefixed labels into their bare entity names
+     unique_labels = []
+     for label in labels[0]:
+         try:
+             label_short = label.split("-")[1]
+             if label_short not in unique_labels:
+                 unique_labels.append(label_short)
+         except IndexError:  # label without a B-/I- prefix, e.g. "O"
+             if label not in unique_labels:
+                 unique_labels.append(label)
+     return unique_labels
+
+
+ def process_outputs(outputs, encoding, images, model, processor, threshold):
+     scores, _ = torch.max(outputs.logits.softmax(dim=-1), dim=-1)
+     scores = scores.tolist()
+     predictions = outputs.logits.argmax(-1)
+     labels = [[model.config.id2label[pred.item()] for pred in prediction] for prediction in predictions]
+     results = _process_outputs(
+         encoding=encoding,
+         tokenizer=processor.tokenizer,
+         processor=processor,
+         labels=labels,
+         scores=scores,
+         images=images,
+         threshold=threshold,
+     )
+     return results
+
+
+ def _process_outputs(encoding, tokenizer, labels, scores, images, processor, threshold):
+     results = []
+     entities = []
+     unique_labels = get_uniqueLabelList(labels)
+     entity_word_idxs = []
+     for idx, label in enumerate(unique_labels):
+         score_sum = 0.0
+         if label != "O":
+             # collect every token predicted as this label with a confident score
+             for ix, pred in enumerate(labels[0]):
+                 if scores[0][ix] > threshold and label in pred:
+                     score_sum += scores[0][ix]
+                     entity_word_idxs.append(ix)
+             try:
+                 score_mean = f"{score_sum / len(entity_word_idxs):.2f}"
+             except ZeroDivisionError:  # no token passed the threshold
+                 score_mean = "0.00"
+             entities.append(
+                 {
+                     "word": processor.decode(encoding.input_ids[0][entity_word_idxs]),
+                     "label": unique_labels[idx],
+                     "score": score_mean,
+                 }
+             )
+             entity_word_idxs = []
+     results.append(entities)
+     return results
+
+
+ def get_image_from_url(f):
+     return Image.open(f).convert("RGB")  # LayoutLMv2Processor requires RGB format
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from path
+         self.model = LayoutLMForTokenClassification.from_pretrained(path).to(device)
+         self.processor = LayoutLMv2Processor.from_pretrained(path)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the deserialized image file as PIL.Image under "inputs"
+         """
+         # process input
+         image = data.pop("inputs", data)
+         print(image.filename)
+
+         result, encod = predict(image, self.processor, self.model)
+         return {"predictions": result}
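Note that unnormalize_box is defined in this file but never called. A minimal sketch of the drawing use its comment describes, assuming the model repo and test image from the notebooks in this commit; the encoding returned by predict still carries the boxes on LayoutLM's 0-1000 grid:

from PIL import Image, ImageDraw

from handler import EndpointHandler, predict, unnormalize_box

handler = EndpointHandler(path="Szczotar93/Layoutlm_Inkaso_2")
img = Image.open(r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png").convert("RGB")

# run inference; the encoding keeps the normalized "bbox" tensor
_, encoding = predict(img, handler.processor, handler.model)

draw = ImageDraw.Draw(img)
width, height = img.size
for box in encoding["bbox"][0].tolist():
    if box != [0, 0, 0, 0]:  # skip padding/special-token boxes
        draw.rectangle(unnormalize_box(box, width, height), outline="red")
img.save("2_with_boxes.png")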
Untitled.ipynb ADDED
@@ -0,0 +1,195 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "20356e27-98f6-4a19-b0ec-d1d2e92029f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n",
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "from handler import EndpointHandler\n",
+ " \n",
+ "# init handler\n",
+ "my_handler = EndpointHandler(path=\"Szczotar93/Layoutlm_Inkaso_2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "63a53fa9-c2ae-425c-9a8a-ec2415753630",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor, LayoutLMTokenizer, AutoModelForTokenClassification, AutoProcessor\n",
+ "\n",
+ "from PIL import Image, ImageDraw, ImageFont\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "import pytesseract\n",
+ "\n",
+ "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "684076e3-9ec2-4c99-af1e-6860a3e355e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from PIL import Image\n",
+ "filename = r\"C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\documentsImages\\test\\2.png\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "660f4096-9122-41fc-b38a-fd5299a16df5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "img = Image.open(filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "04ee0572-093c-4e30-872c-24216c807e4c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "096b6ffb-767e-45a2-bf4b-1f6d3f67f3a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'predictions': [[{'word': 'km 1792 /', 'label': 'doc id', 'score': '0.94'},\n",
+ " {'word': 'wezwanie do dokonywania potraceh ztur',\n",
+ " 'label': 'title',\n",
+ " 'score': '0.98'},\n",
+ " {'word': 'kredyt inkaso s. a', 'label': 'creditor name', 'score': '0.95'},\n",
+ " {'word': '02 - 672 warszawa domaniewska 39',\n",
+ " 'label': 'creditor address',\n",
+ " 'score': '0.97'},\n",
+ " {'word': '##ter mateusz garbula kanaria. -',\n",
+ " 'label': 'creditor proxy',\n",
+ " 'score': '0.92'}]]}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "my_handler({\"inputs\": img})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "52580570-51a6-4d73-aab0-ba3bf2af41f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "04c995ad-634b-4057-92ad-25d329371911",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "object of type 'PngImageFile' has no len()",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[1;32m----> 3\u001b[0m dataloader \u001b[38;5;241m=\u001b[39m \u001b[43mDataLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:350\u001b[0m, in \u001b[0;36mDataLoader.__init__\u001b[1;34m(self, dataset, batch_size, shuffle, sampler, batch_sampler, num_workers, collate_fn, pin_memory, drop_last, timeout, worker_init_fn, multiprocessing_context, generator, prefetch_factor, persistent_workers, pin_memory_device)\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# map-style\u001b[39;00m\n\u001b[0;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle:\n\u001b[1;32m--> 350\u001b[0m sampler \u001b[38;5;241m=\u001b[39m \u001b[43mRandomSampler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m 351\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 352\u001b[0m sampler \u001b[38;5;241m=\u001b[39m SequentialSampler(dataset) \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:142\u001b[0m, in \u001b[0;36mRandomSampler.__init__\u001b[1;34m(self, data_source, replacement, num_samples, generator)\u001b[0m\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement, \u001b[38;5;28mbool\u001b[39m):\n\u001b[0;32m 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreplacement should be a boolean value, but got replacement=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_samples\u001b[49m, \u001b[38;5;28mint\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_samples should be a positive integer value, but got num_samples=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:149\u001b[0m, in \u001b[0;36mRandomSampler.num_samples\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 145\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnum_samples\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mint\u001b[39m:\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# dataset size might change at runtime\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_source\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples\n",
+ "\u001b[1;31mTypeError\u001b[0m: object of type 'PngImageFile' has no len()"
+ ]
+ }
+ ],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "dataloader = DataLoader(img, batch_size=1, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e604a2bd-a068-46bb-82d8-4fba7fc6212b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "du_env",
+ "language": "python",
+ "name": "du_env"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
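The TypeError recorded in the notebook's last executed cell comes from passing a single PngImageFile to DataLoader, whose map-style path calls len() on the dataset. A minimal sketch of a fix, reusing the notebook's filename; the identity collate_fn is an assumption, needed because PyTorch's default collation cannot batch PIL images:

from PIL import Image
from torch.utils.data import DataLoader

filename = r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png"
images = [Image.open(filename)]  # a list supports len() and indexing

# identity collate keeps the PIL images as-is instead of tensorizing them
dataloader = DataLoader(images, batch_size=1, shuffle=True, collate_fn=lambda batch: batch)
for batch in dataloader:
    print(len(batch), batch[0].size)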
__pycache__/handler.cpython-311.pyc ADDED
Binary file (6.83 kB).
handler.py ADDED
@@ -0,0 +1,145 @@
+ from typing import Dict, List, Any
+ from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor
+ from PIL import Image
+ import torch
+ from subprocess import run
+
+ # install tesseract-ocr and pytesseract
+ run("apt install -y tesseract-ocr", shell=True, check=True)
+ run("pip install pytesseract", shell=True, check=True)
+
+ # helper to unnormalize bboxes for drawing onto the image:
+ # LayoutLM works with boxes normalized to a 0-1000 grid, so scale them
+ # back to pixel coordinates
+ def unnormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+
+
+ def predict(image, processor, model):
+     """Process a document image and run LayoutLM token classification.
+
+     Args:
+         image (PIL.Image.Image): Document page to run inference on
+     Returns:
+         (List[List[Dict]], BatchEncoding): Extracted entities and the encoding
+     """
+     encoding = processor(
+         images=image,
+         return_tensors="pt",
+         padding="max_length",
+         truncation=True,
+     )
+     del encoding["image"]  # LayoutLM (v1) doesn't take the image tensor
+     outputs = model(**encoding)
+     results = process_outputs(
+         outputs, encoding=encoding,
+         images=image, model=model,
+         processor=processor,
+         threshold=0.75,
+     )
+     return results, encoding
+
+
+ def get_uniqueLabelList(labels):
+     # collapse B-/I- prefixed labels into their bare entity names
+     unique_labels = []
+     for label in labels[0]:
+         try:
+             label_short = label.split("-")[1]
+             if label_short not in unique_labels:
+                 unique_labels.append(label_short)
+         except IndexError:  # label without a B-/I- prefix, e.g. "O"
+             if label not in unique_labels:
+                 unique_labels.append(label)
+     return unique_labels
+
+
+ def process_outputs(outputs, encoding, images, model, processor, threshold):
+     scores, _ = torch.max(outputs.logits.softmax(dim=-1), dim=-1)
+     scores = scores.tolist()
+     predictions = outputs.logits.argmax(-1)
+     labels = [[model.config.id2label[pred.item()] for pred in prediction] for prediction in predictions]
+     results = _process_outputs(
+         encoding=encoding,
+         tokenizer=processor.tokenizer,
+         processor=processor,
+         labels=labels,
+         scores=scores,
+         images=images,
+         threshold=threshold,
+     )
+     return results
+
+
+ def _process_outputs(encoding, tokenizer, labels, scores, images, processor, threshold):
+     results = []
+     entities = []
+     unique_labels = get_uniqueLabelList(labels)
+     entity_word_idxs = []
+     for idx, label in enumerate(unique_labels):
+         score_sum = 0.0
+         if label != "O":
+             # collect every token predicted as this label with a confident score
+             for ix, pred in enumerate(labels[0]):
+                 if scores[0][ix] > threshold and label in pred:
+                     score_sum += scores[0][ix]
+                     entity_word_idxs.append(ix)
+             try:
+                 score_mean = f"{score_sum / len(entity_word_idxs):.2f}"
+             except ZeroDivisionError:  # no token passed the threshold
+                 score_mean = "0.00"
+             entities.append(
+                 {
+                     "word": processor.decode(encoding.input_ids[0][entity_word_idxs]),
+                     "label": unique_labels[idx],
+                     "score": score_mean,
+                 }
+             )
+             entity_word_idxs = []
+     results.append(entities)
+     return results
+
+
+ def get_image_from_url(f):
+     return Image.open(f).convert("RGB")  # LayoutLMv2Processor requires RGB format
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from path
+         self.model = LayoutLMForTokenClassification.from_pretrained(path).to(device)
+         self.processor = LayoutLMv2Processor.from_pretrained(path, apply_ocr=True)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the deserialized image file as PIL.Image under "inputs"
+         """
+         # process input
+         image = data.pop("inputs", data)
+
+         result, encod = predict(image, self.processor, self.model)
+         return {"predictions": result}
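A hypothetical local smoke test mirroring the notebook above. Unlike the checkpoint copy, this file runs the apt call with check=True at import time, so on Windows the import would raise; the notebook instead points pytesseract at a local Tesseract install, as shown here:

from PIL import Image
import pytesseract

# Tesseract path used in the notebook; adjust for your machine
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

from handler import EndpointHandler

my_handler = EndpointHandler(path="Szczotar93/Layoutlm_Inkaso_2")
img = Image.open(r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png")

# expected shape: {"predictions": [[{"word": ..., "label": ..., "score": ...}, ...]]}
print(my_handler({"inputs": img}))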