szczotar committed
Commit 1125d95 · 1 Parent(s): fe3447a
.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,195 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "20356e27-98f6-4a19-b0ec-d1d2e92029f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n",
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "from handler import EndpointHandler\n",
+ " \n",
+ "# init handler\n",
+ "my_handler = EndpointHandler(path=\"Szczotar93/Layoutlm_Inkaso_2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "63a53fa9-c2ae-425c-9a8a-ec2415753630",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor, LayoutLMTokenizer, AutoModelForTokenClassification, AutoProcessor\n",
+ "\n",
+ "from PIL import Image, ImageDraw, ImageFont\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "import pytesseract\n",
+ "\n",
+ "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "684076e3-9ec2-4c99-af1e-6860a3e355e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from PIL import Image\n",
+ "filename = r\"C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\documentsImages\\test\\2.png\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "660f4096-9122-41fc-b38a-fd5299a16df5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "img = Image.open(filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "04ee0572-093c-4e30-872c-24216c807e4c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "096b6ffb-767e-45a2-bf4b-1f6d3f67f3a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'predictions': [[{'word': 'km 1792 /', 'label': 'doc id', 'score': '0.94'},\n",
+ " {'word': 'wezwanie do dokonywania potraceh ztur',\n",
+ " 'label': 'title',\n",
+ " 'score': '0.98'},\n",
+ " {'word': 'kredyt inkaso s. a', 'label': 'creditor name', 'score': '0.95'},\n",
+ " {'word': '02 - 672 warszawa domaniewska 39',\n",
+ " 'label': 'creditor address',\n",
+ " 'score': '0.97'},\n",
+ " {'word': '##ter mateusz garbula kanaria. -',\n",
+ " 'label': 'creditor proxy',\n",
+ " 'score': '0.92'}]]}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "my_handler({\"inputs\": img})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "52580570-51a6-4d73-aab0-ba3bf2af41f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "04c995ad-634b-4057-92ad-25d329371911",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "object of type 'PngImageFile' has no len()",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[1;32m----> 3\u001b[0m dataloader \u001b[38;5;241m=\u001b[39m \u001b[43mDataLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:350\u001b[0m, in \u001b[0;36mDataLoader.__init__\u001b[1;34m(self, dataset, batch_size, shuffle, sampler, batch_sampler, num_workers, collate_fn, pin_memory, drop_last, timeout, worker_init_fn, multiprocessing_context, generator, prefetch_factor, persistent_workers, pin_memory_device)\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# map-style\u001b[39;00m\n\u001b[0;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle:\n\u001b[1;32m--> 350\u001b[0m sampler \u001b[38;5;241m=\u001b[39m \u001b[43mRandomSampler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m 351\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 352\u001b[0m sampler \u001b[38;5;241m=\u001b[39m SequentialSampler(dataset) \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:142\u001b[0m, in \u001b[0;36mRandomSampler.__init__\u001b[1;34m(self, data_source, replacement, num_samples, generator)\u001b[0m\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement, \u001b[38;5;28mbool\u001b[39m):\n\u001b[0;32m 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreplacement should be a boolean value, but got replacement=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_samples\u001b[49m, \u001b[38;5;28mint\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_samples should be a positive integer value, but got num_samples=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:149\u001b[0m, in \u001b[0;36mRandomSampler.num_samples\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 145\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnum_samples\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mint\u001b[39m:\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# dataset size might change at runtime\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_source\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples\n",
+ "\u001b[1;31mTypeError\u001b[0m: object of type 'PngImageFile' has no len()"
+ ]
+ }
+ ],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "dataloader = DataLoader(img, batch_size=1, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e604a2bd-a068-46bb-82d8-4fba7fc6212b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "du_env",
+ "language": "python",
+ "name": "du_env"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/handler-checkpoint.py ADDED
@@ -0,0 +1,146 @@
+ from typing import Dict, List, Any
+ from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor
+ from PIL import Image
+ import torch
+ from subprocess import run
+
+ # install tesseract-ocr and pytesseract
+ # run("apt install -y tesseract-ocr", shell=True, check=True)
+ run("pip install pytesseract", shell=True, check=True)
+
+ # helper to unnormalize bboxes for drawing onto the image:
+ # LayoutLM works with boxes normalized to a 0-1000 grid, so scale them
+ # back to pixel coordinates
+ def unnormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+
+
+ def predict(image, processor, model):
+     """Process a document image and run LayoutLM token classification.
+
+     Args:
+         image (PIL.Image.Image): Document page to run inference on
+     Returns:
+         (List[List[Dict]], BatchEncoding): Extracted entities and the encoding
+     """
+     encoding = processor(
+         images=image,
+         return_tensors="pt",
+         padding="max_length",
+         truncation=True,
+     )
+     del encoding["image"]  # LayoutLM (v1) doesn't take the image tensor
+     outputs = model(**encoding)
+     results = process_outputs(
+         outputs, encoding=encoding,
+         images=image, model=model,
+         processor=processor,
+         threshold=0.75,
+     )
+     return results, encoding
+
+
+ def get_uniqueLabelList(labels):
+     # collapse B-/I- prefixed labels into their bare entity names
+     unique_labels = []
+     for label in labels[0]:
+         try:
+             label_short = label.split("-")[1]
+             if label_short not in unique_labels:
+                 unique_labels.append(label_short)
+         except IndexError:  # label without a B-/I- prefix, e.g. "O"
+             if label not in unique_labels:
+                 unique_labels.append(label)
+     return unique_labels
+
+
+ def process_outputs(outputs, encoding, images, model, processor, threshold):
+     scores, _ = torch.max(outputs.logits.softmax(dim=-1), dim=-1)
+     scores = scores.tolist()
+     predictions = outputs.logits.argmax(-1)
+     labels = [[model.config.id2label[pred.item()] for pred in prediction] for prediction in predictions]
+     results = _process_outputs(
+         encoding=encoding,
+         tokenizer=processor.tokenizer,
+         processor=processor,
+         labels=labels,
+         scores=scores,
+         images=images,
+         threshold=threshold,
+     )
+     return results
+
+
+ def _process_outputs(encoding, tokenizer, labels, scores, images, processor, threshold):
+     results = []
+     entities = []
+     unique_labels = get_uniqueLabelList(labels)
+     entity_word_idxs = []
+     for idx, label in enumerate(unique_labels):
+         score_sum = 0.0
+         if label != "O":
+             # collect every token predicted as this label with a confident score
+             for ix, pred in enumerate(labels[0]):
+                 if scores[0][ix] > threshold and label in pred:
+                     score_sum += scores[0][ix]
+                     entity_word_idxs.append(ix)
+             try:
+                 score_mean = f"{score_sum / len(entity_word_idxs):.2f}"
+             except ZeroDivisionError:  # no token passed the threshold
+                 score_mean = "0.00"
+             entities.append(
+                 {
+                     "word": processor.decode(encoding.input_ids[0][entity_word_idxs]),
+                     "label": unique_labels[idx],
+                     "score": score_mean,
+                 }
+             )
+             entity_word_idxs = []
+     results.append(entities)
+     return results
+
+
+ def get_image_from_url(f):
+     return Image.open(f).convert("RGB")  # LayoutLMv2Processor requires RGB format
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from path
+         self.model = LayoutLMForTokenClassification.from_pretrained(path).to(device)
+         self.processor = LayoutLMv2Processor.from_pretrained(path)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the deserialized image file as PIL.Image under "inputs"
+         """
+         # process input
+         image = data.pop("inputs", data)
+         print(image.filename)
+
+         result, encod = predict(image, self.processor, self.model)
+         return {"predictions": result}
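Note that unnormalize_box is defined in this file but never called. A minimal sketch of the drawing use its comment describes, assuming the model repo and test image from the notebooks in this commit; the encoding returned by predict still carries the boxes on LayoutLM's 0-1000 grid:

from PIL import Image, ImageDraw

from handler import EndpointHandler, predict, unnormalize_box

handler = EndpointHandler(path="Szczotar93/Layoutlm_Inkaso_2")
img = Image.open(r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png").convert("RGB")

# run inference; the encoding keeps the normalized "bbox" tensor
_, encoding = predict(img, handler.processor, handler.model)

draw = ImageDraw.Draw(img)
width, height = img.size
for box in encoding["bbox"][0].tolist():
    if box != [0, 0, 0, 0]:  # skip padding/special-token boxes
        draw.rectangle(unnormalize_box(box, width, height), outline="red")
img.save("2_with_boxes.png")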
Untitled.ipynb ADDED
@@ -0,0 +1,195 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "20356e27-98f6-4a19-b0ec-d1d2e92029f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n",
+ "C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "from handler import EndpointHandler\n",
+ " \n",
+ "# init handler\n",
+ "my_handler = EndpointHandler(path=\"Szczotar93/Layoutlm_Inkaso_2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "63a53fa9-c2ae-425c-9a8a-ec2415753630",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor, LayoutLMTokenizer, AutoModelForTokenClassification, AutoProcessor\n",
+ "\n",
+ "from PIL import Image, ImageDraw, ImageFont\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "import pytesseract\n",
+ "\n",
+ "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "684076e3-9ec2-4c99-af1e-6860a3e355e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from PIL import Image\n",
+ "filename = r\"C:\\Users\\ArturSzczotarski\\LLM env\\du_env\\documentsImages\\test\\2.png\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "660f4096-9122-41fc-b38a-fd5299a16df5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "img = Image.open(filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "04ee0572-093c-4e30-872c-24216c807e4c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "096b6ffb-767e-45a2-bf4b-1f6d3f67f3a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'predictions': [[{'word': 'km 1792 /', 'label': 'doc id', 'score': '0.94'},\n",
+ " {'word': 'wezwanie do dokonywania potraceh ztur',\n",
+ " 'label': 'title',\n",
+ " 'score': '0.98'},\n",
+ " {'word': 'kredyt inkaso s. a', 'label': 'creditor name', 'score': '0.95'},\n",
+ " {'word': '02 - 672 warszawa domaniewska 39',\n",
+ " 'label': 'creditor address',\n",
+ " 'score': '0.97'},\n",
+ " {'word': '##ter mateusz garbula kanaria. -',\n",
+ " 'label': 'creditor proxy',\n",
+ " 'score': '0.92'}]]}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "my_handler({\"inputs\": img})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "52580570-51a6-4d73-aab0-ba3bf2af41f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\ArturSzczotarski\\\\LLM env\\\\du_env\\\\documentsImages\\\\test\\\\2.png'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img.filename"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "04c995ad-634b-4057-92ad-25d329371911",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "object of type 'PngImageFile' has no len()",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[1;32m----> 3\u001b[0m dataloader \u001b[38;5;241m=\u001b[39m \u001b[43mDataLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:350\u001b[0m, in \u001b[0;36mDataLoader.__init__\u001b[1;34m(self, dataset, batch_size, shuffle, sampler, batch_sampler, num_workers, collate_fn, pin_memory, drop_last, timeout, worker_init_fn, multiprocessing_context, generator, prefetch_factor, persistent_workers, pin_memory_device)\u001b[0m\n\u001b[0;32m 348\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# map-style\u001b[39;00m\n\u001b[0;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle:\n\u001b[1;32m--> 350\u001b[0m sampler \u001b[38;5;241m=\u001b[39m \u001b[43mRandomSampler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[0;32m 351\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 352\u001b[0m sampler \u001b[38;5;241m=\u001b[39m SequentialSampler(dataset) \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:142\u001b[0m, in \u001b[0;36mRandomSampler.__init__\u001b[1;34m(self, data_source, replacement, num_samples, generator)\u001b[0m\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement, \u001b[38;5;28mbool\u001b[39m):\n\u001b[0;32m 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreplacement should be a boolean value, but got replacement=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplacement\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_samples\u001b[49m, \u001b[38;5;28mint\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_samples should be a positive integer value, but got num_samples=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_samples\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[1;32m~\\LLM env\\du_env\\Lib\\site-packages\\torch\\utils\\data\\sampler.py:149\u001b[0m, in \u001b[0;36mRandomSampler.num_samples\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 145\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnum_samples\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mint\u001b[39m:\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# dataset size might change at runtime\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_source\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_samples\n",
+ "\u001b[1;31mTypeError\u001b[0m: object of type 'PngImageFile' has no len()"
+ ]
+ }
+ ],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "dataloader = DataLoader(img, batch_size=1, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e604a2bd-a068-46bb-82d8-4fba7fc6212b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "du_env",
+ "language": "python",
+ "name": "du_env"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
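The TypeError recorded in the notebook's last executed cell comes from passing a single PngImageFile to DataLoader, whose map-style path calls len() on the dataset. A minimal sketch of a fix, reusing the notebook's filename; the identity collate_fn is an assumption, needed because PyTorch's default collation cannot batch PIL images:

from PIL import Image
from torch.utils.data import DataLoader

filename = r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png"
images = [Image.open(filename)]  # a list supports len() and indexing

# identity collate keeps the PIL images as-is instead of tensorizing them
dataloader = DataLoader(images, batch_size=1, shuffle=True, collate_fn=lambda batch: batch)
for batch in dataloader:
    print(len(batch), batch[0].size)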
__pycache__/handler.cpython-311.pyc ADDED
Binary file (6.83 kB).
handler.py ADDED
@@ -0,0 +1,145 @@
+ from typing import Dict, List, Any
+ from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor
+ from PIL import Image
+ import torch
+ from subprocess import run
+
+ # install tesseract-ocr and pytesseract
+ run("apt install -y tesseract-ocr", shell=True, check=True)
+ run("pip install pytesseract", shell=True, check=True)
+
+ # helper to unnormalize bboxes for drawing onto the image:
+ # LayoutLM works with boxes normalized to a 0-1000 grid, so scale them
+ # back to pixel coordinates
+ def unnormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+
+
+ def predict(image, processor, model):
+     """Process a document image and run LayoutLM token classification.
+
+     Args:
+         image (PIL.Image.Image): Document page to run inference on
+     Returns:
+         (List[List[Dict]], BatchEncoding): Extracted entities and the encoding
+     """
+     encoding = processor(
+         images=image,
+         return_tensors="pt",
+         padding="max_length",
+         truncation=True,
+     )
+     del encoding["image"]  # LayoutLM (v1) doesn't take the image tensor
+     outputs = model(**encoding)
+     results = process_outputs(
+         outputs, encoding=encoding,
+         images=image, model=model,
+         processor=processor,
+         threshold=0.75,
+     )
+     return results, encoding
+
+
+ def get_uniqueLabelList(labels):
+     # collapse B-/I- prefixed labels into their bare entity names
+     unique_labels = []
+     for label in labels[0]:
+         try:
+             label_short = label.split("-")[1]
+             if label_short not in unique_labels:
+                 unique_labels.append(label_short)
+         except IndexError:  # label without a B-/I- prefix, e.g. "O"
+             if label not in unique_labels:
+                 unique_labels.append(label)
+     return unique_labels
+
+
+ def process_outputs(outputs, encoding, images, model, processor, threshold):
+     scores, _ = torch.max(outputs.logits.softmax(dim=-1), dim=-1)
+     scores = scores.tolist()
+     predictions = outputs.logits.argmax(-1)
+     labels = [[model.config.id2label[pred.item()] for pred in prediction] for prediction in predictions]
+     results = _process_outputs(
+         encoding=encoding,
+         tokenizer=processor.tokenizer,
+         processor=processor,
+         labels=labels,
+         scores=scores,
+         images=images,
+         threshold=threshold,
+     )
+     return results
+
+
+ def _process_outputs(encoding, tokenizer, labels, scores, images, processor, threshold):
+     results = []
+     entities = []
+     unique_labels = get_uniqueLabelList(labels)
+     entity_word_idxs = []
+     for idx, label in enumerate(unique_labels):
+         score_sum = 0.0
+         if label != "O":
+             # collect every token predicted as this label with a confident score
+             for ix, pred in enumerate(labels[0]):
+                 if scores[0][ix] > threshold and label in pred:
+                     score_sum += scores[0][ix]
+                     entity_word_idxs.append(ix)
+             try:
+                 score_mean = f"{score_sum / len(entity_word_idxs):.2f}"
+             except ZeroDivisionError:  # no token passed the threshold
+                 score_mean = "0.00"
+             entities.append(
+                 {
+                     "word": processor.decode(encoding.input_ids[0][entity_word_idxs]),
+                     "label": unique_labels[idx],
+                     "score": score_mean,
+                 }
+             )
+             entity_word_idxs = []
+     results.append(entities)
+     return results
+
+
+ def get_image_from_url(f):
+     return Image.open(f).convert("RGB")  # LayoutLMv2Processor requires RGB format
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from path
+         self.model = LayoutLMForTokenClassification.from_pretrained(path).to(device)
+         self.processor = LayoutLMv2Processor.from_pretrained(path, apply_ocr=True)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the deserialized image file as PIL.Image under "inputs"
+         """
+         # process input
+         image = data.pop("inputs", data)
+
+         result, encod = predict(image, self.processor, self.model)
+         return {"predictions": result}
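A hypothetical local smoke test mirroring the notebook above. Unlike the checkpoint copy, this file runs the apt call with check=True at import time, so on Windows the import would raise; the notebook instead points pytesseract at a local Tesseract install, as shown here:

from PIL import Image
import pytesseract

# Tesseract path used in the notebook; adjust for your machine
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

from handler import EndpointHandler

my_handler = EndpointHandler(path="Szczotar93/Layoutlm_Inkaso_2")
img = Image.open(r"C:\Users\ArturSzczotarski\LLM env\du_env\documentsImages\test\2.png")

# expected shape: {"predictions": [[{"word": ..., "label": ..., "score": ...}, ...]]}
print(my_handler({"inputs": img}))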