Jasper Lu committed on
Commit
2cf97f7
1 Parent(s): c9c0a6d

Add initial

README.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ language:
+ - en
+ ---
+
+ # MarkupLM
+
+ **Multimodal (text + markup language) pre-training for [Document AI](https://www.microsoft.com/en-us/research/project/document-ai/)**
+
+ ## Introduction
+
+ MarkupLM is a simple but effective multimodal pre-training method of text and markup language for visually-rich document understanding and information extraction tasks, such as webpage QA and webpage information extraction. MarkupLM achieves state-of-the-art (SOTA) results on multiple datasets. For more details, please refer to our paper:
+
+ [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518), Junlong Li, Yiheng Xu, Lei Cui, Furu Wei
+
+ ## Usage
+
+ Please refer to the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/markuplm) and [demo notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MarkupLM).
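
As a quick start, a minimal usage sketch, assuming a transformers release that includes MarkupLM; the calls mirror the `handler.py` added in this commit:

```python
from transformers import AutoProcessor, MarkupLMModel

# Load the processor (HTML feature extractor + tokenizer) and the backbone model.
processor = AutoProcessor.from_pretrained("microsoft/markuplm-large")
model = MarkupLMModel.from_pretrained("microsoft/markuplm-large")

html_string = "<html> <head> <title>Page Title</title> </head> </html>"

# The processor parses the HTML, extracts text nodes plus their XPaths,
# and tokenizes everything into model-ready tensors.
encoding = processor(html_string, return_tensors="pt")
outputs = model(**encoding)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, 1024)
```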
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<end-of-node>": 50266,
+   "[empty-title]": 50265
+ }
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "architectures": [
+     "MarkupLMForPretraining"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_depth": 50,
+   "max_position_embeddings": 514,
+   "max_xpath_subs_unit_embeddings": 1024,
+   "max_xpath_tag_unit_embeddings": 256,
+   "model_type": "markuplm",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float16",
+   "transformers_version": "4.10.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50267,
+   "xpath_unit_hidden_size": 32
+ }
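
For reference, a small sketch of inspecting this configuration through transformers, assuming the checkpoint mirrors `microsoft/markuplm-large`:

```python
from transformers import MarkupLMConfig

config = MarkupLMConfig.from_pretrained("microsoft/markuplm-large")
print(config.hidden_size)             # 1024
print(config.num_hidden_layers)       # 24
print(config.max_depth)               # 50 XPath levels tracked per node
print(config.xpath_unit_hidden_size)  # 32
```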
handler.py ADDED
@@ -0,0 +1,32 @@
+ from typing import Any, Dict, List
+
+ from transformers import AutoProcessor, MarkupLMModel
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Processor and model are loaded from the Hub; `path` is kept for
+         # compatibility with the Inference Endpoints handler interface.
+         self.processor = AutoProcessor.from_pretrained("microsoft/markuplm-large")
+         self.model = MarkupLMModel.from_pretrained("microsoft/markuplm-large")
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 The request payload. The HTML string to encode is read from the
+                 "inputs" key; if the key is absent, the payload itself is used.
+         Return:
+             A :obj:`dict` with two keys:
+             - "last_hidden_state": final hidden states of the first example
+               (sequence_length x hidden_size) as a nested list of floats.
+             - "pooler_output": pooled representation of the first example
+               (hidden_size) as a list of floats.
+         """
+         inputs = data.pop("inputs", data)
+         encoding = self.processor(inputs, return_tensors="pt")
+         output = self.model(**encoding)
+         return {
+             "last_hidden_state": output.last_hidden_state[0].tolist(),
+             "pooler_output": output.pooler_output[0].tolist(),
+         }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "feature_extractor_type": "MarkupLMFeatureExtractor",
+   "processor_class": "MarkupLMProcessor"
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab0d2dd424d27dc7e038983444bf698958e8837ae60cce7aafa1c2f7f125a79a
+ size 750370881
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
test_handler.py ADDED
@@ -0,0 +1,8 @@
+ from handler import EndpointHandler
+
+ my_handler = EndpointHandler(path=".")
+
+ html_string = "<html> <head> <title>Page Title</title> </head> </html>"
+ payload = {"inputs": html_string}
+
+ print(my_handler(payload))
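
Once deployed as a Hugging Face Inference Endpoint, the same payload can be sent over HTTP; a hedged sketch with a placeholder URL and token (neither is defined in this repo):

```python
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder access token

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={"inputs": "<html> <head> <title>Page Title</title> </head> </html>"},
)
result = response.json()
print(len(result["pooler_output"]))  # hidden_size (1024 for markuplm-large)
```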
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,287 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "max_depth": 50,
+   "max_width": 1000,
+   "model_max_length": 512,
+   "name_or_path": "microsoft/markuplm-base",
+   "only_label_first_subword": true,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token_label": -100,
+   "pad_width": 1001,
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": null,
+   "tags_dict": {
+     "a": 0,
+     "abbr": 1,
+     "acronym": 2,
+     "address": 3,
+     "altGlyph": 4,
+     "altGlyphDef": 5,
+     "altGlyphItem": 6,
+     "animate": 7,
+     "animateColor": 8,
+     "animateMotion": 9,
+     "animateTransform": 10,
+     "applet": 11,
+     "area": 12,
+     "article": 13,
+     "aside": 14,
+     "audio": 15,
+     "b": 16,
+     "base": 17,
+     "basefont": 18,
+     "bdi": 19,
+     "bdo": 20,
+     "bgsound": 21,
+     "big": 22,
+     "blink": 23,
+     "blockquote": 24,
+     "body": 25,
+     "br": 26,
+     "button": 27,
+     "canvas": 28,
+     "caption": 29,
+     "center": 30,
+     "circle": 31,
+     "cite": 32,
+     "clipPath": 33,
+     "code": 34,
+     "col": 35,
+     "colgroup": 36,
+     "color-profile": 37,
+     "content": 38,
+     "cursor": 39,
+     "data": 40,
+     "datalist": 41,
+     "dd": 42,
+     "defs": 43,
+     "del": 44,
+     "desc": 45,
+     "details": 46,
+     "dfn": 47,
+     "dialog": 48,
+     "dir": 49,
+     "div": 50,
+     "dl": 51,
+     "dt": 52,
+     "ellipse": 53,
+     "em": 54,
+     "embed": 55,
+     "feBlend": 56,
+     "feColorMatrix": 57,
+     "feComponentTransfer": 58,
+     "feComposite": 59,
+     "feConvolveMatrix": 60,
+     "feDiffuseLighting": 61,
+     "feDisplacementMap": 62,
+     "feDistantLight": 63,
+     "feFlood": 64,
+     "feFuncA": 65,
+     "feFuncB": 66,
+     "feFuncG": 67,
+     "feFuncR": 68,
+     "feGaussianBlur": 69,
+     "feImage": 70,
+     "feMerge": 71,
+     "feMergeNode": 72,
+     "feMorphology": 73,
+     "feOffset": 74,
+     "fePointLight": 75,
+     "feSpecularLighting": 76,
+     "feSpotLight": 77,
+     "feTile": 78,
+     "feTurbulence": 79,
+     "fieldset": 80,
+     "figcaption": 81,
+     "figure": 82,
+     "filter": 83,
+     "font": 89,
+     "font-face": 88,
+     "font-face-format": 84,
+     "font-face-name": 85,
+     "font-face-src": 86,
+     "font-face-uri": 87,
+     "footer": 90,
+     "foreignObject": 91,
+     "form": 92,
+     "frame": 93,
+     "frameset": 94,
+     "g": 95,
+     "glyph": 96,
+     "glyphRef": 97,
+     "h1": 98,
+     "h2": 99,
+     "h3": 100,
+     "h4": 101,
+     "h5": 102,
+     "h6": 103,
+     "head": 104,
+     "header": 105,
+     "hgroup": 106,
+     "hkern": 107,
+     "hr": 108,
+     "html": 109,
+     "i": 110,
+     "iframe": 111,
+     "image": 112,
+     "img": 113,
+     "input": 114,
+     "ins": 115,
+     "kbd": 116,
+     "keygen": 117,
+     "label": 118,
+     "legend": 119,
+     "li": 120,
+     "line": 121,
+     "linearGradient": 122,
+     "link": 123,
+     "main": 124,
+     "map": 125,
+     "mark": 126,
+     "marker": 127,
+     "marquee": 128,
+     "mask": 129,
+     "math": 130,
+     "menu": 131,
+     "menuitem": 132,
+     "meta": 133,
+     "metadata": 134,
+     "meter": 135,
+     "missing-glyph": 136,
+     "mpath": 137,
+     "nav": 138,
+     "nobr": 139,
+     "noembed": 140,
+     "noframes": 141,
+     "noscript": 142,
+     "object": 143,
+     "ol": 144,
+     "optgroup": 145,
+     "option": 146,
+     "output": 147,
+     "p": 148,
+     "param": 149,
+     "path": 150,
+     "pattern": 151,
+     "picture": 152,
+     "plaintext": 153,
+     "polygon": 154,
+     "polyline": 155,
+     "portal": 156,
+     "pre": 157,
+     "progress": 158,
+     "q": 159,
+     "radialGradient": 160,
+     "rb": 161,
+     "rect": 162,
+     "rp": 163,
+     "rt": 164,
+     "rtc": 165,
+     "ruby": 166,
+     "s": 167,
+     "samp": 168,
+     "script": 169,
+     "section": 170,
+     "select": 171,
+     "set": 172,
+     "shadow": 173,
+     "slot": 174,
+     "small": 175,
+     "source": 176,
+     "spacer": 177,
+     "span": 178,
+     "stop": 179,
+     "strike": 180,
+     "strong": 181,
+     "style": 182,
+     "sub": 183,
+     "summary": 184,
+     "sup": 185,
+     "svg": 186,
+     "switch": 187,
+     "symbol": 188,
+     "table": 189,
+     "tbody": 190,
+     "td": 191,
+     "template": 192,
+     "text": 193,
+     "textPath": 194,
+     "textarea": 195,
+     "tfoot": 196,
+     "th": 197,
+     "thead": 198,
+     "time": 199,
+     "title": 200,
+     "tr": 201,
+     "track": 202,
+     "tref": 203,
+     "tspan": 204,
+     "tt": 205,
+     "u": 206,
+     "ul": 207,
+     "use": 208,
+     "var": 209,
+     "video": 210,
+     "view": 211,
+     "vkern": 212,
+     "wbr": 213,
+     "xmp": 214
+   },
+   "tokenizer_class": "MarkupLMTokenizer",
+   "trim_offsets": false,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
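
The `tags_dict` and `max_depth` above feed MarkupLM's XPath embeddings: each text node's XPath is split into (tag, subscript) units, tags are mapped through `tags_dict`, and the sequence is padded or truncated to `max_depth` levels. A minimal sketch of the upstream extraction step, assuming `beautifulsoup4` is installed (required by `MarkupLMFeatureExtractor`):

```python
from transformers import MarkupLMFeatureExtractor

feature_extractor = MarkupLMFeatureExtractor()
html_string = "<html> <head> <title>Page Title</title> </head> </html>"

# Extract the visible text nodes and the XPath of each node.
encoding = feature_extractor(html_string)
print(encoding["nodes"])   # expected: [['Page Title']]
print(encoding["xpaths"])  # expected: [['/html/head/title']]
```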
vocab.json ADDED
The diff for this file is too large to render. See raw diff