Jasper Lu committed on
Commit
2cf97f7
1 Parent(s): c9c0a6d

Add initial

README.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ language:
+ - en
+ ---
+
+ # MarkupLM
+
+ **Multimodal (text + markup language) pre-training for [Document AI](https://www.microsoft.com/en-us/research/project/document-ai/)**
+
+ ## Introduction
+
+ MarkupLM is a simple but effective multimodal pre-training method of text and markup language for visually-rich document understanding and information extraction tasks, such as webpage QA and webpage information extraction. MarkupLM achieves state-of-the-art (SOTA) results on multiple datasets. For more details, please refer to our paper:
+
+ [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518), Junlong Li, Yiheng Xu, Lei Cui, Furu Wei
+
+ ## Usage
+
+ Please refer to the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/markuplm) and [demo notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MarkupLM).
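
As a quick start, a minimal usage sketch, assuming a transformers release that includes MarkupLM; the calls mirror the `handler.py` added in this commit:

```python
from transformers import AutoProcessor, MarkupLMModel

# Load the processor (HTML feature extractor + tokenizer) and the backbone model.
processor = AutoProcessor.from_pretrained("microsoft/markuplm-large")
model = MarkupLMModel.from_pretrained("microsoft/markuplm-large")

html_string = "<html> <head> <title>Page Title</title> </head> </html>"

# The processor parses the HTML, extracts text nodes plus their XPaths,
# and tokenizes everything into model-ready tensors.
encoding = processor(html_string, return_tensors="pt")
outputs = model(**encoding)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, 1024)
```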
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<end-of-node>": 50266,
+   "[empty-title]": 50265
+ }
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "architectures": [
+     "MarkupLMForPretraining"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_depth": 50,
+   "max_position_embeddings": 514,
+   "max_xpath_subs_unit_embeddings": 1024,
+   "max_xpath_tag_unit_embeddings": 256,
+   "model_type": "markuplm",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float16",
+   "transformers_version": "4.10.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50267,
+   "xpath_unit_hidden_size": 32
+ }
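
For reference, a small sketch of inspecting this configuration through transformers, assuming the checkpoint mirrors `microsoft/markuplm-large`:

```python
from transformers import MarkupLMConfig

config = MarkupLMConfig.from_pretrained("microsoft/markuplm-large")
print(config.hidden_size)             # 1024
print(config.num_hidden_layers)       # 24
print(config.max_depth)               # 50 XPath levels tracked per node
print(config.xpath_unit_hidden_size)  # 32
```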
handler.py ADDED
@@ -0,0 +1,32 @@
+ from typing import Any, Dict, List
+
+ from transformers import AutoProcessor, MarkupLMModel
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Processor and model are loaded from the Hub; `path` is kept for
+         # compatibility with the Inference Endpoints handler interface.
+         self.processor = AutoProcessor.from_pretrained("microsoft/markuplm-large")
+         self.model = MarkupLMModel.from_pretrained("microsoft/markuplm-large")
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 The request payload. The HTML string to encode is read from the
+                 "inputs" key; if the key is absent, the payload itself is used.
+         Return:
+             A :obj:`dict` with two keys:
+             - "last_hidden_state": final hidden states of the first example
+               (sequence_length x hidden_size) as a nested list of floats.
+             - "pooler_output": pooled representation of the first example
+               (hidden_size) as a list of floats.
+         """
+         inputs = data.pop("inputs", data)
+         encoding = self.processor(inputs, return_tensors="pt")
+         output = self.model(**encoding)
+         return {
+             "last_hidden_state": output.last_hidden_state[0].tolist(),
+             "pooler_output": output.pooler_output[0].tolist(),
+         }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "feature_extractor_type": "MarkupLMFeatureExtractor",
+   "processor_class": "MarkupLMProcessor"
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab0d2dd424d27dc7e038983444bf698958e8837ae60cce7aafa1c2f7f125a79a
+ size 750370881
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
test_handler.py ADDED
@@ -0,0 +1,8 @@
+ from handler import EndpointHandler
+
+ my_handler = EndpointHandler(path=".")
+
+ html_string = "<html> <head> <title>Page Title</title> </head> </html>"
+ payload = {"inputs": html_string}
+
+ print(my_handler(payload))
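
Once deployed as a Hugging Face Inference Endpoint, the same payload can be sent over HTTP; a hedged sketch with a placeholder URL and token (neither is defined in this repo):

```python
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder access token

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={"inputs": "<html> <head> <title>Page Title</title> </head> </html>"},
)
result = response.json()
print(len(result["pooler_output"]))  # hidden_size (1024 for markuplm-large)
```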
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,287 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "max_depth": 50,
+   "max_width": 1000,
+   "model_max_length": 512,
+   "name_or_path": "microsoft/markuplm-base",
+   "only_label_first_subword": true,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token_label": -100,
+   "pad_width": 1001,
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": null,
+   "tags_dict": {
+     "a": 0,
+     "abbr": 1,
+     "acronym": 2,
+     "address": 3,
+     "altGlyph": 4,
+     "altGlyphDef": 5,
+     "altGlyphItem": 6,
+     "animate": 7,
+     "animateColor": 8,
+     "animateMotion": 9,
+     "animateTransform": 10,
+     "applet": 11,
+     "area": 12,
+     "article": 13,
+     "aside": 14,
+     "audio": 15,
+     "b": 16,
+     "base": 17,
+     "basefont": 18,
+     "bdi": 19,
+     "bdo": 20,
+     "bgsound": 21,
+     "big": 22,
+     "blink": 23,
+     "blockquote": 24,
+     "body": 25,
+     "br": 26,
+     "button": 27,
+     "canvas": 28,
+     "caption": 29,
+     "center": 30,
+     "circle": 31,
+     "cite": 32,
+     "clipPath": 33,
+     "code": 34,
+     "col": 35,
+     "colgroup": 36,
+     "color-profile": 37,
+     "content": 38,
+     "cursor": 39,
+     "data": 40,
+     "datalist": 41,
+     "dd": 42,
+     "defs": 43,
+     "del": 44,
+     "desc": 45,
+     "details": 46,
+     "dfn": 47,
+     "dialog": 48,
+     "dir": 49,
+     "div": 50,
+     "dl": 51,
+     "dt": 52,
+     "ellipse": 53,
+     "em": 54,
+     "embed": 55,
+     "feBlend": 56,
+     "feColorMatrix": 57,
+     "feComponentTransfer": 58,
+     "feComposite": 59,
+     "feConvolveMatrix": 60,
+     "feDiffuseLighting": 61,
+     "feDisplacementMap": 62,
+     "feDistantLight": 63,
+     "feFlood": 64,
+     "feFuncA": 65,
+     "feFuncB": 66,
+     "feFuncG": 67,
+     "feFuncR": 68,
+     "feGaussianBlur": 69,
+     "feImage": 70,
+     "feMerge": 71,
+     "feMergeNode": 72,
+     "feMorphology": 73,
+     "feOffset": 74,
+     "fePointLight": 75,
+     "feSpecularLighting": 76,
+     "feSpotLight": 77,
+     "feTile": 78,
+     "feTurbulence": 79,
+     "fieldset": 80,
+     "figcaption": 81,
+     "figure": 82,
+     "filter": 83,
+     "font": 89,
+     "font-face": 88,
+     "font-face-format": 84,
+     "font-face-name": 85,
+     "font-face-src": 86,
+     "font-face-uri": 87,
+     "footer": 90,
+     "foreignObject": 91,
+     "form": 92,
+     "frame": 93,
+     "frameset": 94,
+     "g": 95,
+     "glyph": 96,
+     "glyphRef": 97,
+     "h1": 98,
+     "h2": 99,
+     "h3": 100,
+     "h4": 101,
+     "h5": 102,
+     "h6": 103,
+     "head": 104,
+     "header": 105,
+     "hgroup": 106,
+     "hkern": 107,
+     "hr": 108,
+     "html": 109,
+     "i": 110,
+     "iframe": 111,
+     "image": 112,
+     "img": 113,
+     "input": 114,
+     "ins": 115,
+     "kbd": 116,
+     "keygen": 117,
+     "label": 118,
+     "legend": 119,
+     "li": 120,
+     "line": 121,
+     "linearGradient": 122,
+     "link": 123,
+     "main": 124,
+     "map": 125,
+     "mark": 126,
+     "marker": 127,
+     "marquee": 128,
+     "mask": 129,
+     "math": 130,
+     "menu": 131,
+     "menuitem": 132,
+     "meta": 133,
+     "metadata": 134,
+     "meter": 135,
+     "missing-glyph": 136,
+     "mpath": 137,
+     "nav": 138,
+     "nobr": 139,
+     "noembed": 140,
+     "noframes": 141,
+     "noscript": 142,
+     "object": 143,
+     "ol": 144,
+     "optgroup": 145,
+     "option": 146,
+     "output": 147,
+     "p": 148,
+     "param": 149,
+     "path": 150,
+     "pattern": 151,
+     "picture": 152,
+     "plaintext": 153,
+     "polygon": 154,
+     "polyline": 155,
+     "portal": 156,
+     "pre": 157,
+     "progress": 158,
+     "q": 159,
+     "radialGradient": 160,
+     "rb": 161,
+     "rect": 162,
+     "rp": 163,
+     "rt": 164,
+     "rtc": 165,
+     "ruby": 166,
+     "s": 167,
+     "samp": 168,
+     "script": 169,
+     "section": 170,
+     "select": 171,
+     "set": 172,
+     "shadow": 173,
+     "slot": 174,
+     "small": 175,
+     "source": 176,
+     "spacer": 177,
+     "span": 178,
+     "stop": 179,
+     "strike": 180,
+     "strong": 181,
+     "style": 182,
+     "sub": 183,
+     "summary": 184,
+     "sup": 185,
+     "svg": 186,
+     "switch": 187,
+     "symbol": 188,
+     "table": 189,
+     "tbody": 190,
+     "td": 191,
+     "template": 192,
+     "text": 193,
+     "textPath": 194,
+     "textarea": 195,
+     "tfoot": 196,
+     "th": 197,
+     "thead": 198,
+     "time": 199,
+     "title": 200,
+     "tr": 201,
+     "track": 202,
+     "tref": 203,
+     "tspan": 204,
+     "tt": 205,
+     "u": 206,
+     "ul": 207,
+     "use": 208,
+     "var": 209,
+     "video": 210,
+     "view": 211,
+     "vkern": 212,
+     "wbr": 213,
+     "xmp": 214
+   },
+   "tokenizer_class": "MarkupLMTokenizer",
+   "trim_offsets": false,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
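
The `tags_dict` and `max_depth` above feed MarkupLM's XPath embeddings: each text node's XPath is split into (tag, subscript) units, tags are mapped through `tags_dict`, and the sequence is padded or truncated to `max_depth` levels. A minimal sketch of the upstream extraction step, assuming `beautifulsoup4` is installed (required by `MarkupLMFeatureExtractor`):

```python
from transformers import MarkupLMFeatureExtractor

feature_extractor = MarkupLMFeatureExtractor()
html_string = "<html> <head> <title>Page Title</title> </head> </html>"

# Extract the visible text nodes and the XPath of each node.
encoding = feature_extractor(html_string)
print(encoding["nodes"])   # expected: [['Page Title']]
print(encoding["xpaths"])  # expected: [['/html/head/title']]
```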
vocab.json ADDED
The diff for this file is too large to render. See raw diff