KoichiYasuoka committed
Commit 9a393dc
1 Parent(s): 0709563

initial release

README.md CHANGED
@@ -1,3 +1,27 @@
- ---
- license: cc-by-sa-4.0
- ---
+ ---
+ language:
+ - "en"
+ tags:
+ - "english"
+ - "token-classification"
+ - "pos"
+ datasets:
+ - "universal_dependencies"
+ license: "cc-by-sa-4.0"
+ pipeline_tag: "token-classification"
+ ---
+
+ # xlm-roberta-base-english-upos
+
+ ## Model Description
+
+ This is an XLM-RoBERTa model pre-trained on an English corpus for POS-tagging, derived from [xlm-roberta-base](https://huggingface.co/xlm-roberta-base). Every word is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) tag.
+
+ ## How to Use
+
+ ```py
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/xlm-roberta-base-english-upos")
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/xlm-roberta-base-english-upos")
+ ```
+
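The committed README stops after loading the tokenizer and model. The sketch below is editor-added, not part of the commit: it assumes `torch` is installed and uses an arbitrary example sentence, and shows one way to run the model and map predicted label ids back to the UPOS tags defined in the `config.json` added further down. Labels prefixed with `B-`/`I-` appear to mark the first and following subwords of a longer word, while tags joined with `+` (e.g. `AUX+PART`) correspond to the multiword splits listed under `task_specific_params` in `config.json`.

```py
# Minimal inference sketch (editor-added); example sentence chosen for illustration.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("KoichiYasuoka/xlm-roberta-base-english-upos")
model = AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/xlm-roberta-base-english-upos")

text = "I don't know."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits              # shape: (1, sequence_length, num_labels)
label_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
# <s> and </s> are special tokens; their predicted labels can be ignored.
for token, label_id in zip(tokens, label_ids):
    print(token, model.config.id2label[label_id])
```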
config.json ADDED
@@ -0,0 +1,559 @@
+ {
+ "architectures": [
+ "XLMRobertaForTokenClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "classifier_dropout": null,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "ADJ",
+ "1": "ADJ+PART",
+ "2": "ADP",
+ "3": "ADV",
+ "4": "ADV+PART",
+ "5": "AUX",
+ "6": "AUX+PART",
+ "7": "B-ADJ",
+ "8": "B-ADJ+PART",
+ "9": "B-ADP",
+ "10": "B-ADP+ADP",
+ "11": "B-ADV",
+ "12": "B-AUX",
+ "13": "B-AUX+PART",
+ "14": "B-AUX+PART+VERB",
+ "15": "B-CCONJ",
+ "16": "B-DET",
+ "17": "B-INTJ",
+ "18": "B-NOUN",
+ "19": "B-NOUN+PART",
+ "20": "B-NUM",
+ "21": "B-PART",
+ "22": "B-PRON",
+ "23": "B-PRON+AUX",
+ "24": "B-PROPN",
+ "25": "B-PROPN+PART",
+ "26": "B-PUNCT",
+ "27": "B-SCONJ",
+ "28": "B-SYM",
+ "29": "B-VERB",
+ "30": "B-VERB+ADV",
+ "31": "B-VERB+PART",
+ "32": "B-X",
+ "33": "CCONJ",
+ "34": "DET",
+ "35": "I-ADJ",
+ "36": "I-ADJ+PART",
+ "37": "I-ADP",
+ "38": "I-ADP+ADP",
+ "39": "I-ADV",
+ "40": "I-AUX",
+ "41": "I-AUX+PART",
+ "42": "I-AUX+PART+VERB",
+ "43": "I-CCONJ",
+ "44": "I-DET",
+ "45": "I-INTJ",
+ "46": "I-NOUN",
+ "47": "I-NOUN+PART",
+ "48": "I-NUM",
+ "49": "I-PART",
+ "50": "I-PRON",
+ "51": "I-PRON+AUX",
+ "52": "I-PROPN",
+ "53": "I-PROPN+PART",
+ "54": "I-PUNCT",
+ "55": "I-SCONJ",
+ "56": "I-SYM",
+ "57": "I-VERB",
+ "58": "I-VERB+ADV",
+ "59": "I-VERB+PART",
+ "60": "I-X",
+ "61": "INTJ",
+ "62": "NOUN",
+ "63": "NOUN+ADP",
+ "64": "NOUN+PART",
+ "65": "NUM",
+ "66": "PART",
+ "67": "PRON",
+ "68": "PRON+AUX",
+ "69": "PRON+VERB",
+ "70": "PROPN",
+ "71": "PUNCT",
+ "72": "SCONJ",
+ "73": "SYM",
+ "74": "VERB",
+ "75": "VERB+PART",
+ "76": "X"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "label2id": {
+ "ADJ": 0,
+ "ADJ+PART": 1,
+ "ADP": 2,
+ "ADV": 3,
+ "ADV+PART": 4,
+ "AUX": 5,
+ "AUX+PART": 6,
+ "B-ADJ": 7,
+ "B-ADJ+PART": 8,
+ "B-ADP": 9,
+ "B-ADP+ADP": 10,
+ "B-ADV": 11,
+ "B-AUX": 12,
+ "B-AUX+PART": 13,
+ "B-AUX+PART+VERB": 14,
+ "B-CCONJ": 15,
+ "B-DET": 16,
+ "B-INTJ": 17,
+ "B-NOUN": 18,
+ "B-NOUN+PART": 19,
+ "B-NUM": 20,
+ "B-PART": 21,
+ "B-PRON": 22,
+ "B-PRON+AUX": 23,
+ "B-PROPN": 24,
+ "B-PROPN+PART": 25,
+ "B-PUNCT": 26,
+ "B-SCONJ": 27,
+ "B-SYM": 28,
+ "B-VERB": 29,
+ "B-VERB+ADV": 30,
+ "B-VERB+PART": 31,
+ "B-X": 32,
+ "CCONJ": 33,
+ "DET": 34,
+ "I-ADJ": 35,
+ "I-ADJ+PART": 36,
+ "I-ADP": 37,
+ "I-ADP+ADP": 38,
+ "I-ADV": 39,
+ "I-AUX": 40,
+ "I-AUX+PART": 41,
+ "I-AUX+PART+VERB": 42,
+ "I-CCONJ": 43,
+ "I-DET": 44,
+ "I-INTJ": 45,
+ "I-NOUN": 46,
+ "I-NOUN+PART": 47,
+ "I-NUM": 48,
+ "I-PART": 49,
+ "I-PRON": 50,
+ "I-PRON+AUX": 51,
+ "I-PROPN": 52,
+ "I-PROPN+PART": 53,
+ "I-PUNCT": 54,
+ "I-SCONJ": 55,
+ "I-SYM": 56,
+ "I-VERB": 57,
+ "I-VERB+ADV": 58,
+ "I-VERB+PART": 59,
+ "I-X": 60,
+ "INTJ": 61,
+ "NOUN": 62,
+ "NOUN+ADP": 63,
+ "NOUN+PART": 64,
+ "NUM": 65,
+ "PART": 66,
+ "PRON": 67,
+ "PRON+AUX": 68,
+ "PRON+VERB": 69,
+ "PROPN": 70,
+ "PUNCT": 71,
+ "SCONJ": 72,
+ "SYM": 73,
+ "VERB": 74,
+ "VERB+PART": 75,
+ "X": 76
+ },
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 514,
+ "model_type": "xlm-roberta",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "output_past": true,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "task_specific_params": {
+ "upos_multiword": {
+ "ADJ+PART": {
+ "elses": [
+ "else",
+ "s"
+ ],
+ "others": [
+ "other",
+ "s"
+ ]
+ },
+ "ADP+ADP": {
+ "OUTTA": [
+ "OUT",
+ "TA"
+ ]
+ },
+ "ADV+PART": {
+ "into": [
+ "in",
+ "to"
+ ]
+ },
+ "AUX+PART": {
+ "Aren't": [
+ "Are",
+ "n't"
+ ],
+ "CANNOT": [
+ "CAN",
+ "NOT"
+ ],
+ "CANT": [
+ "CA",
+ "NT"
+ ],
+ "Can't": [
+ "Ca",
+ "n't"
+ ],
+ "DON'T": [
+ "DO",
+ "N'T"
+ ],
+ "Don't": [
+ "Do",
+ "n't"
+ ],
+ "Dont": [
+ "Do",
+ "nt"
+ ],
+ "WASN'T": [
+ "WAS",
+ "N'T"
+ ],
+ "ain't": [
+ "ai",
+ "n't"
+ ],
+ "aint": [
+ "ai",
+ "nt"
+ ],
+ "aren't": [
+ "are",
+ "n't"
+ ],
+ "arent": [
+ "are",
+ "nt"
+ ],
+ "can't": [
+ "ca",
+ "n't"
+ ],
+ "cannot": [
+ "can",
+ "not"
+ ],
+ "cant": [
+ "ca",
+ "nt"
+ ],
+ "can\u2019t": [
+ "ca",
+ "n\u2019t"
+ ],
+ "couldn't": [
+ "could",
+ "n't"
+ ],
+ "couldnt": [
+ "could",
+ "nt"
+ ],
+ "didn't": [
+ "did",
+ "n't"
+ ],
+ "didnt": [
+ "did",
+ "nt"
+ ],
+ "didn\u2019t": [
+ "did",
+ "n\u2019t"
+ ],
+ "doesn't": [
+ "does",
+ "n't"
+ ],
+ "doesnt": [
+ "does",
+ "nt"
+ ],
+ "don't": [
+ "do",
+ "n't"
+ ],
+ "dont": [
+ "do",
+ "nt"
+ ],
+ "don\u2019t": [
+ "do",
+ "n\u2019t"
+ ],
+ "haven't": [
+ "have",
+ "n't"
+ ],
+ "havn't": [
+ "hav",
+ "n't"
+ ],
+ "isn't": [
+ "is",
+ "n't"
+ ],
+ "wasent": [
+ "wase",
+ "nt"
+ ],
+ "wasn't": [
+ "was",
+ "n't"
+ ],
+ "won't": [
+ "wo",
+ "n't"
+ ],
+ "wont": [
+ "wo",
+ "nt"
+ ],
+ "won\u2019t": [
+ "wo",
+ "n\u2019t"
+ ],
+ "wouldn't": [
+ "would",
+ "n't"
+ ],
+ "wouldnt": [
+ "would",
+ "nt"
+ ]
+ },
+ "AUX+PART+VERB": {
+ "dunno": [
+ "du",
+ "n",
+ "no"
+ ]
+ },
+ "NOUN+ADP": {
+ "sorta": [
+ "sort",
+ "a"
+ ]
+ },
+ "NOUN+PART": {
+ "DAUGHTERS": [
+ "DAUGHTER",
+ "S"
+ ],
+ "Kids": [
+ "Kid",
+ "s"
+ ],
+ "Smokers": [
+ "Smoker",
+ "s"
+ ],
+ "Travelers": [
+ "Traveler",
+ "s"
+ ],
+ "animals": [
+ "animal",
+ "s"
+ ],
+ "bakers": [
+ "baker",
+ "s"
+ ],
+ "beginners": [
+ "beginner",
+ "s"
+ ],
+ "bettas": [
+ "betta",
+ "s"
+ ],
+ "boys": [
+ "boy",
+ "s"
+ ],
+ "friends": [
+ "friend",
+ "s"
+ ],
+ "grandmas": [
+ "grandma",
+ "s"
+ ],
+ "humans": [
+ "human",
+ "s"
+ ],
+ "manufacturers": [
+ "manufacturer",
+ "s"
+ ],
+ "owners": [
+ "owner",
+ "s"
+ ],
+ "scammers": [
+ "scammer",
+ "s"
+ ],
+ "teams": [
+ "team",
+ "s"
+ ],
+ "visitors": [
+ "visitor",
+ "s"
+ ],
+ "workers": [
+ "worker",
+ "s"
+ ]
+ },
+ "PRON+AUX": {
+ "ITS": [
+ "IT",
+ "S"
+ ],
+ "Im": [
+ "I",
+ "m"
+ ],
+ "Your": [
+ "You",
+ "r"
+ ],
+ "id": [
+ "i",
+ "d"
+ ],
+ "im": [
+ "i",
+ "m"
+ ],
+ "its": [
+ "it",
+ "s"
+ ],
+ "their": [
+ "thei",
+ "r"
+ ],
+ "there": [
+ "the",
+ "re"
+ ],
+ "ur": [
+ "u",
+ "r"
+ ],
+ "your": [
+ "you",
+ "r"
+ ],
+ "youre": [
+ "you",
+ "re"
+ ]
+ },
+ "PRON+VERB": {
+ "im": [
+ "i",
+ "m"
+ ],
+ "its": [
+ "it",
+ "s"
+ ]
+ },
+ "PROPN+PART": {
+ "Friscos": [
+ "Frisco",
+ "s"
+ ],
+ "Mortons": [
+ "Morton",
+ "s"
+ ]
+ },
+ "VERB+ADV": {
+ "c'mon": [
+ "c'm",
+ "on"
+ ]
+ },
+ "VERB+PART": {
+ "Gotta": [
+ "Got",
+ "ta"
+ ],
+ "aren't": [
+ "are",
+ "n't"
+ ],
+ "doesn't": [
+ "does",
+ "n't"
+ ],
+ "doesnt": [
+ "does",
+ "nt"
+ ],
+ "don't": [
+ "do",
+ "n't"
+ ],
+ "gonna": [
+ "gon",
+ "na"
+ ],
+ "gotta": [
+ "got",
+ "ta"
+ ],
+ "isn't": [
+ "is",
+ "n't"
+ ],
+ "wana": [
+ "wan",
+ "a"
+ ],
+ "wasn't": [
+ "was",
+ "n't"
+ ]
+ }
+ }
+ },
+ "tokenizer_class": "XLMRobertaTokenizerFast",
+ "torch_dtype": "float32",
+ "transformers_version": "4.11.3",
+ "type_vocab_size": 1,
+ "use_cache": true,
+ "vocab_size": 250002
+ }
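The `task_specific_params.upos_multiword` block above maps a word predicted with a combined tag to its parts, e.g. `can't` under `AUX+PART` splits into `ca` + `n't`. The helper below is editor-added and hypothetical (the function name and its use are illustrative, not part of the released code); it just shows how that table could be consumed.

```py
def split_multiword(word, upos, config):
    """Split a word tagged with a combined UPOS tag (e.g. 'AUX+PART') into
    (part, tag) pairs using config.task_specific_params['upos_multiword']."""
    table = (config.task_specific_params or {}).get("upos_multiword", {})
    parts = table.get(upos, {}).get(word)
    if parts is None:
        return [(word, upos)]                 # no known split; keep the word whole
    return list(zip(parts, upos.split("+")))  # e.g. [("ca", "AUX"), ("n't", "PART")]

# Example, assuming `model` from the README snippet above:
# split_multiword("can't", "AUX+PART", model.config)
# -> [("ca", "AUX"), ("n't", "PART")]
```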
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7cdbee48cfb38bf2dca5243e3abd1b1b577ca7674cbee2884d750d404b8f001
+ size 1110136847
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
supar.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2ccc43d9cc310919e3e002f5c8cca9455e35699fe774a85042e916b34484e3
+ size 1168116583
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "tokenizer_class": "XLMRobertaTokenizerFast"}