KoichiYasuoka
commited on
Commit
•
f156762
1
Parent(s):
cd44edc
model improved for transformers 4.42
Browse files- config.json +2 -12
- maker.sh +1 -37
- pytorch_model-00001-of-00006.bin +1 -1
- pytorch_model-00002-of-00006.bin +1 -1
- pytorch_model-00003-of-00006.bin +1 -1
- pytorch_model-00004-of-00006.bin +1 -1
- pytorch_model-00005-of-00006.bin +1 -1
- pytorch_model-00006-of-00006.bin +1 -1
- pytorch_model.bin.index.json +3 -3
- tokenizer.json +575 -569
- tokenizer_config.json +2 -0
- upos.py +1 -40
config.json
CHANGED
@@ -4,22 +4,11 @@
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
7 |
-
"auto_map": {
|
8 |
-
"AutoModelForTokenClassification": "upos.LlamaForTokenClassification"
|
9 |
-
},
|
10 |
"bos_token_id": 1,
|
11 |
"custom_pipelines": {
|
12 |
"upos": {
|
13 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
14 |
"pt": "AutoModelForTokenClassification"
|
15 |
-
},
|
16 |
-
"token-classification": {
|
17 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
18 |
-
"pt": "AutoModelForTokenClassification"
|
19 |
-
},
|
20 |
-
"ner": {
|
21 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
22 |
-
"pt": "AutoModelForTokenClassification"
|
23 |
}
|
24 |
},
|
25 |
"eos_token_id": 2,
|
@@ -153,6 +142,7 @@
|
|
153 |
},
|
154 |
"max_position_embeddings": 4096,
|
155 |
"max_sequence_length": 4096,
|
|
|
156 |
"model_type": "llama",
|
157 |
"num_attention_heads": 32,
|
158 |
"num_hidden_layers": 32,
|
@@ -165,7 +155,7 @@
|
|
165 |
"tie_word_embeddings": false,
|
166 |
"tokenizer_class": "LlamaTokenizerFast",
|
167 |
"torch_dtype": "float32",
|
168 |
-
"transformers_version": "4.
|
169 |
"use_cache": true,
|
170 |
"vocab_size": 43744
|
171 |
}
|
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
|
|
|
|
|
|
7 |
"bos_token_id": 1,
|
8 |
"custom_pipelines": {
|
9 |
"upos": {
|
10 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
11 |
"pt": "AutoModelForTokenClassification"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
}
|
13 |
},
|
14 |
"eos_token_id": 2,
|
|
|
142 |
},
|
143 |
"max_position_embeddings": 4096,
|
144 |
"max_sequence_length": 4096,
|
145 |
+
"mlp_bias": false,
|
146 |
"model_type": "llama",
|
147 |
"num_attention_heads": 32,
|
148 |
"num_hidden_layers": 32,
|
|
|
155 |
"tie_word_embeddings": false,
|
156 |
"tokenizer_class": "LlamaTokenizerFast",
|
157 |
"torch_dtype": "float32",
|
158 |
+
"transformers_version": "4.42.4",
|
159 |
"use_cache": true,
|
160 |
"vocab_size": 43744
|
161 |
}
|
maker.sh
CHANGED
@@ -48,45 +48,9 @@ cat << 'EOF' > $TMPB
|
|
48 |
#! /usr/bin/env deepspeed
|
49 |
src="exSwallow-7b-plus-hf"
|
50 |
tgt="KoichiYasuoka/Swallow-7b-plus-upos"
|
51 |
-
from transformers import LlamaTokenizerFast,
|
52 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
53 |
from tokenizers.normalizers import Replace
|
54 |
|
55 |
-
class LlamaForTokenClassification(LlamaPreTrainedModel):
|
56 |
-
def __init__(self,config):
|
57 |
-
from torch import nn
|
58 |
-
super().__init__(config)
|
59 |
-
self.num_labels=config.num_labels
|
60 |
-
self.model=LlamaModel(config)
|
61 |
-
if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
|
62 |
-
classifier_dropout=config.classifier_dropout
|
63 |
-
elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
|
64 |
-
classifier_dropout=config.hidden_dropout
|
65 |
-
else:
|
66 |
-
classifier_dropout=0.1
|
67 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
68 |
-
self.classifier=nn.Linear(config.hidden_size,config.num_labels)
|
69 |
-
self.post_init()
|
70 |
-
def get_input_embeddings(self):
|
71 |
-
return self.model.embed_tokens
|
72 |
-
def set_input_embeddings(self,value):
|
73 |
-
self.model.embed_tokens=value
|
74 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
75 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
76 |
-
transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
77 |
-
hidden_states=transformer_outputs[0]
|
78 |
-
hidden_states=self.dropout(hidden_states)
|
79 |
-
logits=self.classifier(hidden_states)
|
80 |
-
loss=None
|
81 |
-
if labels is not None:
|
82 |
-
from torch import nn
|
83 |
-
loss_fct=nn.CrossEntropyLoss()
|
84 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
85 |
-
if not return_dict:
|
86 |
-
output=(logits,)+transformer_outputs[2:]
|
87 |
-
return ((loss,)+output) if loss is not None else output
|
88 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
|
89 |
-
|
90 |
class UPOSFileDataset(object):
|
91 |
def __init__(self,conllu,tokenizer):
|
92 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
|
|
48 |
#! /usr/bin/env deepspeed
|
49 |
src="exSwallow-7b-plus-hf"
|
50 |
tgt="KoichiYasuoka/Swallow-7b-plus-upos"
|
51 |
+
from transformers import LlamaTokenizerFast,LlamaForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
|
|
52 |
from tokenizers.normalizers import Replace
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
class UPOSFileDataset(object):
|
55 |
def __init__(self,conllu,tokenizer):
|
56 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
pytorch_model-00001-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4965712452
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:15b7ecb40dc994c01d584b8f6061aafb937adb91d3ea88d190874393ef152b77
|
3 |
size 4965712452
|
pytorch_model-00002-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4924328556
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ee7cc71f1d19b1403f8f7fdefee9c2fdc7db7fb2d793fb1676bd9440335cc78
|
3 |
size 4924328556
|
pytorch_model-00003-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4857219294
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:504a5638d62594b8c0025a56083668458291dcd1137455a403efa35093b7c15c
|
3 |
size 4857219294
|
pytorch_model-00004-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4857219294
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1e030bf6ba8296546621ba35acdc1486c0695b500141aa20f242bbc514002a2
|
3 |
size 4857219294
|
pytorch_model-00005-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4857219294
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c068f31f763a9235aeef0f60407db1fb5ffde971ede030ccf9787eb7490bb8d8
|
3 |
size 4857219294
|
pytorch_model-00006-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2161173694
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fb6cbefff2a23a4edc4414e8b48e6d7d336f9cb4e3338fe21b55273bd827dff
|
3 |
size 2161173694
|
pytorch_model.bin.index.json
CHANGED
@@ -3,8 +3,6 @@
|
|
3 |
"total_size": 26622771440
|
4 |
},
|
5 |
"weight_map": {
|
6 |
-
"classifier.bias": "pytorch_model-00006-of-00006.bin",
|
7 |
-
"classifier.weight": "pytorch_model-00006-of-00006.bin",
|
8 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
|
9 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
|
10 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
|
@@ -294,6 +292,8 @@
|
|
294 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
|
295 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
|
296 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
|
297 |
-
"model.norm.weight": "pytorch_model-00006-of-00006.bin"
|
|
|
|
|
298 |
}
|
299 |
}
|
|
|
3 |
"total_size": 26622771440
|
4 |
},
|
5 |
"weight_map": {
|
|
|
|
|
6 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
|
7 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
|
8 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
|
|
|
292 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
|
293 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
|
294 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
|
295 |
+
"model.norm.weight": "pytorch_model-00006-of-00006.bin",
|
296 |
+
"score.bias": "pytorch_model-00006-of-00006.bin",
|
297 |
+
"score.weight": "pytorch_model-00006-of-00006.bin"
|
298 |
}
|
299 |
}
|
tokenizer.json
CHANGED
@@ -38,7 +38,12 @@
|
|
38 |
},
|
39 |
"content": "▁"
|
40 |
},
|
41 |
-
"pre_tokenizer":
|
|
|
|
|
|
|
|
|
|
|
42 |
"post_processor": {
|
43 |
"type": "TemplateProcessing",
|
44 |
"single": [
|
@@ -125,6 +130,7 @@
|
|
125 |
"end_of_word_suffix": null,
|
126 |
"fuse_unk": true,
|
127 |
"byte_fallback": false,
|
|
|
128 |
"vocab": {
|
129 |
"<unk>": 0,
|
130 |
"<s>": 1,
|
@@ -43302,574 +43308,574 @@
|
|
43302 |
"趙": 43173,
|
43303 |
"弛": 43174,
|
43304 |
"徊": 43175,
|
43305 |
-
"
|
43306 |
-
"
|
43307 |
-
"
|
43308 |
-
"
|
43309 |
-
"
|
43310 |
-
"
|
43311 |
-
"
|
43312 |
-
"
|
43313 |
-
"
|
43314 |
-
"
|
43315 |
-
"
|
43316 |
-
"
|
43317 |
-
"
|
43318 |
-
"
|
43319 |
-
"
|
43320 |
-
"
|
43321 |
-
"
|
43322 |
-
"
|
43323 |
-
"
|
43324 |
-
"
|
43325 |
-
"
|
43326 |
-
"
|
43327 |
-
"
|
43328 |
-
"
|
43329 |
-
"
|
43330 |
-
"
|
43331 |
-
"
|
43332 |
-
"
|
43333 |
-
"
|
43334 |
-
"
|
43335 |
-
"
|
43336 |
-
"
|
43337 |
-
"
|
43338 |
-
"
|
43339 |
-
"
|
43340 |
-
"
|
43341 |
-
"
|
43342 |
-
"
|
43343 |
-
"
|
43344 |
-
"
|
43345 |
-
"
|
43346 |
-
"
|
43347 |
-
"
|
43348 |
-
"
|
43349 |
-
"
|
43350 |
-
"
|
43351 |
-
"
|
43352 |
-
"
|
43353 |
-
"
|
43354 |
-
"
|
43355 |
-
"
|
43356 |
-
"
|
43357 |
-
"
|
43358 |
-
"
|
43359 |
-
"
|
43360 |
-
"
|
43361 |
-
"
|
43362 |
-
"
|
43363 |
-
"
|
43364 |
-
"
|
43365 |
-
"
|
43366 |
-
"
|
43367 |
-
"
|
43368 |
-
"
|
43369 |
-
"
|
43370 |
-
"
|
43371 |
-
"
|
43372 |
-
"
|
43373 |
-
"
|
43374 |
-
"
|
43375 |
-
"
|
43376 |
-
"
|
43377 |
-
"
|
43378 |
-
"
|
43379 |
-
"
|
43380 |
-
"
|
43381 |
-
"
|
43382 |
-
"
|
43383 |
-
"
|
43384 |
-
"
|
43385 |
-
"
|
43386 |
-
"
|
43387 |
-
"
|
43388 |
-
"
|
43389 |
-
"
|
43390 |
-
"
|
43391 |
-
"
|
43392 |
-
"
|
43393 |
-
"
|
43394 |
-
"
|
43395 |
-
"
|
43396 |
-
"
|
43397 |
-
"
|
43398 |
-
"
|
43399 |
-
"
|
43400 |
-
"
|
43401 |
-
"
|
43402 |
-
"
|
43403 |
-
"
|
43404 |
-
"
|
43405 |
-
"
|
43406 |
-
"
|
43407 |
-
"
|
43408 |
-
"
|
43409 |
-
"
|
43410 |
-
"
|
43411 |
-
"
|
43412 |
-
"
|
43413 |
-
"
|
43414 |
-
"
|
43415 |
-
"
|
43416 |
-
"
|
43417 |
-
"
|
43418 |
-
"
|
43419 |
-
"
|
43420 |
-
"
|
43421 |
-
"
|
43422 |
-
"
|
43423 |
-
"
|
43424 |
-
"
|
43425 |
-
"
|
43426 |
-
"
|
43427 |
-
"
|
43428 |
-
"
|
43429 |
-
"
|
43430 |
-
"
|
43431 |
-
"
|
43432 |
-
"
|
43433 |
-
"
|
43434 |
-
"
|
43435 |
-
"
|
43436 |
-
"
|
43437 |
-
"
|
43438 |
-
"
|
43439 |
-
"
|
43440 |
-
"
|
43441 |
-
"
|
43442 |
-
"
|
43443 |
-
"
|
43444 |
-
"
|
43445 |
-
"
|
43446 |
-
"
|
43447 |
-
"
|
43448 |
-
"
|
43449 |
-
"
|
43450 |
-
"
|
43451 |
-
"
|
43452 |
-
"
|
43453 |
-
"
|
43454 |
-
"
|
43455 |
-
"
|
43456 |
-
"
|
43457 |
-
"
|
43458 |
-
"
|
43459 |
-
"
|
43460 |
-
"
|
43461 |
-
"
|
43462 |
-
"
|
43463 |
-
"
|
43464 |
-
"
|
43465 |
-
"
|
43466 |
-
"
|
43467 |
-
"
|
43468 |
-
"
|
43469 |
-
"
|
43470 |
-
"
|
43471 |
-
"
|
43472 |
-
"
|
43473 |
-
"
|
43474 |
-
"
|
43475 |
-
"
|
43476 |
-
"
|
43477 |
-
"
|
43478 |
-
"
|
43479 |
-
"
|
43480 |
-
"
|
43481 |
-
"
|
43482 |
-
"
|
43483 |
-
"
|
43484 |
-
"
|
43485 |
-
"
|
43486 |
-
"
|
43487 |
-
"
|
43488 |
-
"
|
43489 |
-
"
|
43490 |
-
"
|
43491 |
-
"
|
43492 |
-
"
|
43493 |
-
"
|
43494 |
-
"
|
43495 |
-
"
|
43496 |
-
"
|
43497 |
-
"
|
43498 |
-
"
|
43499 |
-
"
|
43500 |
-
"
|
43501 |
-
"
|
43502 |
-
"
|
43503 |
-
"
|
43504 |
-
"
|
43505 |
-
"
|
43506 |
-
"
|
43507 |
-
"
|
43508 |
-
"
|
43509 |
-
"
|
43510 |
-
"
|
43511 |
-
"
|
43512 |
-
"
|
43513 |
-
"
|
43514 |
-
"
|
43515 |
-
"
|
43516 |
-
"
|
43517 |
-
"
|
43518 |
-
"
|
43519 |
-
"
|
43520 |
-
"
|
43521 |
-
"
|
43522 |
-
"
|
43523 |
-
"
|
43524 |
-
"
|
43525 |
-
"
|
43526 |
-
"
|
43527 |
-
"
|
43528 |
-
"
|
43529 |
-
"
|
43530 |
-
"
|
43531 |
-
"
|
43532 |
-
"
|
43533 |
-
"
|
43534 |
-
"
|
43535 |
-
"
|
43536 |
-
"
|
43537 |
-
"
|
43538 |
-
"
|
43539 |
-
"
|
43540 |
-
"
|
43541 |
-
"
|
43542 |
-
"
|
43543 |
-
"
|
43544 |
-
"
|
43545 |
-
"
|
43546 |
-
"
|
43547 |
-
"
|
43548 |
-
"
|
43549 |
-
"
|
43550 |
-
"
|
43551 |
-
"
|
43552 |
-
"
|
43553 |
-
"
|
43554 |
-
"
|
43555 |
-
"
|
43556 |
-
"
|
43557 |
-
"
|
43558 |
-
"
|
43559 |
-
"
|
43560 |
-
"
|
43561 |
-
"
|
43562 |
-
"
|
43563 |
-
"
|
43564 |
-
"
|
43565 |
-
"
|
43566 |
-
"
|
43567 |
-
"
|
43568 |
-
"
|
43569 |
-
"
|
43570 |
-
"
|
43571 |
-
"
|
43572 |
-
"
|
43573 |
-
"
|
43574 |
-
"
|
43575 |
-
"
|
43576 |
-
"
|
43577 |
-
"
|
43578 |
-
"
|
43579 |
-
"
|
43580 |
-
"
|
43581 |
-
"
|
43582 |
-
"
|
43583 |
-
"
|
43584 |
-
"
|
43585 |
-
"
|
43586 |
-
"
|
43587 |
-
"
|
43588 |
-
"
|
43589 |
-
"
|
43590 |
-
"
|
43591 |
-
"
|
43592 |
-
"
|
43593 |
-
"
|
43594 |
"鎭": 43465,
|
43595 |
-
"
|
43596 |
-
"
|
43597 |
-
"
|
43598 |
-
"
|
43599 |
-
"
|
43600 |
-
"
|
43601 |
-
"
|
43602 |
-
"
|
43603 |
-
"
|
43604 |
-
"
|
43605 |
-
"
|
43606 |
-
"
|
43607 |
-
"
|
43608 |
-
"
|
43609 |
-
"
|
43610 |
-
"
|
43611 |
-
"
|
43612 |
-
"
|
43613 |
-
"
|
43614 |
-
"
|
43615 |
-
"
|
43616 |
-
"
|
43617 |
-
"
|
43618 |
-
"
|
43619 |
-
"
|
43620 |
-
"
|
43621 |
-
"
|
43622 |
-
"
|
43623 |
-
"
|
43624 |
-
"
|
43625 |
-
"
|
43626 |
-
"
|
43627 |
-
"
|
43628 |
-
"
|
43629 |
-
"
|
43630 |
-
"
|
43631 |
-
"
|
43632 |
-
"
|
43633 |
-
"
|
43634 |
-
"
|
43635 |
-
"
|
43636 |
-
"
|
43637 |
-
"
|
43638 |
-
"
|
43639 |
-
"
|
43640 |
-
"
|
43641 |
-
"
|
43642 |
-
"
|
43643 |
-
"
|
43644 |
-
"
|
43645 |
-
"
|
43646 |
-
"
|
43647 |
-
"
|
43648 |
-
"
|
43649 |
-
"
|
43650 |
-
"
|
43651 |
-
"
|
43652 |
-
"
|
43653 |
-
"
|
43654 |
-
"
|
43655 |
-
"
|
43656 |
-
"
|
43657 |
-
"
|
43658 |
-
"
|
43659 |
-
"
|
43660 |
-
"
|
43661 |
-
"
|
43662 |
-
"
|
43663 |
-
"
|
43664 |
-
"
|
43665 |
-
"
|
43666 |
-
"
|
43667 |
-
"
|
43668 |
-
"
|
43669 |
-
"
|
43670 |
-
"
|
43671 |
-
"
|
43672 |
-
"
|
43673 |
-
"
|
43674 |
-
"
|
43675 |
-
"
|
43676 |
-
"
|
43677 |
-
"
|
43678 |
-
"
|
43679 |
-
"
|
43680 |
-
"
|
43681 |
-
"
|
43682 |
-
"
|
43683 |
-
"
|
43684 |
-
"
|
43685 |
-
"
|
43686 |
-
"
|
43687 |
-
"
|
43688 |
-
"
|
43689 |
-
"
|
43690 |
-
"
|
43691 |
-
"
|
43692 |
-
"
|
43693 |
-
"
|
43694 |
-
"
|
43695 |
-
"
|
43696 |
-
"
|
43697 |
-
"
|
43698 |
-
"
|
43699 |
-
"
|
43700 |
-
"
|
43701 |
-
"
|
43702 |
-
"
|
43703 |
-
"
|
43704 |
-
"
|
43705 |
-
"
|
43706 |
-
"
|
43707 |
-
"
|
43708 |
-
"
|
43709 |
-
"
|
43710 |
-
"
|
43711 |
-
"
|
43712 |
-
"
|
43713 |
-
"
|
43714 |
-
"
|
43715 |
-
"
|
43716 |
-
"
|
43717 |
-
"
|
43718 |
-
"
|
43719 |
-
"
|
43720 |
-
"
|
43721 |
-
"
|
43722 |
-
"
|
43723 |
-
"
|
43724 |
-
"
|
43725 |
-
"
|
43726 |
-
"
|
43727 |
-
"
|
43728 |
-
"
|
43729 |
-
"
|
43730 |
-
"
|
43731 |
-
"
|
43732 |
-
"
|
43733 |
-
"
|
43734 |
-
"
|
43735 |
-
"
|
43736 |
-
"
|
43737 |
-
"
|
43738 |
-
"
|
43739 |
-
"
|
43740 |
-
"
|
43741 |
-
"
|
43742 |
-
"
|
43743 |
-
"
|
43744 |
-
"
|
43745 |
-
"
|
43746 |
-
"
|
43747 |
-
"
|
43748 |
-
"
|
43749 |
-
"
|
43750 |
-
"
|
43751 |
-
"
|
43752 |
-
"
|
43753 |
-
"
|
43754 |
-
"
|
43755 |
-
"
|
43756 |
-
"
|
43757 |
-
"
|
43758 |
-
"
|
43759 |
-
"
|
43760 |
-
"
|
43761 |
-
"
|
43762 |
-
"
|
43763 |
-
"
|
43764 |
-
"
|
43765 |
-
"
|
43766 |
-
"
|
43767 |
-
"
|
43768 |
-
"
|
43769 |
-
"
|
43770 |
-
"
|
43771 |
-
"
|
43772 |
-
"
|
43773 |
-
"
|
43774 |
-
"
|
43775 |
-
"
|
43776 |
-
"
|
43777 |
-
"
|
43778 |
-
"
|
43779 |
-
"
|
43780 |
-
"
|
43781 |
-
"
|
43782 |
-
"
|
43783 |
-
"
|
43784 |
-
"
|
43785 |
-
"
|
43786 |
-
"
|
43787 |
-
"
|
43788 |
-
"
|
43789 |
-
"
|
43790 |
-
"
|
43791 |
-
"
|
43792 |
-
"
|
43793 |
-
"
|
43794 |
-
"
|
43795 |
-
"
|
43796 |
-
"
|
43797 |
-
"
|
43798 |
-
"
|
43799 |
-
"
|
43800 |
-
"
|
43801 |
-
"
|
43802 |
-
"
|
43803 |
-
"
|
43804 |
-
"
|
43805 |
-
"
|
43806 |
-
"
|
43807 |
-
"
|
43808 |
-
"
|
43809 |
-
"
|
43810 |
-
"
|
43811 |
-
"
|
43812 |
-
"
|
43813 |
-
"
|
43814 |
-
"
|
43815 |
-
"
|
43816 |
-
"
|
43817 |
-
"
|
43818 |
-
"
|
43819 |
-
"
|
43820 |
-
"
|
43821 |
-
"
|
43822 |
-
"
|
43823 |
-
"
|
43824 |
-
"
|
43825 |
-
"
|
43826 |
-
"
|
43827 |
-
"
|
43828 |
-
"
|
43829 |
-
"
|
43830 |
-
"
|
43831 |
-
"
|
43832 |
-
"
|
43833 |
-
"
|
43834 |
-
"
|
43835 |
-
"
|
43836 |
-
"
|
43837 |
-
"
|
43838 |
-
"
|
43839 |
-
"
|
43840 |
-
"
|
43841 |
-
"
|
43842 |
-
"
|
43843 |
-
"
|
43844 |
-
"
|
43845 |
-
"
|
43846 |
-
"
|
43847 |
-
"
|
43848 |
-
"
|
43849 |
-
"
|
43850 |
-
"
|
43851 |
-
"
|
43852 |
-
"
|
43853 |
-
"
|
43854 |
-
"
|
43855 |
-
"
|
43856 |
-
"
|
43857 |
-
"
|
43858 |
-
"
|
43859 |
-
"
|
43860 |
-
"
|
43861 |
-
"
|
43862 |
-
"
|
43863 |
-
"
|
43864 |
-
"
|
43865 |
-
"
|
43866 |
-
"
|
43867 |
-
"
|
43868 |
-
"
|
43869 |
-
"
|
43870 |
-
"
|
43871 |
-
"
|
43872 |
-
"
|
43873 |
},
|
43874 |
"merges": [
|
43875 |
"▁ t",
|
@@ -115374,4 +115380,4 @@
|
|
115374 |
"▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁"
|
115375 |
]
|
115376 |
}
|
115377 |
-
}
|
|
|
38 |
},
|
39 |
"content": "▁"
|
40 |
},
|
41 |
+
"pre_tokenizer": {
|
42 |
+
"type": "Metaspace",
|
43 |
+
"replacement": "▁",
|
44 |
+
"prepend_scheme": "first",
|
45 |
+
"split": false
|
46 |
+
},
|
47 |
"post_processor": {
|
48 |
"type": "TemplateProcessing",
|
49 |
"single": [
|
|
|
130 |
"end_of_word_suffix": null,
|
131 |
"fuse_unk": true,
|
132 |
"byte_fallback": false,
|
133 |
+
"ignore_merges": false,
|
134 |
"vocab": {
|
135 |
"<unk>": 0,
|
136 |
"<s>": 1,
|
|
|
43308 |
"趙": 43173,
|
43309 |
"弛": 43174,
|
43310 |
"徊": 43175,
|
43311 |
+
"戲": 43176,
|
43312 |
+
"舍": 43177,
|
43313 |
+
"猾": 43178,
|
43314 |
+
"專": 43179,
|
43315 |
+
"爲": 43180,
|
43316 |
+
"黠": 43181,
|
43317 |
+
"爰": 43182,
|
43318 |
+
"橙": 43183,
|
43319 |
+
"鰭": 43184,
|
43320 |
+
"榜": 43185,
|
43321 |
+
"摧": 43186,
|
43322 |
+
"飜": 43187,
|
43323 |
+
"關": 43188,
|
43324 |
+
"朧": 43189,
|
43325 |
+
"畢": 43190,
|
43326 |
+
"汝": 43191,
|
43327 |
+
"挈": 43192,
|
43328 |
+
"吻": 43193,
|
43329 |
+
"懺": 43194,
|
43330 |
+
"僻": 43195,
|
43331 |
+
"毋": 43196,
|
43332 |
+
"盡": 43197,
|
43333 |
+
"羣": 43198,
|
43334 |
+
"嫡": 43199,
|
43335 |
+
"煥": 43200,
|
43336 |
+
"鐸": 43201,
|
43337 |
+
"忿": 43202,
|
43338 |
+
"參": 43203,
|
43339 |
+
"欵": 43204,
|
43340 |
+
"聘": 43205,
|
43341 |
+
"攝": 43206,
|
43342 |
+
"纂": 43207,
|
43343 |
+
"扣": 43208,
|
43344 |
+
"險": 43209,
|
43345 |
+
"儘": 43210,
|
43346 |
+
"尙": 43211,
|
43347 |
+
"臥": 43212,
|
43348 |
+
"淇": 43213,
|
43349 |
+
"籾": 43214,
|
43350 |
+
"豫": 43215,
|
43351 |
+
"龕": 43216,
|
43352 |
+
"佩": 43217,
|
43353 |
+
"鍾": 43218,
|
43354 |
+
"歟": 43219,
|
43355 |
+
"饜": 43220,
|
43356 |
+
"權": 43221,
|
43357 |
+
"雖": 43222,
|
43358 |
+
"狡": 43223,
|
43359 |
+
"翹": 43224,
|
43360 |
+
"吏": 43225,
|
43361 |
+
"繼": 43226,
|
43362 |
+
"鄙": 43227,
|
43363 |
+
"儕": 43228,
|
43364 |
+
"袈": 43229,
|
43365 |
+
"轍": 43230,
|
43366 |
+
"袁": 43231,
|
43367 |
+
"囮": 43232,
|
43368 |
+
"謭": 43233,
|
43369 |
+
"皺": 43234,
|
43370 |
+
"梃": 43235,
|
43371 |
+
"餘": 43236,
|
43372 |
+
"彝": 43237,
|
43373 |
+
"岨": 43238,
|
43374 |
+
"舒": 43239,
|
43375 |
+
"埸": 43240,
|
43376 |
+
"碍": 43241,
|
43377 |
+
"鸞": 43242,
|
43378 |
+
"胤": 43243,
|
43379 |
+
"屡": 43244,
|
43380 |
+
"崗": 43245,
|
43381 |
+
"靜": 43246,
|
43382 |
+
"禀": 43247,
|
43383 |
+
"聯": 43248,
|
43384 |
+
"皓": 43249,
|
43385 |
+
"騷": 43250,
|
43386 |
+
"擔": 43251,
|
43387 |
+
"逼": 43252,
|
43388 |
+
"盜": 43253,
|
43389 |
+
"譯": 43254,
|
43390 |
+
"駢": 43255,
|
43391 |
+
"勅": 43256,
|
43392 |
+
"錐": 43257,
|
43393 |
+
"僞": 43258,
|
43394 |
+
"厭": 43259,
|
43395 |
+
"丞": 43260,
|
43396 |
+
"臂": 43261,
|
43397 |
+
"憮": 43262,
|
43398 |
+
"轉": 43263,
|
43399 |
+
"蜻": 43264,
|
43400 |
+
"竊": 43265,
|
43401 |
+
"攘": 43266,
|
43402 |
+
"內": 43267,
|
43403 |
+
"鄭": 43268,
|
43404 |
+
"嚴": 43269,
|
43405 |
+
"靱": 43270,
|
43406 |
+
"縣": 43271,
|
43407 |
+
"茫": 43272,
|
43408 |
+
"俟": 43273,
|
43409 |
+
"菑": 43274,
|
43410 |
+
"隋": 43275,
|
43411 |
+
"倩": 43276,
|
43412 |
+
"冰": 43277,
|
43413 |
+
"輕": 43278,
|
43414 |
+
"盍": 43279,
|
43415 |
+
"豈": 43280,
|
43416 |
+
"據": 43281,
|
43417 |
+
"實": 43282,
|
43418 |
+
"紂": 43283,
|
43419 |
+
"滊": 43284,
|
43420 |
+
"墮": 43285,
|
43421 |
+
"臘": 43286,
|
43422 |
+
"濶": 43287,
|
43423 |
+
"歸": 43288,
|
43424 |
+
"驗": 43289,
|
43425 |
+
"寔": 43290,
|
43426 |
+
"脩": 43291,
|
43427 |
+
"糀": 43292,
|
43428 |
+
"禎": 43293,
|
43429 |
+
"藥": 43294,
|
43430 |
+
"絜": 43295,
|
43431 |
+
"羞": 43296,
|
43432 |
+
"寃": 43297,
|
43433 |
+
"亨": 43298,
|
43434 |
+
"濤": 43299,
|
43435 |
+
"恤": 43300,
|
43436 |
+
"鑒": 43301,
|
43437 |
+
"朕": 43302,
|
43438 |
+
"嚼": 43303,
|
43439 |
+
"拔": 43304,
|
43440 |
+
"蓁": 43305,
|
43441 |
+
"臾": 43306,
|
43442 |
+
"滿": 43307,
|
43443 |
+
"蹈": 43308,
|
43444 |
+
"擇": 43309,
|
43445 |
+
"兒": 43310,
|
43446 |
+
"兩": 43311,
|
43447 |
+
"嬰": 43312,
|
43448 |
+
"揠": 43313,
|
43449 |
+
"脹": 43314,
|
43450 |
+
"總": 43315,
|
43451 |
+
"詔": 43316,
|
43452 |
+
"恊": 43317,
|
43453 |
+
"肚": 43318,
|
43454 |
+
"悖": 43319,
|
43455 |
+
"沮": 43320,
|
43456 |
+
"續": 43321,
|
43457 |
+
"媢": 43322,
|
43458 |
+
"駸": 43323,
|
43459 |
+
"慘": 43324,
|
43460 |
+
"賂": 43325,
|
43461 |
+
"礦": 43326,
|
43462 |
+
"哨": 43327,
|
43463 |
+
"蠻": 43328,
|
43464 |
+
"舅": 43329,
|
43465 |
+
"纉": 43330,
|
43466 |
+
"傅": 43331,
|
43467 |
+
"埠": 43332,
|
43468 |
+
"鵬": 43333,
|
43469 |
+
"儁": 43334,
|
43470 |
+
"篆": 43335,
|
43471 |
+
"勵": 43336,
|
43472 |
+
"體": 43337,
|
43473 |
+
"隱": 43338,
|
43474 |
+
"盈": 43339,
|
43475 |
+
"竭": 43340,
|
43476 |
+
"澳": 43341,
|
43477 |
+
"呵": 43342,
|
43478 |
+
"耘": 43343,
|
43479 |
+
"閻": 43344,
|
43480 |
+
"衷": 43345,
|
43481 |
+
"胥": 43346,
|
43482 |
+
"蛾": 43347,
|
43483 |
+
"諺": 43348,
|
43484 |
+
"薛": 43349,
|
43485 |
+
"戊": 43350,
|
43486 |
+
"咎": 43351,
|
43487 |
+
"剝": 43352,
|
43488 |
+
"處": 43353,
|
43489 |
+
"孛": 43354,
|
43490 |
+
"逈": 43355,
|
43491 |
+
"游": 43356,
|
43492 |
+
"尤": 43357,
|
43493 |
+
"遑": 43358,
|
43494 |
+
"雜": 43359,
|
43495 |
+
"嘲": 43360,
|
43496 |
+
"憚": 43361,
|
43497 |
+
"釀": 43362,
|
43498 |
+
"葢": 43363,
|
43499 |
+
"唖": 43364,
|
43500 |
+
"傳": 43365,
|
43501 |
+
"黽": 43366,
|
43502 |
+
"騁": 43367,
|
43503 |
+
"頽": 43368,
|
43504 |
+
"殉": 43369,
|
43505 |
+
"榮": 43370,
|
43506 |
+
"頒": 43371,
|
43507 |
+
"蘊": 43372,
|
43508 |
+
"澁": 43373,
|
43509 |
+
"汀": 43374,
|
43510 |
+
"畫": 43375,
|
43511 |
+
"趨": 43376,
|
43512 |
+
"峻": 43377,
|
43513 |
+
"饒": 43378,
|
43514 |
+
"菉": 43379,
|
43515 |
+
"宍": 43380,
|
43516 |
+
"諮": 43381,
|
43517 |
+
"證": 43382,
|
43518 |
+
"玻": 43383,
|
43519 |
+
"麿": 43384,
|
43520 |
+
"嚮": 43385,
|
43521 |
+
"窘": 43386,
|
43522 |
+
"碇": 43387,
|
43523 |
+
"鬢": 43388,
|
43524 |
+
"緡": 43389,
|
43525 |
+
"蘐": 43390,
|
43526 |
+
"贖": 43391,
|
43527 |
+
"誣": 43392,
|
43528 |
+
"獨": 43393,
|
43529 |
+
"醉": 43394,
|
43530 |
+
"單": 43395,
|
43531 |
+
"丙": 43396,
|
43532 |
+
"戌": 43397,
|
43533 |
+
"覽": 43398,
|
43534 |
+
"燮": 43399,
|
43535 |
+
"撿": 43400,
|
43536 |
+
"允": 43401,
|
43537 |
+
"逬": 43402,
|
43538 |
+
"痍": 43403,
|
43539 |
+
"恣": 43404,
|
43540 |
+
"厘": 43405,
|
43541 |
+
"緝": 43406,
|
43542 |
+
"稱": 43407,
|
43543 |
+
"斤": 43408,
|
43544 |
+
"羸": 43409,
|
43545 |
+
"酉": 43410,
|
43546 |
+
"惠": 43411,
|
43547 |
+
"蹄": 43412,
|
43548 |
+
"胖": 43413,
|
43549 |
+
"錮": 43414,
|
43550 |
+
"璽": 43415,
|
43551 |
+
"愈": 43416,
|
43552 |
+
"斷": 43417,
|
43553 |
+
"戮": 43418,
|
43554 |
+
"竝": 43419,
|
43555 |
+
"彥": 43420,
|
43556 |
+
"棍": 43421,
|
43557 |
+
"菰": 43422,
|
43558 |
+
"亥": 43423,
|
43559 |
+
"懥": 43424,
|
43560 |
+
"攬": 43425,
|
43561 |
+
"殲": 43426,
|
43562 |
+
"殘": 43427,
|
43563 |
+
"駑": 43428,
|
43564 |
+
"俄": 43429,
|
43565 |
+
"恕": 43430,
|
43566 |
+
"茲": 43431,
|
43567 |
+
"悌": 43432,
|
43568 |
+
"蜀": 43433,
|
43569 |
+
"舷": 43434,
|
43570 |
+
"筭": 43435,
|
43571 |
+
"揖": 43436,
|
43572 |
+
"姦": 43437,
|
43573 |
+
"堡": 43438,
|
43574 |
+
"聲": 43439,
|
43575 |
+
"歎": 43440,
|
43576 |
+
"廢": 43441,
|
43577 |
+
"肆": 43442,
|
43578 |
+
"數": 43443,
|
43579 |
+
"滌": 43444,
|
43580 |
+
"鳶": 43445,
|
43581 |
+
"沃": 43446,
|
43582 |
+
"隨": 43447,
|
43583 |
+
"决": 43448,
|
43584 |
+
"屬": 43449,
|
43585 |
+
"蛉": 43450,
|
43586 |
+
"靡": 43451,
|
43587 |
+
"桀": 43452,
|
43588 |
+
"愧": 43453,
|
43589 |
+
"謨": 43454,
|
43590 |
+
"儼": 43455,
|
43591 |
+
"瑟": 43456,
|
43592 |
+
"勞": 43457,
|
43593 |
+
"虧": 43458,
|
43594 |
+
"匈": 43459,
|
43595 |
+
"歔": 43460,
|
43596 |
+
"恙": 43461,
|
43597 |
+
"啜": 43462,
|
43598 |
+
"匍": 43463,
|
43599 |
+
"逓": 43464,
|
43600 |
"鎭": 43465,
|
43601 |
+
"漸": 43466,
|
43602 |
+
"岌": 43467,
|
43603 |
+
"懼": 43468,
|
43604 |
+
"覺": 43469,
|
43605 |
+
"迭": 43470,
|
43606 |
+
"穆": 43471,
|
43607 |
+
"逕": 43472,
|
43608 |
+
"繭": 43473,
|
43609 |
+
"蓉": 43474,
|
43610 |
+
"詭": 43475,
|
43611 |
+
"酋": 43476,
|
43612 |
+
"儡": 43477,
|
43613 |
+
"將": 43478,
|
43614 |
+
"隧": 43479,
|
43615 |
+
"爭": 43480,
|
43616 |
+
"辟": 43481,
|
43617 |
+
"痘": 43482,
|
43618 |
+
"宋": 43483,
|
43619 |
+
"揆": 43484,
|
43620 |
+
"誥": 43485,
|
43621 |
+
"槙": 43486,
|
43622 |
+
"濟": 43487,
|
43623 |
+
"辭": 43488,
|
43624 |
+
"埴": 43489,
|
43625 |
+
"耆": 43490,
|
43626 |
+
"顯": 43491,
|
43627 |
+
"孟": 43492,
|
43628 |
+
"諧": 43493,
|
43629 |
+
"搖": 43494,
|
43630 |
+
"恂": 43495,
|
43631 |
+
"爨": 43496,
|
43632 |
+
"弐": 43497,
|
43633 |
+
"壤": 43498,
|
43634 |
+
"諷": 43499,
|
43635 |
+
"臀": 43500,
|
43636 |
+
"遲": 43501,
|
43637 |
+
"陷": 43502,
|
43638 |
+
"號": 43503,
|
43639 |
+
"辨": 43504,
|
43640 |
+
"聽": 43505,
|
43641 |
+
"堯": 43506,
|
43642 |
+
"亂": 43507,
|
43643 |
+
"烟": 43508,
|
43644 |
+
"碩": 43509,
|
43645 |
+
"遁": 43510,
|
43646 |
+
"瞻": 43511,
|
43647 |
+
"瀑": 43512,
|
43648 |
+
"倹": 43513,
|
43649 |
+
"讀": 43514,
|
43650 |
+
"缺": 43515,
|
43651 |
+
"竄": 43516,
|
43652 |
+
"矩": 43517,
|
43653 |
+
"庸": 43518,
|
43654 |
+
"赫": 43519,
|
43655 |
+
"扈": 43520,
|
43656 |
+
"瑣": 43521,
|
43657 |
+
"毆": 43522,
|
43658 |
+
"柘": 43523,
|
43659 |
+
"猗": 43524,
|
43660 |
+
"駭": 43525,
|
43661 |
+
"糾": 43526,
|
43662 |
+
"裔": 43527,
|
43663 |
+
"圓": 43528,
|
43664 |
+
"尸": 43529,
|
43665 |
+
"楔": 43530,
|
43666 |
+
"賤": 43531,
|
43667 |
+
"頴": 43532,
|
43668 |
+
"禊": 43533,
|
43669 |
+
"墾": 43534,
|
43670 |
+
"惟": 43535,
|
43671 |
+
"籌": 43536,
|
43672 |
+
"珊": 43537,
|
43673 |
+
"壓": 43538,
|
43674 |
+
"寶": 43539,
|
43675 |
+
"從": 43540,
|
43676 |
+
"諛": 43541,
|
43677 |
+
"憊": 43542,
|
43678 |
+
"咤": 43543,
|
43679 |
+
"窩": 43544,
|
43680 |
+
"忒": 43545,
|
43681 |
+
"咀": 43546,
|
43682 |
+
"貶": 43547,
|
43683 |
+
"魯": 43548,
|
43684 |
+
"舜": 43549,
|
43685 |
+
"愼": 43550,
|
43686 |
+
"啻": 43551,
|
43687 |
+
"綬": 43552,
|
43688 |
+
"灼": 43553,
|
43689 |
+
"芒": 43554,
|
43690 |
+
"撹": 43555,
|
43691 |
+
"楷": 43556,
|
43692 |
+
"毫": 43557,
|
43693 |
+
"苅": 43558,
|
43694 |
+
"鉤": 43559,
|
43695 |
+
"楯": 43560,
|
43696 |
+
"馭": 43561,
|
43697 |
+
"綮": 43562,
|
43698 |
+
"戾": 43563,
|
43699 |
+
"勸": 43564,
|
43700 |
+
"渺": 43565,
|
43701 |
+
"矜": 43566,
|
43702 |
+
"敎": 43567,
|
43703 |
+
"擧": 43568,
|
43704 |
+
"欷": 43569,
|
43705 |
+
"賈": 43570,
|
43706 |
+
"筵": 43571,
|
43707 |
+
"廟": 43572,
|
43708 |
+
"豕": 43573,
|
43709 |
+
"彗": 43574,
|
43710 |
+
"諟": 43575,
|
43711 |
+
"樣": 43576,
|
43712 |
+
"貽": 43577,
|
43713 |
+
"蛯": 43578,
|
43714 |
+
"譬": 43579,
|
43715 |
+
"鵠": 43580,
|
43716 |
+
"皈": 43581,
|
43717 |
+
"跋": 43582,
|
43718 |
+
"疆": 43583,
|
43719 |
+
"皷": 43584,
|
43720 |
+
"恰": 43585,
|
43721 |
+
"襄": 43586,
|
43722 |
+
"亞": 43587,
|
43723 |
+
"洽": 43588,
|
43724 |
+
"躓": 43589,
|
43725 |
+
"團": 43590,
|
43726 |
+
"膠": 43591,
|
43727 |
+
"麾": 43592,
|
43728 |
+
"斥": 43593,
|
43729 |
+
"艱": 43594,
|
43730 |
+
"敖": 43595,
|
43731 |
+
"篩": 43596,
|
43732 |
+
"糢": 43597,
|
43733 |
+
"熙": 43598,
|
43734 |
+
"拷": 43599,
|
43735 |
+
"𠮟": 43600,
|
43736 |
+
"當": 43601,
|
43737 |
+
"圖": 43602,
|
43738 |
+
"浹": 43603,
|
43739 |
+
"辜": 43604,
|
43740 |
+
"熾": 43605,
|
43741 |
+
"覊": 43606,
|
43742 |
+
"蕃": 43607,
|
43743 |
+
"僩": 43608,
|
43744 |
+
"驅": 43609,
|
43745 |
+
"宥": 43610,
|
43746 |
+
"簪": 43611,
|
43747 |
+
"變": 43612,
|
43748 |
+
"乘": 43613,
|
43749 |
+
"諠": 43614,
|
43750 |
+
"蔗": 43615,
|
43751 |
+
"僥": 43616,
|
43752 |
+
"狸": 43617,
|
43753 |
+
"巖": 43618,
|
43754 |
+
"甍": 43619,
|
43755 |
+
"醨": 43620,
|
43756 |
+
"巍": 43621,
|
43757 |
+
"翰": 43622,
|
43758 |
+
"闢": 43623,
|
43759 |
+
"擕": 43624,
|
43760 |
+
"抔": 43625,
|
43761 |
+
"裴": 43626,
|
43762 |
+
"頗": 43627,
|
43763 |
+
"禮": 43628,
|
43764 |
+
"雞": 43629,
|
43765 |
+
"雉": 43630,
|
43766 |
+
"毘": 43631,
|
43767 |
+
"僇": 43632,
|
43768 |
+
"陋": 43633,
|
43769 |
+
"儒": 43634,
|
43770 |
+
"讓": 43635,
|
43771 |
+
"闍": 43636,
|
43772 |
+
"虞": 43637,
|
43773 |
+
"伍": 43638,
|
43774 |
+
"沒": 43639,
|
43775 |
+
"婿": 43640,
|
43776 |
+
"聊": 43641,
|
43777 |
+
"劾": 43642,
|
43778 |
+
"檣": 43643,
|
43779 |
+
"嗚": 43644,
|
43780 |
+
"盖": 43645,
|
43781 |
+
"淫": 43646,
|
43782 |
+
"勒": 43647,
|
43783 |
+
"竢": 43648,
|
43784 |
+
"淺": 43649,
|
43785 |
+
"聚": 43650,
|
43786 |
+
"僨": 43651,
|
43787 |
+
"欣": 43652,
|
43788 |
+
"闡": 43653,
|
43789 |
+
"亦": 43654,
|
43790 |
+
"壯": 43655,
|
43791 |
+
"奧": 43656,
|
43792 |
+
"傀": 43657,
|
43793 |
+
"耻": 43658,
|
43794 |
+
"碎": 43659,
|
43795 |
+
"寵": 43660,
|
43796 |
+
"觀": 43661,
|
43797 |
+
"搆": 43662,
|
43798 |
+
"甞": 43663,
|
43799 |
+
"驕": 43664,
|
43800 |
+
"陪": 43665,
|
43801 |
+
"慄": 43666,
|
43802 |
+
"巌": 43667,
|
43803 |
+
"與": 43668,
|
43804 |
+
"尹": 43669,
|
43805 |
+
"揜": 43670,
|
43806 |
+
"畧": 43671,
|
43807 |
+
"竟": 43672,
|
43808 |
+
"徽": 43673,
|
43809 |
+
"魏": 43674,
|
43810 |
+
"醗": 43675,
|
43811 |
+
"頰": 43676,
|
43812 |
+
"拮": 43677,
|
43813 |
+
"罷": 43678,
|
43814 |
+
"况": 43679,
|
43815 |
+
"惡": 43680,
|
43816 |
+
"恪": 43681,
|
43817 |
+
"闊": 43682,
|
43818 |
+
"倖": 43683,
|
43819 |
+
"諫": 43684,
|
43820 |
+
"肅": 43685,
|
43821 |
+
"憾": 43686,
|
43822 |
+
"價": 43687,
|
43823 |
+
"巓": 43688,
|
43824 |
+
"蹊": 43689,
|
43825 |
+
"醵": 43690,
|
43826 |
+
"槓": 43691,
|
43827 |
+
"葦": 43692,
|
43828 |
+
"飮": 43693,
|
43829 |
+
"獻": 43694,
|
43830 |
+
"賣": 43695,
|
43831 |
+
"夭": 43696,
|
43832 |
+
"犂": 43697,
|
43833 |
+
"禽": 43698,
|
43834 |
+
"塡": 43699,
|
43835 |
+
"謁": 43700,
|
43836 |
+
"羹": 43701,
|
43837 |
+
"壹": 43702,
|
43838 |
+
"苟": 43703,
|
43839 |
+
"誦": 43704,
|
43840 |
+
"舊": 43705,
|
43841 |
+
"觸": 43706,
|
43842 |
+
"妾": 43707,
|
43843 |
+
"膽": 43708,
|
43844 |
+
"鐵": 43709,
|
43845 |
+
"邑": 43710,
|
43846 |
+
"乎": 43711,
|
43847 |
+
"發": 43712,
|
43848 |
+
"殷": 43713,
|
43849 |
+
"恆": 43714,
|
43850 |
+
"貮": 43715,
|
43851 |
+
"罕": 43716,
|
43852 |
+
"掩": 43717,
|
43853 |
+
"傚": 43718,
|
43854 |
+
"歐": 43719,
|
43855 |
+
"縱": 43720,
|
43856 |
+
"廠": 43721,
|
43857 |
+
"畏": 43722,
|
43858 |
+
"彭": 43723,
|
43859 |
+
"虔": 43724,
|
43860 |
+
"帥": 43725,
|
43861 |
+
"鏃": 43726,
|
43862 |
+
"憬": 43727,
|
43863 |
+
"鋒": 43728,
|
43864 |
+
"諂": 43729,
|
43865 |
+
"悉": 43730,
|
43866 |
+
"歡": 43731,
|
43867 |
+
"裨": 43732,
|
43868 |
+
"斂": 43733,
|
43869 |
+
"營": 43734,
|
43870 |
+
"諱": 43735,
|
43871 |
+
"假": 43736,
|
43872 |
+
"稍": 43737,
|
43873 |
+
"擾": 43738,
|
43874 |
+
"饗": 43739,
|
43875 |
+
"黨": 43740,
|
43876 |
+
"對": 43741,
|
43877 |
+
"匐": 43742,
|
43878 |
+
"獸": 43743
|
43879 |
},
|
43880 |
"merges": [
|
43881 |
"▁ t",
|
|
|
115380 |
"▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁"
|
115381 |
]
|
115382 |
}
|
115383 |
+
}
|
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
|
|
4 |
"added_tokens_decoder": {
|
5 |
"0": {
|
6 |
"content": "<unk>",
|
@@ -31,6 +32,7 @@
|
|
31 |
"clean_up_tokenization_spaces": false,
|
32 |
"cls_token": "<s>",
|
33 |
"eos_token": "</s>",
|
|
|
34 |
"mask_token": "<unk>",
|
35 |
"model_max_length": 4096,
|
36 |
"pad_token": "</s>",
|
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
4 |
+
"add_prefix_space": null,
|
5 |
"added_tokens_decoder": {
|
6 |
"0": {
|
7 |
"content": "<unk>",
|
|
|
32 |
"clean_up_tokenization_spaces": false,
|
33 |
"cls_token": "<s>",
|
34 |
"eos_token": "</s>",
|
35 |
+
"legacy": true,
|
36 |
"mask_token": "<unk>",
|
37 |
"model_max_length": 4096,
|
38 |
"pad_token": "</s>",
|
upos.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
from transformers import TokenClassificationPipeline
|
2 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
3 |
|
4 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
5 |
def __init__(self,**kwargs):
|
@@ -40,41 +39,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
|
40 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
41 |
return w
|
42 |
|
43 |
-
class RawTokenClassificationPipeline(TokenClassificationPipeline):
|
44 |
-
def check_model_type(self,supported_models):
|
45 |
-
pass
|
46 |
-
|
47 |
-
class LlamaForTokenClassification(LlamaPreTrainedModel):
|
48 |
-
def __init__(self,config):
|
49 |
-
from torch import nn
|
50 |
-
super().__init__(config)
|
51 |
-
self.num_labels=config.num_labels
|
52 |
-
self.model=LlamaModel(config)
|
53 |
-
if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
|
54 |
-
classifier_dropout=config.classifier_dropout
|
55 |
-
elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
|
56 |
-
classifier_dropout=config.hidden_dropout
|
57 |
-
else:
|
58 |
-
classifier_dropout=0.1
|
59 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
60 |
-
self.classifier=nn.Linear(config.hidden_size,config.num_labels)
|
61 |
-
self.post_init()
|
62 |
-
def get_input_embeddings(self):
|
63 |
-
return self.model.embed_tokens
|
64 |
-
def set_input_embeddings(self,value):
|
65 |
-
self.model.embed_tokens=value
|
66 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
67 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
68 |
-
transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
69 |
-
hidden_states=transformer_outputs[0]
|
70 |
-
hidden_states=self.dropout(hidden_states)
|
71 |
-
logits=self.classifier(hidden_states)
|
72 |
-
loss=None
|
73 |
-
if labels is not None:
|
74 |
-
from torch import nn
|
75 |
-
loss_fct=nn.CrossEntropyLoss()
|
76 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
77 |
-
if not return_dict:
|
78 |
-
output=(logits,)+transformer_outputs[2:]
|
79 |
-
return ((loss,)+output) if loss is not None else output
|
80 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
|
|
|
1 |
+
from transformers import TokenClassificationPipeline
|
|
|
2 |
|
3 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
4 |
def __init__(self,**kwargs):
|
|
|
39 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
40 |
return w
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|