Improve tokenizers #1
by kajyuuen - opened

Files changed:
- .gitignore +3 -0
- distilbert_japanese_tokenizer.py +11 -172
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.venv
+poetry.lock
+pyproject.toml
distilbert_japanese_tokenizer.py
CHANGED
@@ -29,11 +29,15 @@ from typing import Any, Dict, List, Optional, Tuple
 from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from transformers.utils import is_sentencepiece_available, logging
 
-
-if is_sentencepiece_available():
+try:
     import sentencepiece as spm
-
-
+except ModuleNotFoundError as error:
+    raise error.__class__(
+        "The sentencepiece is not installed. "
+        "See https://github.com/google/sentencepiece for installation."
+    )
+
+
 
 logger = logging.get_logger(__name__)
 
@@ -436,7 +440,7 @@ class MecabTokenizer:
         do_lower_case=False,
         never_split=None,
         normalize_text=True,
-        mecab_dic: Optional[str] = "ipadic",
+        mecab_dic: Optional[str] = "unidic_lite",
         mecab_option: Optional[str] = None,
     ):
         """
@@ -450,7 +454,7 @@ class MecabTokenizer:
                 [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
             **normalize_text**: (*optional*) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_dic**: (*optional*) string (default "ipadic")
+            **mecab_dic**: (*optional*) string (default "unidic_lite")
                 Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                 set this option to `None` and modify *mecab_option*.
             **mecab_option**: (*optional*) string
@@ -471,18 +475,7 @@ class MecabTokenizer:
         mecab_option = mecab_option or ""
 
         if mecab_dic is not None:
-            if mecab_dic == "ipadic":
-                try:
-                    import ipadic
-                except ModuleNotFoundError as error:
-                    raise error.__class__(
-                        "The ipadic dictionary is not installed. "
-                        "See https://github.com/polm/ipadic-py for installation."
-                    )
-
-                dic_dir = ipadic.DICDIR
-
-            elif mecab_dic == "unidic_lite":
+            if mecab_dic == "unidic_lite":
                 try:
                     import unidic_lite
                 except ModuleNotFoundError as error:
@@ -492,7 +485,6 @@ class MecabTokenizer:
                     )
 
                 dic_dir = unidic_lite.DICDIR
-
             elif mecab_dic == "unidic":
                 try:
                     import unidic
@@ -536,159 +528,6 @@ class MecabTokenizer:
         return tokens
 
 
-class SudachiTokenizer:
-    """Runs basic tokenization with Sudachi morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-        sudachi_split_mode="A",
-        sudachi_config_path=None,
-        sudachi_resource_dir=None,
-        sudachi_dict_type="core",
-    ):
-        """
-        Constructs a SudachiTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-            **sudachi_split_mode**: (*optional*) string
-                Split mode of sudachi, choose from "A", "B", "C".
-            **sudachi_config_path**: (*optional*) string
-            **sudachi_resource_dir**: (*optional*) string
-            **sudachi_dict_type**: (*optional*) string
-                dict type of sudachi, choose from "small", "core", "full".
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            from sudachipy import dictionary, tokenizer
-        except ImportError:
-            raise ImportError(
-                "You need to install sudachipy to use SudachiTokenizer. "
-                "See https://github.com/WorksApplications/SudachiPy for installation."
-            )
-
-        if sudachi_split_mode == "A":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.A
-        elif sudachi_split_mode == "B":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.B
-        elif sudachi_split_mode == "C":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.C
-        else:
-            raise ValueError("Invalid sudachi_split_mode is specified.")
-
-        self.sudachi = dictionary.Dictionary(
-            config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
-        ).create(self.split_mode)
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for word in self.sudachi.tokenize(text):
-            token = word.surface()
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
-class JumanppTokenizer:
-    """Runs basic tokenization with jumanpp morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-    ):
-        """
-        Constructs a JumanppTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            import rhoknp
-        except ImportError:
-            raise ImportError(
-                "You need to install rhoknp to use JumanppTokenizer. "
-                "See https://github.com/ku-nlp/rhoknp for installation."
-            )
-
-        self.juman = rhoknp.Jumanpp()
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        text = text.strip()
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for mrph in self.juman.apply_to_sentence(text).morphemes:
-            token = mrph.text
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
 class CharacterTokenizer:
     """Runs Character tokenization."""
 
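For reviewers, a minimal sketch (not part of the diff) of what the changed defaults mean in practice. The module name and the `MecabTokenizer` constructor arguments are taken from this file; the snippet assumes `fugashi` and `unidic_lite` are installed (the `ipadic` fallback is removed by this PR, and `sentencepiece` becomes a hard import-time requirement).

```python
# Sketch only: exercise the new default dictionary (mecab_dic="unidic_lite").
# Assumed prerequisites: pip install sentencepiece fugashi unidic_lite
from distilbert_japanese_tokenizer import MecabTokenizer  # module file in this repo

word_tokenizer = MecabTokenizer()  # previously defaulted to the ipadic dictionary
print(word_tokenizer.tokenize("こんにちは、世界。"))  # list of surface-form tokens

# Per the docstring, a system-installed dictionary can still be used by
# bypassing the bundled ones, e.g.:
# MecabTokenizer(mecab_dic=None, mecab_option="-d /path/to/dic")
```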