Improve tokenizers #1
by kajyuuen - opened

Files changed:
- .gitignore +3 -0
- distilbert_japanese_tokenizer.py +11 -172
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.venv
+poetry.lock
+pyproject.toml
distilbert_japanese_tokenizer.py
CHANGED
@@ -29,11 +29,15 @@ from typing import Any, Dict, List, Optional, Tuple
 from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from transformers.utils import is_sentencepiece_available, logging
 
-
-if is_sentencepiece_available():
+try:
     import sentencepiece as spm
-
-
+except ModuleNotFoundError as error:
+    raise error.__class__(
+        "The sentencepiece is not installed. "
+        "See https://github.com/google/sentencepiece for installation."
+    )
+
+
 
 logger = logging.get_logger(__name__)
 
@@ -436,7 +440,7 @@ class MecabTokenizer:
         do_lower_case=False,
         never_split=None,
         normalize_text=True,
-        mecab_dic: Optional[str] = "ipadic",
+        mecab_dic: Optional[str] = "unidic_lite",
         mecab_option: Optional[str] = None,
     ):
         """
@@ -450,7 +454,7 @@ class MecabTokenizer:
                 [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
             **normalize_text**: (*optional*) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_dic**: (*optional*) string (default "ipadic")
+            **mecab_dic**: (*optional*) string (default "unidic_lite")
                 Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                 set this option to `None` and modify *mecab_option*.
             **mecab_option**: (*optional*) string
@@ -471,18 +475,7 @@ class MecabTokenizer:
         mecab_option = mecab_option or ""
 
         if mecab_dic is not None:
-            if mecab_dic == "ipadic":
-                try:
-                    import ipadic
-                except ModuleNotFoundError as error:
-                    raise error.__class__(
-                        "The ipadic dictionary is not installed. "
-                        "See https://github.com/polm/ipadic-py for installation."
-                    )
-
-                dic_dir = ipadic.DICDIR
-
-            elif mecab_dic == "unidic_lite":
+            if mecab_dic == "unidic_lite":
                 try:
                     import unidic_lite
                 except ModuleNotFoundError as error:
@@ -492,7 +485,6 @@ class MecabTokenizer:
                     )
 
                 dic_dir = unidic_lite.DICDIR
-
             elif mecab_dic == "unidic":
                 try:
                     import unidic
@@ -536,159 +528,6 @@ class MecabTokenizer:
         return tokens
 
 
-class SudachiTokenizer:
-    """Runs basic tokenization with Sudachi morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-        sudachi_split_mode="A",
-        sudachi_config_path=None,
-        sudachi_resource_dir=None,
-        sudachi_dict_type="core",
-    ):
-        """
-        Constructs a SudachiTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-            **sudachi_split_mode**: (*optional*) string
-                Split mode of sudachi, choose from "A", "B", "C".
-            **sudachi_config_path**: (*optional*) string
-            **sudachi_resource_dir**: (*optional*) string
-            **sudachi_dict_type**: (*optional*) string
-                dict type of sudachi, choose from "small", "core", "full".
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            from sudachipy import dictionary, tokenizer
-        except ImportError:
-            raise ImportError(
-                "You need to install sudachipy to use SudachiTokenizer. "
-                "See https://github.com/WorksApplications/SudachiPy for installation."
-            )
-
-        if sudachi_split_mode == "A":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.A
-        elif sudachi_split_mode == "B":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.B
-        elif sudachi_split_mode == "C":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.C
-        else:
-            raise ValueError("Invalid sudachi_split_mode is specified.")
-
-        self.sudachi = dictionary.Dictionary(
-            config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
-        ).create(self.split_mode)
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for word in self.sudachi.tokenize(text):
-            token = word.surface()
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
-class JumanppTokenizer:
-    """Runs basic tokenization with jumanpp morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-    ):
-        """
-        Constructs a JumanppTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            import rhoknp
-        except ImportError:
-            raise ImportError(
-                "You need to install rhoknp to use JumanppTokenizer. "
-                "See https://github.com/ku-nlp/rhoknp for installation."
-            )
-
-        self.juman = rhoknp.Jumanpp()
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        text = text.strip()
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for mrph in self.juman.apply_to_sentence(text).morphemes:
-            token = mrph.text
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
 class CharacterTokenizer:
     """Runs Character tokenization."""
 
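For reviewers, a minimal sketch (not part of the diff) of what the changed defaults mean in practice. The module name and the `MecabTokenizer` constructor arguments are taken from this file; the snippet assumes `fugashi` and `unidic_lite` are installed (the `ipadic` fallback is removed by this PR, and `sentencepiece` becomes a hard import-time requirement).

```python
# Sketch only: exercise the new default dictionary (mecab_dic="unidic_lite").
# Assumed prerequisites: pip install sentencepiece fugashi unidic_lite
from distilbert_japanese_tokenizer import MecabTokenizer  # module file in this repo

word_tokenizer = MecabTokenizer()  # previously defaulted to the ipadic dictionary
print(word_tokenizer.tokenize("こんにちは、世界。"))  # list of surface-form tokens

# Per the docstring, a system-installed dictionary can still be used by
# bypassing the bundled ones, e.g.:
# MecabTokenizer(mecab_dic=None, mecab_option="-d /path/to/dic")
```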