from transformers import DebertaV2Tokenizer


class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DebertaV2Tokenizer that pre-segments Japanese text with Juman++.

    Raw text is split into whitespace-separated morphemes via
    `JumanppTokenizer` before the standard DebertaV2 subword tokenization
    runs, so the SentencePiece model sees word-segmented input.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Lazy-initialized wrapper around rhoknp's Juman++ bindings.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment `text` with Juman++ and optionally add a leading space.

        Args:
            text: Raw input string.
            is_split_into_words: If True, a prefix space is prepended
                (mirrors the base tokenizer's `add_prefix_space` handling).
            **kwargs: May contain `add_prefix_space`; it is consumed here
                so it is not forwarded downstream.

        Returns:
            A `(text, kwargs)` tuple with the segmented (and possibly
            space-prefixed) text and the remaining keyword arguments.
        """
        text = self.juman_tokenizer.tokenize(text)

        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)


class JumanppTokenizer:
    """Thin wrapper over rhoknp's Juman++ that returns space-joined morphemes."""

    def __init__(self):
        # rhoknp is an optional dependency; import lazily and fail with an
        # actionable message (chained to the original ImportError).
        try:
            import rhoknp
        except ImportError as err:
            raise ImportError(
                "You need to install rhoknp to use JumanppPreTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return `text` segmented into morphemes, joined by single spaces."""
        try:
            morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        except RuntimeError:
            # NOTE(review): sentence-level analysis appears to raise
            # RuntimeError on input it cannot treat as one sentence;
            # fall back to document-level analysis — confirm against
            # rhoknp's documented failure modes.
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)