# deberta-v2-base-japanese-with-auto-jumanpp / tokenization_deberta_v2_jumanpp.py
# (source: nobu-g, commit 299710f — "Update README.md, tokenization_deberta_v2_jumanpp.py,
#  tokenization_deberta_v2_jumanpp_fast.py")
from transformers import DebertaV2Tokenizer
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DebertaV2 tokenizer that pre-segments Japanese text with Juman++.

    Raw input is first split into whitespace-separated morphemes by
    ``JumanppTokenizer`` before the standard DebertaV2 subword
    tokenization runs.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Word-level pre-segmenter applied in prepare_for_tokenization.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Return Juman++-segmented text (and remaining kwargs) for tokenization.

        A leading space is prepended when the caller set
        ``add_prefix_space`` or ``is_split_into_words``; the
        ``add_prefix_space`` flag is always consumed from ``kwargs``.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        # Pop unconditionally so the flag never leaks to downstream kwargs.
        prefix_space = kwargs.pop("add_prefix_space", False)
        if prefix_space or is_split_into_words:
            segmented = " " + segmented
        return segmented, kwargs
class JumanppTokenizer:
    """Thin wrapper around rhoknp's Juman++ binding.

    Segments a sentence into morphemes and returns their surface forms
    joined by single spaces.
    """

    def __init__(self):
        # Import lazily so the dependency is only required when this
        # pre-tokenizer is actually used.
        try:
            import rhoknp
        except ImportError:
            raise ImportError(
                "You need to install rhoknp to use JumanppPreTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            )
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Segment *text* with Juman++ and return morpheme surfaces separated by spaces."""
        sentence = self.jumanpp.apply_to_sentence(text)
        surfaces = (morpheme.surf for morpheme in sentence.morphemes)
        return " ".join(surfaces)