File size: 1,284 Bytes
ff444a8 911013a 299710f ff444a8 072cc83 911013a 072cc83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from transformers import DebertaV2Tokenizer
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DeBERTa-v2 tokenizer that runs its input through ``JumanppTokenizer`` first.

    The text handed to ``prepare_for_tokenization`` is replaced by the
    space-separated output of ``JumanppTokenizer.tokenize`` before the
    remaining preparation logic is applied.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent class, then build the segmenter."""
        super().__init__(*args, **kwargs)
        # Created after the parent initializer, as a plain instance attribute.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(
        self, text, is_split_into_words=False, **kwargs
    ) -> tuple[str, dict]:
        """Segment ``text`` and optionally prepend a single space.

        Args:
            text: Raw input string passed to the Juman++ segmenter.
            is_split_into_words: When truthy, a leading space is added.
            **kwargs: Extra options. ``add_prefix_space`` is consumed here
                (default ``False``); everything else is returned untouched.

        Returns:
            A ``(prepared_text, remaining_kwargs)`` pair.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        wants_prefix = kwargs.pop("add_prefix_space", False)
        # Either flag is enough to request the leading space.
        lead = " " if (is_split_into_words or wants_prefix) else ""
        return lead + segmented, kwargs
class JumanppTokenizer:
    """Word segmenter that delegates to Juman++ through the ``rhoknp`` package.

    ``rhoknp`` is imported lazily in ``__init__`` so that merely importing
    this module does not require it to be installed.
    """

    def __init__(self):
        """Import ``rhoknp`` and create a ``rhoknp.Jumanpp`` instance.

        Raises:
            ImportError: If ``rhoknp`` cannot be imported. The original
                import failure is attached as ``__cause__``.
        """
        try:
            import rhoknp
        except ImportError as err:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        # Keep the module handle: tokenize() needs rhoknp.Document later.
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return the morpheme surface forms of ``text`` joined by spaces.

        The text is first analysed as a single sentence. If that yields no
        morphemes, it is wrapped in a ``rhoknp.Document`` and analysed again
        at document level.

        Args:
            text: Raw input string.

        Returns:
            The ``surf`` attribute of each morpheme, separated by one space.
        """
        morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        if not morphemes:
            # NOTE(review): presumably a fallback for input that is not a
            # single well-formed sentence -- confirm against rhoknp behaviour.
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)
|