File size: 1,284 Bytes
ff444a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911013a
299710f
ff444a8
 
072cc83
 
911013a
072cc83
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from transformers import DebertaV2Tokenizer


class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DeBERTa-v2 tokenizer that pre-segments Japanese text with Juman++
    before the usual subword tokenization."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Morphological analyzer used to split raw text into surface forms.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment *text* with Juman++ and optionally prepend a space.

        Mirrors the base-class hook: returns the (possibly modified) text
        together with the remaining kwargs.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or prefix_space:
            segmented = " " + segmented
        return (segmented, kwargs)


class JumanppTokenizer:
    """Thin wrapper around rhoknp's Juman++ binding.

    Segments Japanese text into whitespace-separated morpheme surface forms
    so a downstream subword tokenizer can treat morphemes as words.
    """

    def __init__(self):
        try:
            import rhoknp
        except ImportError as err:
            # Fixed message: this class is JumanppTokenizer, not
            # "JumanppPreTokenizer"; also chain the original cause.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return *text* with morphemes joined by single spaces.

        Tries sentence-level analysis first; if that yields no morphemes
        (e.g. multi-sentence input), falls back to document-level analysis.
        """
        morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        if not morphemes:
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)