File size: 2,572 Bytes
2f044c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from typing import List, Union

from relik.inference.data.objects import Word


class BaseTokenizer:
    """
    Base class for tokenizers.

    A :obj:`Tokenizer` splits strings of text into single words, optionally adds
    pos tags and perform lemmatization.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words.

        Subclasses must override this method; the base class only defines
        the contract.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Implements splitting words into tokens.

        Subclasses must override this method; the base class only defines
        the contract.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.

        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Implements batch splitting words into tokens.

        Delegates to :meth:`tokenize` for each item in the batch.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: The input batch tokenized in single words.

        """
        return list(map(self.tokenize, texts))

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Check if input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        # A plain string is never batched.
        if not isinstance(texts, (list, tuple)):
            return False
        # Not pre-tokenized: any sequence of texts counts as a batch.
        if not is_split_into_words:
            return True
        # Pre-tokenized: batched only if it is a non-empty sequence of sequences.
        return bool(texts) and isinstance(texts[0], (list, tuple))