File size: 1,717 Bytes
2f044c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from typing import List, Union


class BaseSentenceSplitter:
    """
    Base interface for objects that split a string into sentences.

    Subclasses must override :meth:`split_sentences`. The base class supplies
    a call operator, a naive batch implementation and a helper for telling
    batched inputs from single samples.
    """

    def __call__(self, *args, **kwargs):
        """
        Calls :meth:`split_sentences`, forwarding every argument unchanged.
        """
        return self.split_sentences(*args, **kwargs)

    def split_sentences(
        self, text: str, max_len: int = 0, *args, **kwargs
    ) -> List[str]:
        """
        Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence.

        Args:
            text (:obj:`str`):
                Text to split.
            max_len (:obj:`int`, optional, defaults to ``0``):
                Not used by the base class; its meaning is defined by subclasses
                (presumably a maximum sentence length -- TODO confirm).

        Returns:
            :obj:`List[str]`: The sentences found in ``text``.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError

    def split_sentences_batch(
        self, texts: List[str], *args, **kwargs
    ) -> List[List[str]]:
        """
        Default implementation is to just iterate over the texts and call `split_sentences`.

        Args:
            texts (:obj:`List[str]`):
                Texts to split, one entry per sample.
            *args, **kwargs:
                Extra arguments, forwarded to every :meth:`split_sentences` call.

        Returns:
            :obj:`List[List[str]]`: One list of sentences per input text, in input order.
        """
        # Forward the extra arguments so that options such as ``max_len`` behave
        # the same in batch mode as they do for a single text.
        return [self.split_sentences(text, *args, **kwargs) for text in texts]

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ) -> bool:
        """
        Check if input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`False`, any list or tuple is treated as a batch.
                If :obj:`True`, a flat list or tuple is treated as a single
                sample, so only a non-empty list or tuple whose first element
                is itself a list or tuple counts as a batch.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        # A bare string (or any other non-sequence) is never a batch.
        if not isinstance(texts, (list, tuple)):
            return False
        if not is_split_into_words:
            return True
        # Only the first element is inspected; an empty sequence is not a batch.
        return bool(texts) and isinstance(texts[0], (list, tuple))