File size: 2,572 Bytes
2f044c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from typing import List, Union

from relik.inference.data.objects import Word


class BaseTokenizer:
    """
    Base class for tokenizers.

    A :obj:`Tokenizer` splits strings of text into single words, optionally adds
    pos tags and perform lemmatization.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words.

        Subclasses must override this method; the base class only defines
        the contract.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Implements splitting words into tokens.

        Subclasses must override this method; the base class only defines
        the contract.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.

        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Implements batch splitting words into tokens.

        Delegates to :meth:`tokenize` for each item in the batch.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: The input batch tokenized in single words.

        """
        return list(map(self.tokenize, texts))

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Check if input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        # A plain string is never batched.
        if not isinstance(texts, (list, tuple)):
            return False
        # Not pre-tokenized: any sequence of texts counts as a batch.
        if not is_split_into_words:
            return True
        # Pre-tokenized: batched only if it is a non-empty sequence of sequences.
        return bool(texts) and isinstance(texts[0], (list, tuple))