# Hosting-page residue (not part of the module): riccorl's picture | first commit | 626eca0 | raw | history blame | 2.57 kB
from typing import List, Union
from relik.inference.data.objects import Word
class BaseTokenizer:
    """
    Base class for tokenizers: objects that break strings of text into single
    words, optionally adding pos tags and performing lemmatization.

    :meth:`__call__` and :meth:`tokenize` are placeholders that raise
    :obj:`NotImplementedError`; :meth:`tokenize_batch` is built on top of
    :meth:`tokenize`.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the given input into single words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag: a single string, a batch of strings, or
                pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Raises:
            :obj:`NotImplementedError`: Always; this base method has no implementation.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Split one piece of text into tokens.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.

        Raises:
            :obj:`NotImplementedError`: Always; this base method has no implementation.
        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Tokenize every text of a batch by calling :meth:`tokenize` on each one.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: One tokenized result per input text, in
            input order.
        """
        return [self.tokenize(sample) for sample in texts]

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Tell whether the input is a batch or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                Whether the input is already split into words. When false, any
                list or tuple is a batch. When true, only a non-empty list or
                tuple whose first element is itself a list or tuple is a batch.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        sequence_types = (list, tuple)

        # Anything that is not a list/tuple (e.g. a plain string) is one sample.
        if not isinstance(texts, sequence_types):
            return False

        # Raw text: a list/tuple can only be a batch of strings.
        if not is_split_into_words:
            return True

        # Pre-tokenized input: a flat sequence is a single sample, a nested one
        # is a batch. Only the first element is inspected, and an empty
        # sequence is reported as not batched.
        return bool(texts) and isinstance(texts[0], sequence_types)