Spaces:
Sleeping
Sleeping
from typing import List, Union | |
from relik.inference.data.objects import Word | |
class BaseTokenizer: | |
""" | |
A :obj:`Tokenizer` splits strings of text into single words, optionally adds | |
pos tags and perform lemmatization. | |
""" | |
def __call__( | |
self, | |
texts: Union[str, List[str], List[List[str]]], | |
is_split_into_words: bool = False, | |
**kwargs | |
) -> List[List[Word]]: | |
""" | |
Tokenize the input into single words. | |
Args: | |
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): | |
Text to tag. It can be a single string, a batch of string and pre-tokenized strings. | |
is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True` and the input is a string, the input is split on spaces. | |
Returns: | |
:obj:`List[List[Word]]`: The input text tokenized in single words. | |
""" | |
raise NotImplementedError | |
def tokenize(self, text: str) -> List[Word]: | |
""" | |
Implements splitting words into tokens. | |
Args: | |
text (:obj:`str`): | |
Text to tokenize. | |
Returns: | |
:obj:`List[Word]`: The input text tokenized in single words. | |
""" | |
raise NotImplementedError | |
def tokenize_batch(self, texts: List[str]) -> List[List[Word]]: | |
""" | |
Implements batch splitting words into tokens. | |
Args: | |
texts (:obj:`List[str]`): | |
Batch of text to tokenize. | |
Returns: | |
:obj:`List[List[Word]]`: The input batch tokenized in single words. | |
""" | |
return [self.tokenize(text) for text in texts] | |
def check_is_batched( | |
texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool | |
): | |
""" | |
Check if input is batched or a single sample. | |
Args: | |
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): | |
Text to check. | |
is_split_into_words (:obj:`bool`): | |
If :obj:`True` and the input is a string, the input is split on spaces. | |
Returns: | |
:obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise. | |
""" | |
return bool( | |
(not is_split_into_words and isinstance(texts, (list, tuple))) | |
or ( | |
is_split_into_words | |
and isinstance(texts, (list, tuple)) | |
and texts | |
and isinstance(texts[0], (list, tuple)) | |
) | |
) | |