Spaces:

riccorl
/

relik-entity-linking

Sleeping

App Files Files Community

relik-entity-linking / relik /inference /data /tokenizers /base_tokenizer.py

riccorl

first commit

626eca0 over 1 year ago

raw

history blame contribute delete

2.57 kB

	from typing import List, Union

	from relik.inference.data.objects import Word


	class BaseTokenizer:
	    """
	    Common interface for tokenizers.

	    A tokenizer splits strings of text into single words and, depending on the
	    concrete implementation, may optionally add POS tags and perform
	    lemmatization. This base class only wires :meth:`tokenize_batch` to
	    :meth:`tokenize` and offers :meth:`check_is_batched`; :meth:`__call__` and
	    :meth:`tokenize` have to be provided by subclasses.
	    """

	    def __call__(
	        self,
	        texts: Union[str, List[str], List[List[str]]],
	        is_split_into_words: bool = False,
	        **kwargs
	    ) -> List[List[Word]]:
	        """
	        Tokenize the input into single words.

	        Args:
	            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
	                Text to tag. It can be a single string, a batch of strings,
	                or a batch of pre-tokenized strings.
	            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
	                If :obj:`True` and the input is a string, the input is split
	                on spaces.
	            **kwargs:
	                Extra keyword arguments; unused by this base class.

	        Returns:
	            :obj:`List[List[Word]]`: The input text tokenized in single words.

	        Raises:
	            NotImplementedError: Always, in this base class. Subclasses must
	                override this method.
	        """
	        raise NotImplementedError

	    def tokenize(self, text: str) -> List[Word]:
	        """
	        Split a single text into words.

	        Args:
	            text (:obj:`str`):
	                Text to tokenize.

	        Returns:
	            :obj:`List[Word]`: The input text tokenized in single words.

	        Raises:
	            NotImplementedError: Always, in this base class. Subclasses must
	                override this method.
	        """
	        raise NotImplementedError

	    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
	        """
	        Split every text of a batch into words.

	        Each element of ``texts`` is passed, in order, to :meth:`tokenize`.

	        Args:
	            texts (:obj:`List[str]`):
	                Batch of text to tokenize.

	        Returns:
	            :obj:`List[List[Word]]`: One list of words per input text, in the
	            same order as ``texts``.
	        """
	        return list(map(self.tokenize, texts))

	    @staticmethod
	    def check_is_batched(
	        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
	    ):
	        """
	        Tell whether the input is a batch or a single sample.

	        Args:
	            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
	                Text to check.
	            is_split_into_words (:obj:`bool`):
	                Whether the input is already split into words. When it is,
	                a flat list is one pre-tokenized sample, and only a list
	                whose first element is itself a list/tuple is a batch.

	        Returns:
	            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
	        """
	        sequence_types = (list, tuple)

	        # Anything that is not a list/tuple (e.g. a plain string) is one sample.
	        if not isinstance(texts, sequence_types):
	            return False

	        # Raw text: any list/tuple of inputs counts as a batch, even an empty one.
	        if not is_split_into_words:
	            return True

	        # Pre-tokenized input: an empty sequence has no first element to inspect
	        # and is treated as a single (empty) sample.
	        if not texts:
	            return False

	        # A batch of pre-tokenized samples is a sequence of sequences.
	        return isinstance(texts[0], sequence_types)